This file is indexed.

/usr/include/Bpp/Seq/Container/SiteContainerTools.h is in libbpp-seq-dev 2.1.0-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
//
// File: SiteContainerTools.h
// Created by: Julien Dutheil
// Created on: Fri Dec 12 18:55:06 2003
//

#ifndef _SITECONTAINERTOOLS_H_
#define _SITECONTAINERTOOLS_H_

/*
   Copyright or © or Copr. Bio++ Development Team, (November 17, 2004)

   This software is a computer program whose purpose is to provide classes
   for sequences analysis.

   This software is governed by the CeCILL  license under French law and
   abiding by the rules of distribution of free software.  You can  use,
   modify and/ or redistribute the software under the terms of the CeCILL
   license as circulated by CEA, CNRS and INRIA at the following URL
   "http://www.cecill.info".

   As a counterpart to the access to the source code and  rights to copy,
   modify and redistribute granted by the license, users are provided only
   with a limited warranty  and the software's author,  the holder of the
   economic rights,  and the successive licensors  have only  limited
   liability.

   In this respect, the user's attention is drawn to the risks associated
   with loading,  using,  modifying and/or developing or reproducing the
   software by the user in light of its specific status of free software,
   that may mean  that it is complicated to manipulate,  and  that  also
   therefore means  that it is reserved for developers  and  experienced
   professionals having in-depth computer knowledge. Users are therefore
   encouraged to load and test the software's suitability as regards their
   requirements in conditions enabling the security of their systems and/or
   data to be ensured and,  more generally, to use and operate it in the
   same conditions as regards security.

   The fact that you are presently reading this means that you have had
   knowledge of the CeCILL license and that you accept its terms.
 */

#include "SiteContainer.h"
#include "VectorSiteContainer.h"
#include "AlignedSequenceContainer.h"
#include "../AlphabetIndex/AlphabetIndex2.h"
#include "../DistanceMatrix.h"
#include <Bpp/Numeric/Matrix/Matrix.h>

//From the STL:
#include <vector>
#include <map>

namespace bpp
{

typedef std::vector<size_t> SiteSelection;

/**
 * @brief Some utililitary methods to deal with site containers.
 */
class SiteContainerTools
{
	public:
 		SiteContainerTools() {}
		virtual ~SiteContainerTools() {}

	public:

		/**
		 * @brief Retrieves sites without gaps from SiteContainer.
		 *
		 * This function build a new SiteContainer instance with only sites without gaps.
		 * The container passed as input is not modified, all sites are copied.
		 *
		 * @param sites The container to analyse.
		 * @return A pointer toward a new SiteContainer with only sites with no gaps.
		 */
		static SiteContainer* getSitesWithoutGaps(const SiteContainer& sites);

		/**
		 * @brief Retrieves complete sites from SiteContainer.
		 *
		 * This function build a new SiteContainer instance with only complete sites,
		 * i.e. site with fully resolved states (no gap, no unknown caracters).
		 * The container passed as input is not modified, all sites are copied.
		 *
		 * @param sites The container to analyse.
		 * @return A pointer toward a new SiteContainer with only complete sites.
		 */
		static SiteContainer* getCompleteSites(const SiteContainer& sites);

		/**
		 * @brief Get a site set without gap-only sites.
		 *
		 * This function build a new SiteContainer instance without sites with only gaps.
		 * The container passed as input is not modified, all sites are copied.
		 *
     * @see removeGapOnlySites(SiteContainer& sites)
		 * @param sites The container to analyse.
		 * @return A pointer toward a new SiteContainer.
		 */
		static SiteContainer* removeGapOnlySites(const SiteContainer& sites);

    /**
		 * @brief Remove gap-only sites from a site set.
		 *
		 * @param sites The container where the sites have to be removed.
		 */
		static void removeGapOnlySites(SiteContainer& sites);

		/**
		 * @brief Get a site set without gap/unresolved-only sites.
		 *
		 * This function build a new SiteContainer instance without sites with only gaps or unresolved characters.
		 * The container passed as input is not modified, all sites are copied.
		 *
		 * @param sites The container to analyse.
		 * @return A pointer toward a new SiteContainer.
		 */
		static SiteContainer* removeGapOrUnresolvedOnlySites(const SiteContainer& sites);
		
    /**
		 * @brief Remove gap/unresolved-only sites from a site set.
		 *
		 * @param sites The container where the sites have to be removed.
		 */
    static void removeGapOrUnresolvedOnlySites(SiteContainer& sites);

    /**
		 * @brief Get a siteset with sites with less than a given amount of gaps.
		 *
		 * @param sites The container from which the sites have to be removed.
     * @param maxFreqGaps The maximum frequency of gaps in each site.
		 * @return A pointer toward a new SiteContainer.
		 */
     static SiteContainer* removeGapSites(const SiteContainer& sites, double maxFreqGaps);

    /**
		 * @brief Remove sites with a given amount of gaps.
		 *
		 * @param sites The container from which the sites have to be removed.
     * @param maxFreqGaps The maximum frequency of gaps in each site.
		 */
     static void removeGapSites(SiteContainer& sites, double maxFreqGaps);

    /**
     * @brief Get a site set without stop codons, if the alphabet is a CodonAlphabet, otherwise throws an Exception.
     *
     * This function build a new SiteContainer instance without sites that have at least a stop codon.
     * The container passed as input is not modified, all sites are copied.
     *
     * @param sites The container to analyse.
     * @return A pointer toward a new SiteContainer.
     */
    static SiteContainer* removeStopCodonSites(const SiteContainer& sites) throw (AlphabetException);

    /**
		 * @brief Create a new container with a specified set of sites.
		 *
		 * A new VectorSiteContainer is created with specified sites.
		 * The destruction of the container is up to the user.
		 * Sites are specified by their indice, beginning at 0.
		 * No position verification is performed, based on the assumption that
		 * the container passed as an argument is a correct one.
		 * Redundant selection is not checked, so be careful with what you're doing!
		 *
		 * @param sequences The container from wich sequences are to be taken.
		 * @param selection The positions of all sites to retrieve.
		 * @return A new container with all selected sites.
		 */
		static SiteContainer* getSelectedSites(const SiteContainer& sequences, const SiteSelection& selection);

		/**
     * @brief create the consensus sequence of the alignment.
     *
     * In case of ambiguity (for instance a AATT site), one state will be chosen arbitrarily.
     *
     * @param sc a site container
     * @param name the name of the sequence object that will be created. 
     * @param ignoreGap Tell if gap must be counted or not. If not (true option), only fully gapped sites will result in a gap in the consensus sequence. 
     * @param resolveUnknown Tell is unknnown characters must resolved. In a DNA sequence for instance, N will be counted as A=1/4, T=1/4, G=1/4 and C=1/4. Otherwise it will be counted as N=1.
     * If this option is set to true, a consensus sequence will never contain an unknown character.
     * @return A new Sequence object with the consensus sequence.
     */
		static const Sequence* getConsensus(const SiteContainer & sc, const std::string& name = "consensus", bool ignoreGap = true, bool resolveUnknown = false);
    
    /**
     * @brief Change all gaps to unknown state in a container, according to its alphabet.
     *
     * For DNA alphabets, this change all '-' to 'N'.
     *
     * @param sites The container to be modified.
     */
    static void changeGapsToUnknownCharacters(SiteContainer& sites);

    /**
     * @brief Change all unresolved characters to gaps in a container, according to its alphabet.
     *
     * For DNA alphabets, this change all 'N', 'M', 'R', etc.  to '-'.
     *
     * @param sites The container to be modified.
     */
    static void changeUnresolvedCharactersToGaps(SiteContainer& sites);

    /**
     * @brief Resolve a container with "." notations.
     *
     * @code
     * ATGCCGTTGG
     * .C...A..C.
     * ..A....C..
     * @endcode
     * will results in
     * @code
     * ATGCCGTTGG
     * ACCCCATTCG
     * ATACCGTCGG
     * @endcode
     * for instance.
     * The first sequence is here called the "reference" sequence.
     * It need not be the first in the container.
     * The alphabet of the input alignment must be an instance of the DefaultAlphabet class, the only one which support dot characters.
     * A new alignment is created and returned, with the specified alphabet.
     *
     * If several sequences that may be considered as reference are found, the first one is used.
     * 
     * @param dottedAln The input alignment.
     * @param resolvedAlphabet The alphabet of the output alignment.
     * @return A pointer toward a dynamically created SiteContainer with the specified alphabet (can be a DefaultAlphabet).
     * @throw AlphabetException If the alphabet of the input alignment is not of class DefaultAlphabet, or if one character does not match with the output alphabet.
     * @throw Exception If no reference sequence was found, or if the input alignment contains no sequence.
     */
    static SiteContainer* resolveDottedAlignment(const SiteContainer& dottedAln, const Alphabet* resolvedAlphabet) throw (AlphabetException, Exception);

    /**
     * @name Sequences coordinates.
     *
     * @see SequenceWalker For an alternative approach.
     * @{
     */

    /**
     * @brief Get the index of each sequence position in an aligned sequence.
     *
     * If the sequence contains no gap, the translated and the original positions are the same.
     * Position numbers start at 1.
     *
     * @param seq The sequence to translate.
     * @return A map with original sequence positions as keys, and translated positions as values.
     */
    static std::map<size_t, size_t> getSequencePositions(const Sequence& seq);

    /**
     * @brief Get the index of each alignment position in an aligned sequence.
     *
     * If the sequence contains no gap, the translated and the original positions are the same.
     * Position numbers start at 1.
     *
     * @param seq The sequence to translate.
     * @return A map with original alignement positions as keys, and translated positions as values.
     */
    static std::map<size_t, size_t> getAlignmentPositions(const Sequence& seq);

    /**
     * @brief Fill a numeric matrix with the size of the alignment, containing the each sequence position.
     *
     * Positions start at 1, gaps have "position" 0.
     *
     * @param sites The input alignment.
     * @param positions A matrix object which is going to be resized and filled with the corresponding positions.
     * @author Julien Dutheil
     */
    static void getSequencePositions(const SiteContainer& sites, Matrix<size_t>& positions);
    /** @} */

    /**
     * @brief Translate alignement positions from an aligned sequence to the same sequence in a different alignment.
     *
     * Takes each position (starting at 1) in sequence 1, and look for the corresponding position in sequence 2.
     * The two sequences must be the same, excepted for the gaps.
     * If no sequence contains gaps, or if the gaps are at the same place in both sequences, the translated postion will be the same as the original positions.
     * 
     * @param seq1 The sequence to translate.
     * @param seq2 The reference sequence.
     * @return A map with original alignement positions as keys, and translated positions as values.
     * @throw AlphabetMismatchException If the sequences do not share the same alphabet.
     * @throw Exception If the sequence do not match.
     */
    static std::map<size_t, size_t> translateAlignment(const Sequence& seq1, const Sequence& seq2) throw (AlphabetMismatchException, Exception);

    /**
     * @brief Translate sequence positions from a sequence to another in the same alignment.
     *
     * Takes each position (starting at 1) in sequence 1, and look for the corresponding position in sequence 2 at the same site.
     * If no corresponding position is available (i.e. if there is a gap in sequence 2 at the corresponding position), 0 is returned.
     *
     * @param sequences The alignment to use.
     * @param i1 The index of the sequence to translate.
     * @param i2 The index of the reference sequence.
     * @return A map with original sequence positions as keys, and translated positions as values.
     */
    static std::map<size_t, size_t> translateSequence(const SiteContainer& sequences, size_t i1, size_t i2);

    /**
     * @brief Align two sequences using the Needleman-Wunsch dynamic algorithm.
     *
     * If the input sequences contain gaps, they will be ignored.
     *
     * @see BLOSUM50, DefaultNucleotideScore for score matrices.
     *
     * @param seq1 The first sequence.
     * @param seq2 The second sequence.
     * @param s The score matrix to use.
     * @param gap Gap penalty.
     * @return A new SiteContainer instance.
     * @throw AlphabetMismatchException If the sequences and the score matrix do not share the same alphabet.
     */
    static AlignedSequenceContainer* alignNW(const Sequence& seq1, const Sequence& seq2, const AlphabetIndex2& s, double gap) throw (AlphabetMismatchException);

    /**
     * @brief Align two sequences using the Needleman-Wunsch dynamic algorithm.
     *
     * If the input sequences contain gaps, they will be ignored.
     *
     * @see BLOSUM50, DefaultNucleotideScore for score matrices.
     *
     * @param seq1 The first sequence.
     * @param seq2 The second sequence.
     * @param s The score matrix to use.
     * @param opening Gap opening penalty.
     * @param extending Gap extending penalty.
     * @return A new SiteContainer instance.
     * @throw AlphabetMismatchException If the sequences and the score matrix do not share the same alphabet.
     */
    static AlignedSequenceContainer* alignNW(const Sequence& seq1, const Sequence& seq2, const AlphabetIndex2& s, double opening, double extending) throw (AlphabetMismatchException);

    /**
     * @brief Sample sites in an alignment.
     *
     * Original site positions will be kept. The resulting container will hence probably have duplicated
     * positions. You may wish to call the reindexSites() method on the returned container.
     *
     * Note: This method will be optimal with a container with vertical storage like VectorSiteContainer.
     *
     * @param sites An input alignment to sample.
     * @param nbSites The size of the resulting container.
     * @param index [out] If non-null the underlying vector will be appended with the original site indices.
     * @return A sampled alignment with nbSites sites taken from the input one.
     */
    static VectorSiteContainer* sampleSites(const SiteContainer& sites, size_t nbSites, std::vector<size_t>* index = 0);

    /**
     * @brief Bootstrap sites in an alignment.
     *
     * Original site positions will be kept. The resulting container will hence probably have duplicated
     * positions. You may wish to call the reindexSites() method on the returned container.
     *
     * Note: This method will be optimal with a container with vertical storage like VectorSiteContainer.
     *
     * @param sites An input alignment to sample.
     * @return A sampled alignment with the same number of sites than the input one.
     */
    static VectorSiteContainer* bootstrapSites(const SiteContainer& sites);

    /**
     * @brief Compute the similarity/distance score between two aligned sequences.
     *
     * The similarity measures are computed as the proportion of identical match.
     * The distance between the two sequences is defined as 1 - similarity.
     * This function can be used with any type of alphabet.
     *
     * @param seq1 The first sequence.
     * @param seq2 The second sequence.
     * @param dist Shall we return a distance instead of similarity?
     * @param gapOption How to deal with gaps:
     * - SIMILARITY_ALL: all positions are used.
     * - SIMILARITY_NODOUBLEGAP: ignore all positions with a gap in the two sequences.
     * - SIMILARITY_NOGAP: ignore all positions with a gap in at least one of the two sequences.
     * @param unresolvedAsGap Tell if unresolved characters must be considered as gaps when counting.
     * If set to yes, the gap option will also apply to unresolved characters.
     * @return The proportion of matches between the two sequences.
     * @throw SequenceNotAlignedException If the two sequences do not have the same length.
     * @throw AlphabetMismatchException If the two sequences do not share the same alphabet type.
     * @throw Exception If an invalid gapOption is passed.
     */
    static double computeSimilarity(const Sequence& seq1, const Sequence& seq2, bool dist = false, const std::string& gapOption = SIMILARITY_NODOUBLEGAP, bool unresolvedAsGap = true) throw (SequenceNotAlignedException, AlphabetMismatchException, Exception);

    /**
     * @brief Compute the similarity matrix of an alignment.
     *
     * The similarity measures are computed as the proportion of identical match.
     * The distance between the two sequences is defined as 1 - similarity.
     * This function can be used with any type of alphabet.
     * Several options concerning gaps and unresolved characters are proposed:
     * - SIMILARITY_ALL: all positions are used.
     * - SIMILARITY_NOFULLGAP: ignore positions with a gap in all the sequences in the alignment.
     * - SIMILARITY_NODOUBLEGAP: ignore all positions with a gap in the two sequences for each pair.
     * - SIMILARITY_NOGAP: ignore all positions with a gap in at least one of the two sequences for each pair.
     *
     *
     * @see computeSimilarityMatrix
     *
     * @param sites The input alignment.
     * @param dist Shall we return a distance instead of similarity?
     * @param gapOption How to deal with gaps.
     * @param unresolvedAsGap Tell if unresolved characters must be considered as gaps when counting.
     * If set to yes, the gap option will also apply to unresolved characters.
     * @return All pairwise similarity measures.
     */
    static DistanceMatrix* computeSimilarityMatrix(const SiteContainer& sites, bool dist = false, const std::string& gapOption = SIMILARITY_NOFULLGAP, bool unresolvedAsGap = true);

    static const std::string SIMILARITY_ALL;
    static const std::string SIMILARITY_NOFULLGAP;
    static const std::string SIMILARITY_NODOUBLEGAP;
    static const std::string SIMILARITY_NOGAP;

    /**
     * @brief Add the content of a site container to an exhisting one.
     *
     * The input containers are supposed to have unique sequence names.
     * If it is not the case, several things can happen:
     * - If the two containers have exactly the same names in the same order, then the content of the second one will be added as is to the first one.
     * - If the second container does not have exactly the same sequences names or in a different order, then a reordered selection of the second contianer is created first,
     *   and in that case, only the first sequence with a given name will be used and duplicated.
     * In any case, note that the second container should always contains all the sequence names from the first one,
     * otherwise an exception will be thrown.
     *
     * @author Julien Dutheil
     *
     * @param seqCont1 First container.
     * @param seqCont2 Second container. This container must contain sequences with the same names as in seqcont1.
     * Additional sequences will be ignored.
     * @param leavePositionAsIs Tell is site position should be unchanged. Otherwise (the default) is to add the size of container 1 to the positions in container 2.
     * @throw AlphabetMismatchException If the alphabet in the 2 containers do not match.
     * @throw Exception If sequence names do not match.
     */
    static void merge(SiteContainer& seqCont1, const SiteContainer& seqCont2, bool leavePositionAsIs = false) throw (AlphabetMismatchException, Exception);

    /**
     * @brief Compare an alignment to a reference alignment, and compute the column scores.
     *
     * Calculations are made according to formula for the "CS" score in Thompson et al 1999, Nucleic Acids Research (1999):27(13);2682–2690.
     *
     * @param positions1 Alignment index for the test alignment.
     * @param positions2 Alignment index for the reference alignment.
     * @param na         The score to use if the tested column is full of gap.
     * @return A vector of score, as 0 or 1.
     * @see getSequencePositions for creating the alignment indexes.
     * @warning The indexes for the two alignments must have the sequences in the exact same order!
     * @author Julien Dutheil
     */
    static std::vector<int> getColumnScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, int na = 0);
   
    /**
     * @brief Compare an alignment to a reference alignment, and compute the sum-of-pairs scores.
     *
     * Calculations are made according to formula for the "SPS" score in Thompson et al 1999, Nucleic Acids Research (1999):27(13);2682–2690.
     *
     * @param positions1 Alignment index for the test alignment.
     * @param positions2 Alignment index for the reference alignment.
     * @param na         The score to use if the tested column is not testable, that is not containing at least to residues.
     * @return A vector of score, between 0 and 1 (+ na value).
     * @see getSequencePositions for creating the alignment indexes.
     * @warning The indexes for the two alignments must have the sequences in the exact same order!
     * @author Julien Dutheil
     */
    static std::vector<double> getSumOfPairsScores(const Matrix<size_t>& positions1, const Matrix<size_t>& positions2, double na = 0);
};

} //end of namespace bpp.

#endif	//_SITECONTAINERTOOLS_H_