/usr/include/OpenMS/DATASTRUCTURES/SuffixArrayTrypticCompressed.h is in libopenms-dev 1.11.1-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 | // --------------------------------------------------------------------------
// OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
//
// This software is released under a three-clause BSD license:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of any author or any participating institution
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: Clemens Groepl,Andreas Bertsch$
// $Authors: Chris Bauer $
// --------------------------------------------------------------------------
#ifndef OPENMS_DATASTRUCTURES_SUFFIXARRAYTRYPTICCOMPRESSED_H
#define OPENMS_DATASTRUCTURES_SUFFIXARRAYTRYPTICCOMPRESSED_H
#include <OpenMS/CONCEPT/Exception.h>
#include <OpenMS/DATASTRUCTURES/SuffixArray.h>
#include <OpenMS/CHEMISTRY/WeightWrapper.h>
namespace OpenMS
{
class String;
/**
@brief Class that implements a suffix array for a String. It can be used to find peptide Candidates for a MS spectrum
This class implements a suffix array. It can just be used for finding peptide Candidates for a given MS Spectrum within a certain mass tolerance. The suffix array can be saved to disc for reused so it has to be build just once. The suffix array consits of a vector of pair of ints for every suffix, a vector of LCP values and a so called skip vector.
Only the sufices that are matching the function isDigestingEnd are created. Besides a suffix will not reach till the end of the string but till the next occurence of the separator ($). So only the interessting sufices will be saved. This will reduce the used space.
*/
class OPENMS_DLLAPI SuffixArrayTrypticCompressed :
public SuffixArray
, public WeightWrapper
{
public:
/**
@brief constructor taking the string and the filename for writing or reading
@param st the string as const reference with which the suffix array will be build
@param filename the filename for writing or reading the suffix array
@param weight_mode if not monoistopic weight should be used, this parameters can be set to AVERAGE
@throw Exception::InvalidValue if string does not start with empty string ($)
@throw FileNotFound is thrown if the given file was not found
The constructor checks if a suffix array with given filename (without file extension) exists or not. In the first case it will simple be loaded and otherwise it will be build. Bulding the suffix array consists of several steps. At first all indices for a digesting enzyme (defined by using function isDigestingEnd) are created as an vector of SignedSize pairs. After creating all relevant indices they are sorted and the lcp and skip vectors are created.
*/
SuffixArrayTrypticCompressed(const String & st, const String & filename, const WeightWrapper::WEIGHTMODE weight_mode = WeightWrapper::MONO);
/**
@brief copy constructor
*/
SuffixArrayTrypticCompressed(const SuffixArrayTrypticCompressed & sa);
/**
@brief destructor
*/
virtual ~SuffixArrayTrypticCompressed();
/**
@brief transforms suffix array to a printable String
*/
String toString();
/**
@brief the function that will find all peptide candidates for a given spectrum
@param spec const reference of DoubleReal vector describing the spectrum
@param candidates output parameter which contains the candidates of the masses given in spec
@return a vector of SignedSize pairs.
@throw InvalidValue if the spectrum is not sorted ascendingly
for every mass within the spectrum all candidates described by as pairs of ints are returned. All masses are searched for the same time in just one suffix array traversal. In order to accelerate the traversal the skip and lcp table are used. The mass wont be calculated for each entry but it will be updated during traversal using a stack datastructure
*/
void findSpec(std::vector<std::vector<std::pair<std::pair<SignedSize, SignedSize>, DoubleReal> > > & candidates, const std::vector<DoubleReal> & spec);
/**
@brief saves the suffix array to disc
@param file_name const reference string describing the filename
@return bool if operation was succesful
@throw Exception::UnableToCreateFile if file could not be created (e.x. if you have no rigths)
*/
bool save(const String & file_name);
/**
@brief opens the suffix array
@param file_name const reference string describing the filename
@return bool if operation was succesful
@throw FileNotFound
*/
bool open(const String & file_name);
/**
@brief setter for tolerance
@param t DoubleReal with tolerance
@throw Exception::InvalidValue if tolerance is negative
*/
void setTolerance(DoubleReal t);
/**
@brief getter for tolerance
@return DoubleReal with tolerance
*/
DoubleReal getTolerance() const;
/**
@brief returns if an enzyme will cut after first character
@param aa1 const char as first aminoacid
@param aa2 const char as second aminoacid
@return bool descibing if it is a digesting site
*/
bool isDigestingEnd(const char aa1, const char aa2) const;
/**
@brief setter for tags
@param tags const vector of strings with tags with length 3 each
@throw InvalidValue if at least one tag does not have size of 3
*/
void setTags(const std::vector<String> & tags);
/**
@brief getter for tags
@return const vector of string with tags
*/
const std::vector<String> & getTags();
/**
@brief setter for use_tags
@param use_tags indicating whether tags should be used or not
*/
void setUseTags(bool use_tags);
/**
@brief getter for use_tags
@return bool indicating whether tags are used or not
*/
bool getUseTags();
/**
@brief setter for number of modifications
@param number_of_mods
*/
void setNumberOfModifications(Size number_of_mods);
/**
@brief getter for number of modifications
@return unsigned SignedSize describing number of modifications
*/
Size getNumberOfModifications();
/**
@brief output for statistic
*/
void printStatistic();
protected:
/**
@brief constructor
*/
SuffixArrayTrypticCompressed();
/**
@brief gets the index of the next sperator for a given index
@param p const SignedSize describing a position in the string
@return SignedSize with the index of the next occurence of the sperator or -1 if there is no more separator
*/
SignedSize getNextSep_(const SignedSize p) const;
/**
@brief gets the lcp for two strings described as pairs of ints
@param last_point const pair of ints describing a substring
@param current_point const pair of ints describing a substring
@return SignedSize with the length of the lowest common prefix
*/
SignedSize getLCP_(const std::pair<SignedSize, SignedSize> & last_point, const std::pair<SignedSize, SignedSize> & current_point);
/**
@brief binary search for finding the index of the first element of the spectrum that matches the desired mass within the tolerance.
@param spec const reference to spectrum
@param m mass
@return SignedSize with the index of the first occurence
@note requires that there is at least one occurence
*/
SignedSize findFirst_(const std::vector<DoubleReal> & spec, DoubleReal & m);
/**
@brief binary search for finding the index of the first element of the spectrum that matches the desired mass within the tolerance. it searches recursivly.
@param spec const reference to spectrum
@param m mass
@param start start index
@param end end index
@return SignedSize with the index of the first occurence
@note requires that there is at least one occurence
*/
SignedSize findFirst_(const std::vector<DoubleReal> & spec, DoubleReal & m, SignedSize start, SignedSize end);
/**
@brief treats the suffix array as a tree and parses the tree using postorder traversion. This is realised by a recursive algorithm.
@param start_index SignedSize describing the start index in indices_ vector
@param stop_index SignedSize describing the end index in indices_ vector
@param depth at with depth the traversion is at the actual position
@param walked_in how many characters we have seen from root to actual position
@param edge_len how many characters we have seen from last node to actual position
@param out_number reference to vector of pairs of ints. For every node it will be filled with how many outgoing edge a node has in dependece of its depth
@param edge_length will be filled with the edge_length in dependence of its depth
@param leafe_depth will be filled with the depth of every leafe
@note intialize: walked_in=0, depth=1, edge_len=1
*/
void parseTree_(SignedSize start_index, SignedSize stop_index, SignedSize depth, SignedSize walked_in, SignedSize edge_len, std::vector<std::pair<SignedSize, SignedSize> > & out_number, std::vector<std::pair<SignedSize, SignedSize> > & edge_length, std::vector<SignedSize> & leafe_depth);
/**
@brief indicates if a node during traversal has more outgoings
@param start_index SignedSize describing the start index in indices_ vector
@param stop_index SignedSize describing the end index in indices_ vector
@param walked_in how many characters we have seen from root to actual position
*/
bool hasMoreOutgoings_(SignedSize start_index, SignedSize stop_index, SignedSize walked_in);
const String & s_; ///< the string with which the suffix array is build
DoubleReal tol_; ///< mass tolerance for finding candidates
std::vector<std::pair<SignedSize, SignedSize> > indices_; ///< vector of pairs of ints describing all relevant sufices
std::vector<SignedSize> lcp_; ///< vector of ints with lcp values
std::vector<SignedSize> skip_; ///< vector of ints with skip values
//const SignedSize getIndex_ (const String & s);
DoubleReal masse_[256]; ///< mass table
Size number_of_modifications_; ///< number of allowed modifications
std::vector<String> tags_; ///< all given tags
bool use_tags_; ///< indicates whether tags are used or not
SignedSize progress_;
};
}
#endif //OPENMS_DATASTRUCTURES_SUFFIXARRAYTRYPTICCOMPRESSED_H
|