/usr/include/OpenMS/DATASTRUCTURES/SuffixArraySeqan.h is in libopenms-dev 1.11.1-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | // --------------------------------------------------------------------------
// OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
//
// This software is released under a three-clause BSD license:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of any author or any participating institution
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: Clemens Groepl,Andreas Bertsch$
// $Authors: Chris Bauer $
// --------------------------------------------------------------------------
#ifndef OPENMS_DATASTRUCTURES_SUFFIXARRAYSEQAN_H
#define OPENMS_DATASTRUCTURES_SUFFIXARRAYSEQAN_H
#include <vector>
#include <OpenMS/DATASTRUCTURES/String.h>
#include <OpenMS/DATASTRUCTURES/SuffixArray.h>
#include <OpenMS/DATASTRUCTURES/SeqanIncludeWrapper.h>
#include <OpenMS/CHEMISTRY/WeightWrapper.h>
namespace OpenMS
{
/**
@brief Class that uses SEQAN library for a suffix array. It can be used to find peptide Candidates for a MS spectrum
This class uses SEQAN suffix array. It can just be used for finding peptide Candidates for a given MS Spectrum within a certain mass tolerance. The suffix array can be saved to disc for reused so it has to be build just once.
*/
class OPENMS_DLLAPI SuffixArraySeqan :
public SuffixArray
, public WeightWrapper
{
typedef seqan::TopDown<seqan::ParentLinks<> > TIterSpec;
typedef seqan::Index<seqan::String<char>, seqan::IndexEsa<TIterSpec> > TIndex;
typedef seqan::Iter<TIndex, seqan::VSTree<TIterSpec> > TIter;
// TODO ??? was: typedef seqan::Index<seqan::String<char>, seqan::Index_ESA<seqan::TopDown<seqan::ParentLinks<seqan::Preorder> > > > TIndex;
public:
/**
@brief constructor
@param st const string reference with the string for which the suffix array should be build
@param filename const string reference with filename for opening or saving the suffix array
@param weight_mode if not monoistopic weight should be used, this parameters can be set to AVERAGE
@throw FileNotFound is thrown if the given file is not found
@throw InvalidValue if the given suffix array string is invalid
*/
SuffixArraySeqan(const String & st, const String & filename, const WeightWrapper::WEIGHTMODE weight_mode = WeightWrapper::MONO);
/**
@brief copy constructor
*/
SuffixArraySeqan(const SuffixArraySeqan & source);
/**
@brief destructor
*/
virtual ~SuffixArraySeqan();
/**
@brief converts suffix array to a printable string
*/
String toString();
/**
@brief the function that will find all peptide candidates for a given spectrum
@param spec const reference of DoubleReal vector describing the spectrum
@param candidates output parameters which holds the candidates of the masses given in spec after call
@return a vector of SignedSize pairs.
for every mass within the spectrum all candidates described by as pairs of ints are returned. All masses are searched for the same time in just one suffix array traversal. In order to accelerate the traversal the skip and lcp table are used. The mass wont be calculated for each entry but it will be updated during traversal using a stack datastructure
*/
void findSpec(std::vector<std::vector<std::pair<std::pair<SignedSize, SignedSize>, DoubleReal> > > & candidates, const std::vector<DoubleReal> & spec);
/**
@brief saves the suffix array to disc
@param filename const reference string describing the filename
@return bool if operation was succesful
@throw UnableToCreateFile is thrown if the output files could not be created
*/
bool save(const String & filename);
/**
@brief opens the suffix array
@param filename const reference string describing the filename
@return bool if operation was succesful
@throw FileNotFound is thrown if the given file could not be found
*/
bool open(const String & filename);
/**
@brief setter for tolerance
@param t DoubleReal with tolerance, only 0 or greater is allowed
@throw InvalidValue is thrown if given tolerance is negative
*/
void setTolerance(DoubleReal t);
/**
@brief getter for tolerance
@return DoubleReal with tolerance
*/
DoubleReal getTolerance() const;
/**
@brief returns if an enzyme will cut after first character
@param aa1 const char as first aminoacid
@param aa2 const char as second aminoacid
@return bool descibing if it is a digesting site
*/
bool isDigestingEnd(const char aa1, const char aa2) const;
/**
@brief setter for tags
@param tags reference to vector of strings with tags
@note sets use_tags = true
*/
void setTags(const std::vector<OpenMS::String> & tags);
/**
@brief getter for tags
@return const reference to vector of strings
*/
const std::vector<OpenMS::String> & getTags();
/**
@brief setter for use_tags
@param use_tags indicating whether tags should be used or not
*/
void setUseTags(bool use_tags);
/**
@brief getter for use_tags
@return bool indicating whether tags are used or not
*/
bool getUseTags();
/**
@brief setter for number of modifications
@param number_of_mods
*/
void setNumberOfModifications(Size number_of_mods);
/**
@brief getter for number of modifications
@return number of modifications
*/
Size getNumberOfModifications();
void printStatistic();
protected:
/**
@brief overwriting goNextSubTree_ from seqan index_esa_stree.h for mass update during suffix array traversal
the suffix array is treated as a suffix tree. this function skips the subtree under the actual node and goes directly to the next subtree that has not been visited yet. During this traversal the mass will be updated using the stack with edge masses.
@param it reference to the suffix array iterator
@param m reference to actual mass
@param allm reference to the stack with history of traversal
@param mod_map input parameters which specifies the modification massen allowed in the candidates
@see goNext
*/
inline void goNextSubTree_(TIter & it, DoubleReal & m, std::stack<DoubleReal> & allm, std::stack<std::map<DoubleReal, SignedSize> > & mod_map)
{
// preorder dfs
if (!goRight(it))
{
while (true)
{
if (goUp(it))
{
m -= allm.top();
allm.pop();
mod_map.pop();
}
else
{
break;
}
if (goRight(it))
{
m -= allm.top();
allm.pop();
mod_map.pop();
break;
}
}
}
else
{
m -= allm.top();
allm.pop();
mod_map.pop();
}
if (isRoot(it))
{
clear(it);
}
}
/**
@brief goes to the next sub tree
@param it reference to the suffix array iterator
@see goNext
*/
inline void goNextSubTree_(TIter & it)
{
// preorder dfs
if (!goRight(it))
{
while (true)
{
if (!goUp(it))
{
break;
}
if (goRight(it))
{
break;
}
}
}
if (isRoot(it))
{
clear(it);
}
}
/**
@brief overwriting goNext from seqan index_esa_stree.h for mass update during suffix array traversal
the suffix array is treated as a suffix tree. this function goes to the next node that has not been visited yet. During this traversal the mass will be updated using the stack with edge masses.
@param it reference to the suffix array iterator
@param m reference to actual mass
@param allm reference to the stack with history of traversal
@param mod_map input parameters which specifies the modification masses allowed in the candidates
@see goNextSubTree_
*/
inline void goNext_(TIter & it, DoubleReal & m, std::stack<DoubleReal> & allm, std::stack<std::map<DoubleReal, SignedSize> > & mod_map)
{
// preorder dfs
if (!goDown(it))
{
goNextSubTree_(it, m, allm, mod_map);
}
}
inline void parseTree_(TIter & it, std::vector<std::pair<SignedSize, SignedSize> > & out_number, std::vector<std::pair<SignedSize, SignedSize> > & edge_length, std::vector<SignedSize> & leafe_depth)
{
SignedSize depth = 1;
while (!atEnd(it))
{
SignedSize le = 0;
bool isLeaf = false;
if (length(parentEdgeLabel(it)) > 0)
{
if (countChildren(it) > 0)
{
edge_length.push_back(std::pair<SignedSize, SignedSize>(depth, length(parentEdgeLabel(it))));
}
else
{
//le <- length(representative(it));
//isLeaf = true;
}
}
if (countChildren(it) > 0)
{
out_number.push_back(std::pair<SignedSize, SignedSize>(depth, countChildren(it)));
}
else
{
leafe_depth.push_back(depth);
}
if (goDown(it))
{
depth++;
}
else if (!goRight(it))
{
while (!goRight(it))
{
goUp(it);
if (isLeaf)
{
edge_length.push_back(std::pair<SignedSize, SignedSize>(depth, le - length(parentEdgeLabel(it))));
isLeaf = false;
}
depth--;
if (isRoot(it)) return;
}
}
else
{
}
}
}
TIndex index_; ///< seqan suffix array
TIter * it_; ///< seqan suffix array iterator
/**
@brief binary search for finding the index of the first element of the spectrum that matches the desired mass within the tolerance.
@param spec const reference to spectrum
@param m mass
@return SignedSize with the index of the first occurence
@note requires that there is at least one occurence
*/
SignedSize findFirst_(const std::vector<DoubleReal> & spec, DoubleReal & m);
/**
@brief binary search for finding the index of the first element of the spectrum that matches the desired mass within the tolerance. it searches recursivly.
@param spec const reference to spectrum
@param m mass
@param start start index
@param end end index
@return SignedSize with the index of the first occurence
@note requires that there is at least one occurence
*/
SignedSize findFirst_(const std::vector<DoubleReal> & spec, DoubleReal & m, SignedSize start, SignedSize end);
const String & s_; ///< reference to strings for which the suffix array is build
DoubleReal masse_[255]; ///< amino acid masses
SignedSize number_of_modifications_; ///< number of allowed modifications
std::vector<String> tags_; ///< all tags
bool use_tags_; ///< if tags are used
DoubleReal tol_; ///< tolerance
};
}
#endif //OPENMS_DATASTRUCTURES_SUFFIXARRAYSEQAN_H
|