This file is indexed.

/usr/include/libMems-1.6/libMems/MatchFinder.h is in libmems-1.6-dev 1.6.0+4725-4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
/*******************************************************************************
 * $Id: MatchFinder.h,v 1.23 2004/03/01 02:40:08 darling Exp $
 * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
 * This file is licensed under the GPL.
 * Please see the file called COPYING for licensing details.
 * **************
 ******************************************************************************/

#ifndef _MatchFinder_h_
#define _MatchFinder_h_

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "libMems/SortedMerList.h"
#include "libMems/Match.h"
#include "libMems/MatchList.h"
#include <list>
#include <iostream>
#include <boost/pool/pool_alloc.hpp>

namespace mems {

struct idmer{
	gnSeqI	position;	//starting position of this mer in the genome
	uint64 	mer; 		//the actual sequence
	sarID_t	id;			//the sequence identifier.
};

// typedef std::list<idmer, boost::fast_pool_allocator<idmer> > IdmerList;
// using boost::fast_pool_allocator<idmer> results in a significant speedup
// over std::allocator.  testing on a Salmonella vs. Y. pestis comparison shows
// a 30% speedup
typedef std::list<idmer> IdmerList;

const unsigned int PROGRESS_GRANULARITY = 100;

/**
 * This pure virtual class implements a general framework for finding
 * exactly matching mers.  It is extended by the MemHash and MemScorer
 * classes.
 * @see MemHash
 * @see MemScorer
 */
class MatchFinder : public genome::gnClone{
public:
	MatchFinder();
	~MatchFinder();
	MatchFinder(const MatchFinder& mf);
	virtual void Clear();
	/**
	 * Adds a sequence to use when searching for exact matches.
	 * @param sar A pointer to the sorted mer list for the new sequence
	 * @param seq A pointer to the genome::gnSequence corresponding to the new sequence.
	 */
	virtual boolean AddSequence( SortedMerList* sar, genome::gnSequence* seq = NULL );
	/**
	 * Given the index of a sequence and an index into the sorted mer list, this function
	 * will search the other sorted mer lists for the same mer.  This function returns the
	 * position of the mer in each sequence in the breakpoints vector.
	 */
	virtual void GetBreakpoint( uint32 sarI, gnSeqI startI, std::vector<gnSeqI>& breakpoints ) const;
	virtual uint32 Multiplicity(void){return seq_count;};
	/** NOT IMPLEMENTED: Sets the number of ambiguities allowed in a mer match*/
	virtual void SetAmbiguityTolerance(uint32 ambiguity_tol){ambiguity_tolerance = ambiguity_tol;}
	/** @return the number of ambiguities allowed in a mer match */
	virtual uint32 AmbiguityTolerance(){return ambiguity_tolerance;}
	/** @return The progress of the current operation.  Ranges from 0 to 100.  -1 indicates no computation is being performed */
	virtual float GetProgress() const {return m_progress;}

	/** Finds all the matches between the sequences */
	virtual void FindMatchSeeds();
	/** Finds all the matches between the sequences, starting at a particular offset */
	virtual void FindMatchSeeds( const std::vector<gnSeqI>& start_offsets );

	/**
	 * Logs progress to the designated ostream.  Set to null to skip progress logging.
	 */
	virtual void LogProgress( std::ostream* os );
	void SetOffsetLog( std::ostream* offset_stream ){ this->offset_stream = offset_stream; }
protected:
	/** 
	 * Searches for mer matches in a designated range of the sequence's sorted mer lists 
	 * @throws InvalidData thrown if the start_points are bad or if the sorted mer lists were sorted on different mer sizes
	 * @return true if completed searching, false if repetitive mers were encountered and FindMatches must be called again.
	 */
	virtual boolean SearchRange(std::vector<gnSeqI>& start_points, std::vector<gnSeqI>& search_len);
	/** Called whenever a mer match is found */
	virtual boolean HashMatch(IdmerList& match_list) = 0;
	virtual boolean EnumerateMatches(IdmerList& match_list);

	template< class MatchType >
	void FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches);

	template< class UngappedMatchType >
	void ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward = GNSEQI_END, gnSeqI max_forward = GNSEQI_END);

	virtual SortedMerList* GetSar(uint32 sarI) const;
	std::vector<SortedMerList*> sar_table;
	std::vector<genome::gnSequence*> seq_table;
	
	uint32 mer_size;
	uint32 seq_count;
	uint32 ambiguity_tolerance;
	
	// for subset matches
	std::vector< std::vector< uint32 > > alpha_map;
	uint alpha_map_size;
	uint alphabet_bits;
	
	float m_progress;
	std::ostream* log_stream;

	uint64 mers_processed;	/**< The number of mers processed thus far */
	uint64 total_mers;	/**< The total number of mers to search */
	std::ostream* offset_stream;	/**< log for the current offset in each SML */
};

/** 
 * InvalidData exceptions are thrown when the input to an algorithm is invalid
 */
CREATE_EXCEPTION( InvalidData );

inline
SortedMerList* MatchFinder::GetSar(uint32 sarI) const{
	return sar_table[sarI];
}

inline
bool idmer_lessthan(idmer& a_v, idmer& m_v){
	return (a_v.mer < m_v.mer);// ? true : false;
};

//id less than function for STL sort functions
inline
bool idmer_id_lessthan(idmer& a_v, idmer& m_v){
	return (a_v.id < m_v.id);// ? true : false;
};



// takes as input a fully extended mem and returns the subset matches on the lower side
template< class MatchType >
void MatchFinder::FindSubsets(const MatchType& mhe, std::vector<MatchType>& subset_matches){

	SMLHeader head = GetSar( 0 )->GetHeader();
	uint shift_amt = 64 - head.alphabet_bits;
	uint rshift_amt = head.alphabet_bits * ( GetSar(0)->SeedLength() - 1 );

	uint seqI, alphaI;

	// initialize subset match data structures
	alpha_map_size = 1;
	alpha_map_size <<= alphabet_bits;
	if( alpha_map.size() != alpha_map_size ){
		alpha_map.clear();
		alpha_map.reserve( alpha_map_size );
		std::vector< uint32 > tmp_list;
		tmp_list.reserve( seq_count );
		for( uint alphaI = 0; alphaI < alpha_map_size; ++alphaI )
			alpha_map.push_back( tmp_list );
	}else{
		for( uint alphaI = 0; alphaI < alpha_map_size; ++alphaI )
			alpha_map[ alphaI ].clear();
	}
	
	
	for( seqI = 0; seqI < seq_count; ++seqI ){
		//check that all mers at the new position match
		int64 mer_to_get = mhe[ seqI ];
		if( mer_to_get == NO_MATCH )
			continue;
		if(mer_to_get < 0){
			mer_to_get *= -1;
			mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
		}

		uint64 cur_mer = GetSar( seqI )->GetMer( mer_to_get - 1 );

		boolean parity;
		if( mhe[ seqI ] < 0 )
			parity = cur_mer & 0x1;
		else
			parity = !(cur_mer & 0x1);

		if( parity ){
			cur_mer >>= shift_amt;
		}else{
			cur_mer <<= rshift_amt;
			cur_mer = ~cur_mer;
			cur_mer >>= shift_amt;
		}

		alpha_map[ cur_mer ].push_back( seqI );

	}
	
	for( alphaI = 0; alphaI < alpha_map_size; ++alphaI ){
		if( alpha_map[ alphaI ].size() < 2 ){
			alpha_map[ alphaI ].clear();
			continue;
		}
		// this is a subset
		MatchType cur_subset = mhe;
		cur_subset.SetLength( mhe.Length() );
		for( uint sqI = 0; sqI < mhe.SeqCount(); ++sqI )
			cur_subset.SetStart( sqI, NO_MATCH );	// init everything to NO_MATCH
		for( uint subI = 0; subI < alpha_map[ alphaI ].size(); ++subI )
			cur_subset.SetStart( alpha_map[ alphaI ][ subI ], mhe[ alpha_map[ alphaI ][ subI ] ] );
		subset_matches.push_back( cur_subset );
		alpha_map[ alphaI ].clear();
	}
}

// BUGS:
// matches which span the end-start of a circular sequence will be hashed a second time
template< class UngappedMatchType >
void MatchFinder::ExtendMatch(UngappedMatchType& mhe, std::vector<UngappedMatchType>& subset_matches, gnSeqI max_backward, gnSeqI max_forward){
	uint64 cur_mer;
	uint64 mer_mask = GetSar(0)->GetSeedMask();

	//which sequences are used in this match?
	uint32* cur_seqs = new uint32[mhe.SeqCount()];
	uint32 used_seqs = 0;
	for(uint32 seqI = 0; seqI < mhe.SeqCount(); ++seqI){
		if(mhe[seqI] != NO_MATCH){
			cur_seqs[used_seqs] = seqI;
			++used_seqs;
		}
	}
	//First extend backwards then extend forwards.  The following loop does them both.
	int jump_size = GetSar(0)->SeedLength();
	uint extend_limit = 0;	/**< Tracks the distance to the most distant overlapping matching seed */
	uint extend_attempts = 0;	/**< Counts the total number of overlapping seeds checked */
	boolean extend_again = false;	/**< Set to true if any overlapping seeds matched, the search will be restarted from that point */
	for(uint32 directionI = 0; directionI < 4; ++directionI){
		//how far can we go?	
		//first calculate the maximum amount of traversal
		//then do fewer comparisons.
		int64 maxlen = GNSEQI_END;
		if(directionI == 0)
			maxlen = max_backward;
		else if(directionI == 1)
			maxlen = max_forward;
		else
			maxlen = GetSar(0)->SeedLength();
		for(uint32 maxI = 0; maxI < used_seqs; ++maxI)
			if(GetSar(cur_seqs[maxI])->IsCircular()){
				if(GetSar(cur_seqs[maxI])->Length() < maxlen)
					maxlen = GetSar(cur_seqs[maxI])->Length();
			}else if(mhe[cur_seqs[maxI]] < 0){
				int64 rc_len = GetSar(cur_seqs[maxI])->Length() - mhe.Length() + mhe[cur_seqs[maxI]] + 1;
				if( rc_len < maxlen)
					maxlen = rc_len;
			}else if(mhe[cur_seqs[maxI]] - 1 < maxlen)
				maxlen = mhe[cur_seqs[maxI]] - 1;
		uint32 j=0;
		uint32 i = used_seqs;	// set to used_seqs in case maxlen is already less than jump size.

		extend_limit = 0;
		extend_attempts = 0;

		while(maxlen - jump_size >= 0){
			mhe.SetLength(mhe.Length() + jump_size);
			maxlen -= jump_size;
			for(j=0; j < used_seqs; ++j){
				if(mhe[cur_seqs[j]] > 0){
					mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - jump_size);
					if(mhe[cur_seqs[j]] <= 0)
						mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + GetSar(cur_seqs[j])->Length());
				}
			}
			//check that all mers at the new position match
			int64 mer_to_get = mhe[cur_seqs[0]];
			if(mer_to_get < 0){
				mer_to_get *= -1;
				mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
			}
			cur_mer = GetSar(cur_seqs[0])->GetSeedMer(mer_to_get - 1);
			boolean parity;
			if( mhe[cur_seqs[0]] < 0 )
				parity = cur_mer & 0x1;
			else
				parity = !(cur_mer & 0x1);
			cur_mer &= mer_mask;

			for(i=1; i < used_seqs; ++i){
				mer_to_get = mhe[cur_seqs[i]];
				if(mer_to_get < 0){
					//Convert the cur_seqs[i] entry since negative implies reverse complement
					mer_to_get *= -1;
					mer_to_get += mhe.Length() - GetSar(0)->SeedLength();
				}
				uint64 comp_mer = GetSar(cur_seqs[i])->GetSeedMer(mer_to_get - 1);
				boolean comp_parity;				
				if( mhe[cur_seqs[i]] < 0 )
					comp_parity = comp_mer & 0x1;
				else
					comp_parity = !(comp_mer & 0x1);
				comp_mer &= mer_mask;
				
				if(cur_mer != comp_mer || parity != comp_parity ){
					if( directionI < 2 )
						maxlen = 0;
					break;
				}
			}
			extend_attempts += jump_size;
			if( i == used_seqs )
				extend_limit = extend_attempts;
			if( directionI > 1 && extend_attempts == GetSar(0)->SeedLength() )
				break;
		}
		//this stuff cleans up if there was a mismatch
		if(i < used_seqs){
			mhe.SetLength(mhe.Length() - jump_size);
			for(;j > 0; j--){
				if(mhe[cur_seqs[j - 1]] >= 0)
					mhe.SetStart(cur_seqs[j - 1], mhe[cur_seqs[j - 1]] + jump_size);
			}
		}
		// check whether any of the overlapping seeds matched.
		// if so, set the match to that length and set the flag to start the search again
		if( directionI > 1 && extend_attempts > 0 ){
			if( extend_limit > 0 )
				extend_again = true;
			// minus jump_size because the cleanup above already moved the length back a little
			int unmatched_diff = extend_attempts - extend_limit;
			if( i < used_seqs )
				unmatched_diff -= jump_size;
			if( (unmatched_diff > mhe.Length()) && unmatched_diff >= 0 )
				std::cerr << "oh sheethockey mushrooms\n";
			mhe.SetLength(mhe.Length() - unmatched_diff);
			for(j=0; j < used_seqs; ++j){
				if(mhe[cur_seqs[j]] > 0){
					mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] + unmatched_diff);
					if(mhe[cur_seqs[j]] > GetSar(cur_seqs[j])->Length() )
						mhe.SetStart(cur_seqs[j], mhe[cur_seqs[j]] - GetSar(cur_seqs[j])->Length() );
				}
			}
		}
		//Invert the sequence directions so that we extend in the other direction
		//next time through the loop.  The second time we do this we are setting
		//sequence directions back to normal.
		mhe.Invert();

		//if we've already been through twice then decrease the jump size
		if(directionI >= 1)
			jump_size = 1;
		if( directionI == 3 && extend_again ){
			directionI = -1;	// will become 0 on next iteration
			jump_size = GetSar(0)->SeedLength();
			extend_again = false;
		}
	}
	// after the match has been fully extended, search for subset matches
	// this code only works when using SOLID seeds-- so it's been disabled
/*	if( used_seqs > 2 ){
		FindSubsets( mhe, subset_matches );
		mhe.Invert();
		FindSubsets( mhe, subset_matches );
		mhe.Invert();
	}
*/
	// set the subsets so their reference sequence is always positive
	for(uint32 subsetI = 0; subsetI < subset_matches.size(); ++subsetI){
		if( subset_matches[subsetI][subset_matches[subsetI].FirstStart()] < 0 )
			subset_matches[subsetI].Invert();
		subset_matches[subsetI].CalculateOffset();
	}

	delete[] cur_seqs;
}



}

#endif	//_MatchFinder_h_