This file is indexed.

/usr/include/anfo/sequence.h is in libanfo0-dev 0.98-4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
//    Copyright 2009 Udo Stenzel
//    This file is part of ANFO
//
//    ANFO is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    Anfo is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with Anfo.  If not, see <http://www.gnu.org/licenses/>.

#ifndef INCLUDED_SEQUENCE_H
#define INCLUDED_SEQUENCE_H

#include <cassert>
#include <climits>
#include <limits>
#include <istream>
#include <string>
#include <vector>

#include <math.h>
#include <stdint.h>

#include <google/protobuf/io/zero_copy_stream.h>

#include "output.pb.h"

/*! \defgroup typedefs Useful typedefs
 * These typedefs are used mostly for their documentation value;  C++
 * unfortunately won't be able to check their differences
 * most of the time.
 *
 * @{ */

//! \brief Type used to store short DNA sequences.
//!
//! Oligos are stored with two bits per bases where [0,1,2,3] mean
//! [A,C,T,G].  The two LSBs contain the first base, up to 32 bases can
//! be stored.
typedef uint64_t Oligo ;

//! \brief Type used to store a single nucleotide base.
//!
//! We encode [A,C,T,G] as [0,1,2,3], same as in \c Oligo.
typedef uint8_t  Nucleotide ;	// 0,1,2,3 == A,C,T,G

//! \brief Type used to store ambiguity codes.
//!
//! We encode [A,C,T,G] as [1,2,4,8].  A one shifted by the \c Nucleotide
//! code gives the approriate ambiguity code, other codes can be created
//! by combining bases with a logical OR.  Zero encodes a gap, 15 is an
//! N.
typedef uint8_t  Ambicode ;		// 1,2,4,8 == A,C,T,G

//! @}

//! \brief Decodes a character to a nucleotide.
//!
//! Takes an arbitrary character and determines the IUPAC ambiguity code
//! it stands for.  Small and capital letters are understood, everything
//! unrecognized becomes a gap.

inline Ambicode to_ambicode( char c ) {
	switch( c ) {
		case 'a': case 'A': return 1 ;
		case 'b': case 'B': return 1 ^ 0xf ;
		case 'c': case 'C': return 2 ;
		case 'd': case 'D': return 2 ^ 0xf ;
		case 'g': case 'G': return 8 ;
		case 'h': case 'H': return 8 ^ 0xf ;
		case 't': case 'T': return 4 ;
		case 'u': case 'U': return 4 ;
		case 'v': case 'V': return 4 ^ 0xf ;

		case 'n': case 'N': return 15 ;
		case 'm': case 'M': return  3 ;
		case 'k': case 'K': return 12 ;
		case 'w': case 'W': return  5 ;
		case 's': case 'S': return 10 ;
		case 'r': case 'R': return  9 ;
		case 'y': case 'Y': return  6 ;
		default:            return  0 ;
	}
}

inline char from_ambicode( Ambicode a ) { return "-ACMTWYHGRSVKDBN"[a] ; }

//! \brief Complement an ambiguity code.
inline Ambicode complement( Ambicode w ) { return 0xf & (w << 2 | w >> 2) ; }

inline char compl_ascii( char x ) { return from_ambicode( complement( to_ambicode( x ) ) ) ; }

//! \brief Reverse-complements a pair of ambiguity codes.
//! \internal
inline uint8_t reverse_complement( uint8_t xy )
{
	return (xy & 0x03) << 6 |
		   (xy & 0x0c) << 2 |
		   (xy & 0x30) >> 2 |
		   (xy & 0xc0) >> 6 ;
}

//! \brief Checks whether a character codes for a nucleotide.
//! This is euivalent to decoding the character and checking that it
//! doesn't encode a gap.
inline bool encodes_nuc( char c ) { return to_ambicode(c) != 0 ; }

//! Dumb pointer to DNA
//! A pointer with sub-byte precision is needed for our mmaped genomes.
//! The assumption here is that this fits into 64 bits, which is true on
//! any ix86_64 bit that doesn't implement full 64bit addresses, and that
//! will be all of them for quite some time to come.  It's also true on a
//! 32 bit system, obviously.  We also steal another bit to encode the
//! strand we're pointing to.
//!
//! To make arithmetic easier, the encoding is as follows:  The main
//! pointer is converted to an int64_t, it is then shifted left and the
//! sub-byte index (just one bit) is shifted in.  Then the sign is
//! inverted if and only if this is a pointer to the RC strand.  This way,
//! arithmetic works on both strands the same way.  To dereference a
//! pointer, the absolute value of the number is taken, interpreted as
//! signed 63 bit value, sign extended and reinterpreted as pointer.
class DnaP
{
	private:
		int64_t p_ ;

	public:
		//! constructs a DNA pointer
		//! The pointer is initialized from a pointer to bytes,
		//! understood to point to the ambiguity code in the 4 LSBs.  It
		//! is then converted to a reverse pointer and finally the offset
		//! is added, taking directionality into account.
		//! \param p pointer to ambiguity-encoded DNA
		//! \param complement set to true to make a pointer to the
		//!                   reverse-complemented strand
		//! \param off offset to add to the final pointer
		explicit DnaP( const uint8_t *p = 0, bool complement = false, int off = 0 ) { assign( p, complement, off ) ; }

		void assign( const uint8_t *p = 0, bool complement = false, int off = 0 )
		{
			p_ = (reinterpret_cast<int64_t>(p) << 1) & std::numeric_limits<int64_t>::max() ;
			if( complement ) p_ = -p_ ;
			p_ += off ;
			assert( unsafe_ptr() == p ) ;
		}

		Ambicode operator * () const { return (*this)[0] ; }
		Ambicode operator [] ( int64_t ix ) const {
			int64_t p = p_ + ix ;
			int64_t p2 = llabs(p) ;
			if( (p2 << 1) < 0 ) p2 |= std::numeric_limits<int64_t>::min() ;
			uint8_t w = *( reinterpret_cast<uint8_t*>( p2 >> 1 ) ) ;
			w = 0xf & ( p & 1 ? w >> 4 : w ) ;
			return p < 0 ? complement(w) : w ;  
		}

		bool operator! () const { return !p_ ; }
		operator const void*() const { return (const void*)p_ ; }

		bool is_reversed() const { return p_ < 0 ; }
		DnaP abs() const { DnaP q ; q.p_ = llabs(p_) ; return q ; }
		DnaP reverse() const { DnaP q ; q.p_ = -p_ ; return q ; }
		DnaP reverse_if( bool p ) const { return p ? reverse() : *this ; }

		//! \brief returns a pointer to the underlying storage.
		//! \internal
		//! If you call this function for anything, you're on your own.
		const uint8_t *unsafe_ptr() const
		{ 
			int64_t p2 = llabs(p_) ;
			if( (p2 << 1) < 0 ) p2 |= std::numeric_limits<int64_t>::min() ;
			return reinterpret_cast<uint8_t*>( p2 >> 1 ) ;
		}

		DnaP &operator ++ () { p_++ ; return *this ; }
		DnaP &operator -- () { p_-- ; return *this ; }

		DnaP &operator += ( int64_t  o ) { p_ += o ; return *this ; }
		DnaP &operator += ( uint32_t o ) { p_ += o ; return *this ; }
		DnaP &operator += ( int32_t  o ) { p_ += o ; return *this ; }
		DnaP &operator -= ( int64_t  o ) { p_ -= o ; return *this ; }
		DnaP &operator -= ( uint32_t o ) { p_ -= o ; return *this ; }
		DnaP &operator -= ( int32_t  o ) { p_ -= o ; return *this ; }

		unsigned long get() const { return p_ ; }

		friend inline int64_t operator -  ( const DnaP &a, const DnaP &b ) { return a.p_  - b.p_ ; }
		friend inline bool    operator == ( const DnaP &a, const DnaP &b ) { return a.p_ == b.p_ ; }
		friend inline bool    operator != ( const DnaP &a, const DnaP &b ) { return a.p_ != b.p_ ; }
		
		friend inline std::ostream& operator << ( std::ostream& s, const DnaP &p )
		{ return s << std::hex << p.p_ ; }
} ;

inline DnaP operator + ( const DnaP& a, int64_t  o ) { DnaP b = a ; return b += o ; }
inline DnaP operator + ( const DnaP& a, int32_t  o ) { DnaP b = a ; return b += o ; }
inline DnaP operator + ( const DnaP& a, uint32_t o ) { DnaP b = a ; return b += o ; }
inline DnaP operator - ( const DnaP& a, int64_t  o ) { DnaP b = a ; return b -= o ; }
inline DnaP operator - ( const DnaP& a, int32_t  o ) { DnaP b = a ; return b -= o ; }
inline DnaP operator - ( const DnaP& a, uint32_t o ) { DnaP b = a ; return b -= o ; }

class QDnaP
{
	private:
		const uint16_t *p_ ;

	public:
		explicit QDnaP( const uint16_t *p = 0 ) : p_(p) {}

		QDnaP &operator = ( const QDnaP& p ) { p_ = p.p_ ; return *this ; }

		Ambicode operator * () const { return *p_ & 0xf ; }
		uint8_t qual( size_t ix = 0 ) const { return p_[ix] >> 8 ; }

		Ambicode operator [] ( size_t ix ) const { return p_[ix] & 0xf ; }

		operator const void*() const { return p_ ; }

		QDnaP &operator ++ () { p_++ ; return *this ; }
		QDnaP &operator -- () { p_-- ; return *this ; }

		QDnaP &operator += ( int64_t  o ) { p_ += o ; return *this ; }
		QDnaP &operator += ( uint32_t o ) { p_ += o ; return *this ; }
		QDnaP &operator += ( int32_t  o ) { p_ += o ; return *this ; }
		QDnaP &operator -= ( int64_t  o ) { p_ -= o ; return *this ; }
		QDnaP &operator -= ( uint32_t o ) { p_ -= o ; return *this ; }
		QDnaP &operator -= ( int32_t  o ) { p_ -= o ; return *this ; }

		unsigned long get() const { return (unsigned long)p_ ; }

		friend inline size_t operator -  ( const QDnaP &a, const QDnaP &b ) { return a.p_  - b.p_ ; }
		friend inline bool   operator == ( const QDnaP &a, const QDnaP &b ) { return a.p_ == b.p_ ; }
		friend inline bool   operator != ( const QDnaP &a, const QDnaP &b ) { return a.p_ != b.p_ ; }
		
		friend inline std::ostream& operator << ( std::ostream& s, const QDnaP &p )
		{ return s << std::hex << p.p_ ; }
} ;

inline QDnaP operator + ( const QDnaP& a, int64_t  o ) { QDnaP b = a ; return b += o ; }
inline QDnaP operator + ( const QDnaP& a, int32_t  o ) { QDnaP b = a ; return b += o ; }
inline QDnaP operator + ( const QDnaP& a, uint32_t o ) { QDnaP b = a ; return b += o ; }
inline QDnaP operator - ( const QDnaP& a, int64_t  o ) { QDnaP b = a ; return b -= o ; }
inline QDnaP operator - ( const QDnaP& a, int32_t  o ) { QDnaP b = a ; return b -= o ; }
inline QDnaP operator - ( const QDnaP& a, uint32_t o ) { QDnaP b = a ; return b -= o ; }

//! \brief Wrapper for easier output of gap-terminated sequences.
//! \internal
struct Sequ
{
	Sequ( const DnaP p, int length = std::numeric_limits<int>::max() )
		: p_(p), l_(length) {}

	DnaP p_ ;
	int l_ ;
} ;

inline std::ostream& operator << ( std::ostream& s, const Sequ& d )
{
	for( DnaP p = d.p_, q = d.p_ + d.l_ ; *p && p != q ; ++p )
		s << from_ambicode( *p ) ;
	return s ;
}


//! \brief sequence with quality scores
//! We store four quality scores for the four possible bases.  If the
//! sequence was read from a 4Q file (currently not supported, not that
//! anyone cares), that's okay.  If it came from FASTQ or even FASTA,
//! they are just an extrapolation (error probability divided by 3).
//! The sequence is duplicated and reverse-complemented, so that
//! negative offsets actually yield pointers into the reverse strand.
//! Note, the forward strand has offsets [1..len], the reverse strand
//! has offsets [-len..-1].  At offset 0, there's a gap.  The interval
//! [begin,end) also represents the forward strand, without gaps.
class QSequence
{
	public:
		//! \brief An ambiguity code and four qualities together.
		//! The qualities are undefined if the ambiguity code encodes a
		//! gap.  We keep copies of the quality scores to avoid loss of
		//! precision when regenerating a FastA file.
		struct Base {
			uint8_t ambicode ;
			uint8_t qscore ;
			uint8_t qscores[4] ;

			Base() : ambicode(0), qscore(0)
			{
				qscores[0] = qscores[1] = qscores[2] = qscores[3] = 0 ;
			}
			Base( uint8_t a, int q_score ) ;
		} ;

	private:
		std::vector< Base > seq_ ;

		// offset of the central zero
		int middle_offset() const { return seq_.size()/2 ; }
	public:
		typedef std::vector< Base >::const_iterator const_iterator ;
		typedef std::vector< Base >::const_reverse_iterator const_reverse_iterator ;
		typedef std::vector< Base >::iterator iterator ;
		typedef Base const *const_pointer ;
		typedef Base const &const_reference ;
		typedef Base pointer ;
		typedef Base &reference ;
		typedef Base value_type ;

		//! \brief creates an empty sequence
		//! The sequence is terminated correctly by three gaps (one in
		//! the middle, one at either end) and has no bases.  It is
		//! basically considered invalid and used as a placeholder where
		//! necessary.
		QSequence()
		{
			seq_.push_back( Base() ) ; 
			seq_.push_back( Base() ) ; 
			seq_.push_back( Base() ) ; 
		}

		//! \brief creates a quality sequence from a generic Read
		//! Missing qualities are set to a constant, which is equivalent
		//! to a Q score of 30 by default.  A reverse-complemented
		//! version is prepended.
		QSequence( const output::Read& r, int default_q = 30 ) ;

		void swap( QSequence& rhs ) { std::swap( seq_, rhs.seq_ ) ; }

		const_pointer start() const { return &seq_[ middle_offset()] ; }
		unsigned length() const { return seq_.size()/2 -1 ; }

		const_iterator begin() const { return seq_.begin() + middle_offset() ; }
		const_iterator end() const { return seq_.end() - 1 ; }

		reference operator [] ( size_t ix ) { return seq_[middle_offset()+ix] ; }
		const_reference operator [] ( size_t ix ) const { return seq_[middle_offset()+ix] ; }
} ;

//! \brief reads a sequence from a FASTA, FASTQ or 4Q file
//! To unify all formats (well-defined or not...) this function accepts
//! both '@' and '>' as header delimiter.  Any lines starting ';' and
//! following the header are treated as extended description.  Quality
//! scores are optional and introduced by '+'; they can be either ASCII
//! (U Rockefeller idiocy), or 33-based raw Phred scale values (Sanger
//! standard), or 64-based Solexa scale values (Solexa idiocy).  The tag
//! "SQ " (4Q definition) is ignored in a sequence line.  Quality scores
//! can also be introduced by '*', in which case the quality line tag is
//! read and the remainder is parsed as quality scores in 4Q format.  If
//! no bases are found, but four Q scores are available, bases are
//! "called" internally.
//!
//!	\todo Add sensible error checking (e.g. consistent line lengths, no
//!	      additional junk, no wrong characters).
//!
//! \param zis stream to read from
//! \param r read structure to place the sequence in
//! \param solexa_scores 
//!     If set, quality scores are converted from Solexa to Phred
//!     conventions.
//! \param origin
//!     The origin is subtracted from raw scores in ASCII files.  It is
//!     33 for original FASTQ files, but 64 for files with Illumina
//!     brain damage, in which case it may or may not have to be
//!     combined with setting solexa_scores.
//! \return true iff a sequence could be read
bool read_fastq( google::protobuf::io::ZeroCopyInputStream *zis, output::Read& r, bool solexa_scores, char origin ) ;

//! \brief reads a sequence from a SAM file
//! \param zis stream to read from
//! \return true iff a sequence could be read
bool read_sam( google::protobuf::io::ZeroCopyInputStream *zis, const std::string& genome, output::Result& r ) ;

#endif