/usr/include/anfo/sequence.h is in libanfo0-dev 0.98-4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 | // Copyright 2009 Udo Stenzel
// This file is part of ANFO
//
// ANFO is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Anfo is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Anfo. If not, see <http://www.gnu.org/licenses/>.
#ifndef INCLUDED_SEQUENCE_H
#define INCLUDED_SEQUENCE_H
#include <cassert>
#include <climits>
#include <limits>
#include <istream>
#include <string>
#include <vector>
#include <math.h>
#include <stdint.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include "output.pb.h"
/*! \defgroup typedefs Useful typedefs
* These typedefs are used mostly for their documentation value; C++
* unfortunately won't be able to check their differences
* most of the time.
*
* @{ */
//! \brief Type used to store short DNA sequences.
//!
//! Oligos are stored with two bits per bases where [0,1,2,3] mean
//! [A,C,T,G]. The two LSBs contain the first base, up to 32 bases can
//! be stored.
typedef uint64_t Oligo ;
//! \brief Type used to store a single nucleotide base.
//!
//! We encode [A,C,T,G] as [0,1,2,3], same as in \c Oligo.
typedef uint8_t Nucleotide ; // 0,1,2,3 == A,C,T,G
//! \brief Type used to store ambiguity codes.
//!
//! We encode [A,C,T,G] as [1,2,4,8]. A one shifted by the \c Nucleotide
//! code gives the approriate ambiguity code, other codes can be created
//! by combining bases with a logical OR. Zero encodes a gap, 15 is an
//! N.
typedef uint8_t Ambicode ; // 1,2,4,8 == A,C,T,G
//! @}
//! \brief Decodes a character to a nucleotide.
//!
//! Takes an arbitrary character and determines the IUPAC ambiguity code
//! it stands for. Small and capital letters are understood, everything
//! unrecognized becomes a gap.
inline Ambicode to_ambicode( char c ) {
switch( c ) {
case 'a': case 'A': return 1 ;
case 'b': case 'B': return 1 ^ 0xf ;
case 'c': case 'C': return 2 ;
case 'd': case 'D': return 2 ^ 0xf ;
case 'g': case 'G': return 8 ;
case 'h': case 'H': return 8 ^ 0xf ;
case 't': case 'T': return 4 ;
case 'u': case 'U': return 4 ;
case 'v': case 'V': return 4 ^ 0xf ;
case 'n': case 'N': return 15 ;
case 'm': case 'M': return 3 ;
case 'k': case 'K': return 12 ;
case 'w': case 'W': return 5 ;
case 's': case 'S': return 10 ;
case 'r': case 'R': return 9 ;
case 'y': case 'Y': return 6 ;
default: return 0 ;
}
}
inline char from_ambicode( Ambicode a ) { return "-ACMTWYHGRSVKDBN"[a] ; }
//! \brief Complement an ambiguity code.
inline Ambicode complement( Ambicode w ) { return 0xf & (w << 2 | w >> 2) ; }
inline char compl_ascii( char x ) { return from_ambicode( complement( to_ambicode( x ) ) ) ; }
//! \brief Reverse-complements a pair of ambiguity codes.
//! \internal
inline uint8_t reverse_complement( uint8_t xy )
{
return (xy & 0x03) << 6 |
(xy & 0x0c) << 2 |
(xy & 0x30) >> 2 |
(xy & 0xc0) >> 6 ;
}
//! \brief Checks whether a character codes for a nucleotide.
//! This is euivalent to decoding the character and checking that it
//! doesn't encode a gap.
inline bool encodes_nuc( char c ) { return to_ambicode(c) != 0 ; }
//! Dumb pointer to DNA
//! A pointer with sub-byte precision is needed for our mmaped genomes.
//! The assumption here is that this fits into 64 bits, which is true on
//! any ix86_64 bit that doesn't implement full 64bit addresses, and that
//! will be all of them for quite some time to come. It's also true on a
//! 32 bit system, obviously. We also steal another bit to encode the
//! strand we're pointing to.
//!
//! To make arithmetic easier, the encoding is as follows: The main
//! pointer is converted to an int64_t, it is then shifted left and the
//! sub-byte index (just one bit) is shifted in. Then the sign is
//! inverted if and only if this is a pointer to the RC strand. This way,
//! arithmetic works on both strands the same way. To dereference a
//! pointer, the absolute value of the number is taken, interpreted as
//! signed 63 bit value, sign extended and reinterpreted as pointer.
class DnaP
{
private:
int64_t p_ ;
public:
//! constructs a DNA pointer
//! The pointer is initialized from a pointer to bytes,
//! understood to point to the ambiguity code in the 4 LSBs. It
//! is then converted to a reverse pointer and finally the offset
//! is added, taking directionality into account.
//! \param p pointer to ambiguity-encoded DNA
//! \param complement set to true to make a pointer to the
//! reverse-complemented strand
//! \param off offset to add to the final pointer
explicit DnaP( const uint8_t *p = 0, bool complement = false, int off = 0 ) { assign( p, complement, off ) ; }
void assign( const uint8_t *p = 0, bool complement = false, int off = 0 )
{
p_ = (reinterpret_cast<int64_t>(p) << 1) & std::numeric_limits<int64_t>::max() ;
if( complement ) p_ = -p_ ;
p_ += off ;
assert( unsafe_ptr() == p ) ;
}
Ambicode operator * () const { return (*this)[0] ; }
Ambicode operator [] ( int64_t ix ) const {
int64_t p = p_ + ix ;
int64_t p2 = llabs(p) ;
if( (p2 << 1) < 0 ) p2 |= std::numeric_limits<int64_t>::min() ;
uint8_t w = *( reinterpret_cast<uint8_t*>( p2 >> 1 ) ) ;
w = 0xf & ( p & 1 ? w >> 4 : w ) ;
return p < 0 ? complement(w) : w ;
}
bool operator! () const { return !p_ ; }
operator const void*() const { return (const void*)p_ ; }
bool is_reversed() const { return p_ < 0 ; }
DnaP abs() const { DnaP q ; q.p_ = llabs(p_) ; return q ; }
DnaP reverse() const { DnaP q ; q.p_ = -p_ ; return q ; }
DnaP reverse_if( bool p ) const { return p ? reverse() : *this ; }
//! \brief returns a pointer to the underlying storage.
//! \internal
//! If you call this function for anything, you're on your own.
const uint8_t *unsafe_ptr() const
{
int64_t p2 = llabs(p_) ;
if( (p2 << 1) < 0 ) p2 |= std::numeric_limits<int64_t>::min() ;
return reinterpret_cast<uint8_t*>( p2 >> 1 ) ;
}
DnaP &operator ++ () { p_++ ; return *this ; }
DnaP &operator -- () { p_-- ; return *this ; }
DnaP &operator += ( int64_t o ) { p_ += o ; return *this ; }
DnaP &operator += ( uint32_t o ) { p_ += o ; return *this ; }
DnaP &operator += ( int32_t o ) { p_ += o ; return *this ; }
DnaP &operator -= ( int64_t o ) { p_ -= o ; return *this ; }
DnaP &operator -= ( uint32_t o ) { p_ -= o ; return *this ; }
DnaP &operator -= ( int32_t o ) { p_ -= o ; return *this ; }
unsigned long get() const { return p_ ; }
friend inline int64_t operator - ( const DnaP &a, const DnaP &b ) { return a.p_ - b.p_ ; }
friend inline bool operator == ( const DnaP &a, const DnaP &b ) { return a.p_ == b.p_ ; }
friend inline bool operator != ( const DnaP &a, const DnaP &b ) { return a.p_ != b.p_ ; }
friend inline std::ostream& operator << ( std::ostream& s, const DnaP &p )
{ return s << std::hex << p.p_ ; }
} ;
inline DnaP operator + ( const DnaP& a, int64_t o ) { DnaP b = a ; return b += o ; }
inline DnaP operator + ( const DnaP& a, int32_t o ) { DnaP b = a ; return b += o ; }
inline DnaP operator + ( const DnaP& a, uint32_t o ) { DnaP b = a ; return b += o ; }
inline DnaP operator - ( const DnaP& a, int64_t o ) { DnaP b = a ; return b -= o ; }
inline DnaP operator - ( const DnaP& a, int32_t o ) { DnaP b = a ; return b -= o ; }
inline DnaP operator - ( const DnaP& a, uint32_t o ) { DnaP b = a ; return b -= o ; }
class QDnaP
{
private:
const uint16_t *p_ ;
public:
explicit QDnaP( const uint16_t *p = 0 ) : p_(p) {}
QDnaP &operator = ( const QDnaP& p ) { p_ = p.p_ ; return *this ; }
Ambicode operator * () const { return *p_ & 0xf ; }
uint8_t qual( size_t ix = 0 ) const { return p_[ix] >> 8 ; }
Ambicode operator [] ( size_t ix ) const { return p_[ix] & 0xf ; }
operator const void*() const { return p_ ; }
QDnaP &operator ++ () { p_++ ; return *this ; }
QDnaP &operator -- () { p_-- ; return *this ; }
QDnaP &operator += ( int64_t o ) { p_ += o ; return *this ; }
QDnaP &operator += ( uint32_t o ) { p_ += o ; return *this ; }
QDnaP &operator += ( int32_t o ) { p_ += o ; return *this ; }
QDnaP &operator -= ( int64_t o ) { p_ -= o ; return *this ; }
QDnaP &operator -= ( uint32_t o ) { p_ -= o ; return *this ; }
QDnaP &operator -= ( int32_t o ) { p_ -= o ; return *this ; }
unsigned long get() const { return (unsigned long)p_ ; }
friend inline size_t operator - ( const QDnaP &a, const QDnaP &b ) { return a.p_ - b.p_ ; }
friend inline bool operator == ( const QDnaP &a, const QDnaP &b ) { return a.p_ == b.p_ ; }
friend inline bool operator != ( const QDnaP &a, const QDnaP &b ) { return a.p_ != b.p_ ; }
friend inline std::ostream& operator << ( std::ostream& s, const QDnaP &p )
{ return s << std::hex << p.p_ ; }
} ;
inline QDnaP operator + ( const QDnaP& a, int64_t o ) { QDnaP b = a ; return b += o ; }
inline QDnaP operator + ( const QDnaP& a, int32_t o ) { QDnaP b = a ; return b += o ; }
inline QDnaP operator + ( const QDnaP& a, uint32_t o ) { QDnaP b = a ; return b += o ; }
inline QDnaP operator - ( const QDnaP& a, int64_t o ) { QDnaP b = a ; return b -= o ; }
inline QDnaP operator - ( const QDnaP& a, int32_t o ) { QDnaP b = a ; return b -= o ; }
inline QDnaP operator - ( const QDnaP& a, uint32_t o ) { QDnaP b = a ; return b -= o ; }
//! \brief Wrapper for easier output of gap-terminated sequences.
//! \internal
struct Sequ
{
Sequ( const DnaP p, int length = std::numeric_limits<int>::max() )
: p_(p), l_(length) {}
DnaP p_ ;
int l_ ;
} ;
inline std::ostream& operator << ( std::ostream& s, const Sequ& d )
{
for( DnaP p = d.p_, q = d.p_ + d.l_ ; *p && p != q ; ++p )
s << from_ambicode( *p ) ;
return s ;
}
//! \brief sequence with quality scores
//! We store four quality scores for the four possible bases. If the
//! sequence was read from a 4Q file (currently not supported, not that
//! anyone cares), that's okay. If it came from FASTQ or even FASTA,
//! they are just an extrapolation (error probability divided by 3).
//! The sequence is duplicated and reverse-complemented, so that
//! negative offsets actually yield pointers into the reverse strand.
//! Note, the forward strand has offsets [1..len], the reverse strand
//! has offsets [-len..-1]. At offset 0, there's a gap. The interval
//! [begin,end) also represents the forward strand, without gaps.
class QSequence
{
public:
//! \brief An ambiguity code and four qualities together.
//! The qualities are undefined if the ambiguity code encodes a
//! gap. We keep copies of the quality scores to avoid loss of
//! precision when regenerating a FastA file.
struct Base {
uint8_t ambicode ;
uint8_t qscore ;
uint8_t qscores[4] ;
Base() : ambicode(0), qscore(0)
{
qscores[0] = qscores[1] = qscores[2] = qscores[3] = 0 ;
}
Base( uint8_t a, int q_score ) ;
} ;
private:
std::vector< Base > seq_ ;
// offset of the central zero
int middle_offset() const { return seq_.size()/2 ; }
public:
typedef std::vector< Base >::const_iterator const_iterator ;
typedef std::vector< Base >::const_reverse_iterator const_reverse_iterator ;
typedef std::vector< Base >::iterator iterator ;
typedef Base const *const_pointer ;
typedef Base const &const_reference ;
typedef Base pointer ;
typedef Base &reference ;
typedef Base value_type ;
//! \brief creates an empty sequence
//! The sequence is terminated correctly by three gaps (one in
//! the middle, one at either end) and has no bases. It is
//! basically considered invalid and used as a placeholder where
//! necessary.
QSequence()
{
seq_.push_back( Base() ) ;
seq_.push_back( Base() ) ;
seq_.push_back( Base() ) ;
}
//! \brief creates a quality sequence from a generic Read
//! Missing qualities are set to a constant, which is equivalent
//! to a Q score of 30 by default. A reverse-complemented
//! version is prepended.
QSequence( const output::Read& r, int default_q = 30 ) ;
void swap( QSequence& rhs ) { std::swap( seq_, rhs.seq_ ) ; }
const_pointer start() const { return &seq_[ middle_offset()] ; }
unsigned length() const { return seq_.size()/2 -1 ; }
const_iterator begin() const { return seq_.begin() + middle_offset() ; }
const_iterator end() const { return seq_.end() - 1 ; }
reference operator [] ( size_t ix ) { return seq_[middle_offset()+ix] ; }
const_reference operator [] ( size_t ix ) const { return seq_[middle_offset()+ix] ; }
} ;
//! \brief reads a sequence from a FASTA, FASTQ or 4Q file
//! To unify all formats (well-defined or not...) this function accepts
//! both '@' and '>' as header delimiter. Any lines starting ';' and
//! following the header are treated as extended description. Quality
//! scores are optional and introduced by '+'; they can be either ASCII
//! (U Rockefeller idiocy), or 33-based raw Phred scale values (Sanger
//! standard), or 64-based Solexa scale values (Solexa idiocy). The tag
//! "SQ " (4Q definition) is ignored in a sequence line. Quality scores
//! can also be introduced by '*', in which case the quality line tag is
//! read and the remainder is parsed as quality scores in 4Q format. If
//! no bases are found, but four Q scores are available, bases are
//! "called" internally.
//!
//! \todo Add sensible error checking (e.g. consistent line lengths, no
//! additional junk, no wrong characters).
//!
//! \param zis stream to read from
//! \param r read structure to place the sequence in
//! \param solexa_scores
//! If set, quality scores are converted from Solexa to Phred
//! conventions.
//! \param origin
//! The origin is subtracted from raw scores in ASCII files. It is
//! 33 for original FASTQ files, but 64 for files with Illumina
//! brain damage, in which case it may or may not have to be
//! combined with setting solexa_scores.
//! \return true iff a sequence could be read
bool read_fastq( google::protobuf::io::ZeroCopyInputStream *zis, output::Read& r, bool solexa_scores, char origin ) ;
//! \brief reads a sequence from a SAM file
//! \param zis stream to read from
//! \return true iff a sequence could be read
bool read_sam( google::protobuf::io::ZeroCopyInputStream *zis, const std::string& genome, output::Result& r ) ;
#endif
|