This file is indexed.

/usr/include/anfo/ducttape.h is in libanfo0-dev 0.98-4+b1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
//    Copyright 2009 Udo Stenzel
//    This file is part of ANFO
//
//    ANFO is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    Anfo is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with Anfo.  If not, see <http://www.gnu.org/licenses/>.

#ifndef INCLUDED_DUCTTAPE_H
#define INCLUDED_DUCTTAPE_H

#include "align_fwd.h"
#include "stream.h"
#include "util.h"
#include <string>
#include <vector>

namespace streams {

//! \brief calls a genome wide consensus
//! Duct-taping something is almost the same as assembling it, it's just
//! not as durable, and that's what we do to a genome here.  Input is a
//! stream ordered by genome coordinate.  For every position covered at
//! least once, we calculate the likelihood for every possible
//! combination of two alleles.  In principle, those bases can be
//! written out as soon as reads stop covering them, however, since we
//! will write contigs, we can only write them when we no longer have
//! coverage (or reach the end of a contig in the reference?), as the
//! GLZ format wants to know the contig length in advance.
//!
//! \todo What to do at edges?  Sometimes reads will overlap the ends of
//!       contigs, and in principle, we might want to call a consensus
//!       there, too.  Probably nothing at all...
//! \todo What about indels?  How to deal with the shifting coordinates
//!       if our consensus actually contains indels?

class DuctTaper : public Stream
{
	private:
		string name_ ;
		Chan report_ ;
		Logdom het_prior_ ;
		bool have_foot_ ;

		// tracking of current reference sequence; we need to start a
		// new contig if one of these changes
		std::string cur_genome_ ;
		std::string cur_sequence_ ;

		// start of current contig on the reference; we're always on the
		// forward strand
		uint32_t contig_start_ ;
		uint32_t contig_end_ ;

		// Collects number of observed bases or gaps per position in
		// contig (to allow calling of indels, and because someone might
		// ask for it) and accumulated likelihoods for each base.
		// Accumulated quantity is \f$ P(\omega | x) * P(y->x) * P(y)
		// \f$, for we can easily calculate a probability from that.
		// Order is A,C,G,T,-.  Also tracks whether a column is inserted
		// relative to the reference.
		struct Acc {
			int seen[4] ;		// # of times A,C,G,T was seen
			int gapped ;		// # of times a gap was seen
			int crossed ;		// # of times a read overlapped the gap before this col.
			bool is_ins ;		// was this iserted rel. to reference?
			Logdom lk[10] ; 	// likelihoods for allele pairs

			Acc( bool ins = false, int gap = 0 ) : gapped(gap), crossed(0), is_ins(ins)
			{ seen[0] = seen[1] = seen[2] = seen[3] = 0 ; }

			bool pristine() const
			{ for( int i = 0 ; i != 4 ; ++i ) if( seen[i] ) return false ; return gapped == 0 ; }
		} ;
		typedef std::vector< Acc > Accs ;
		Accs observed_ ;

		// for silly statistics
		size_t nreads_, num_ ; 

		// for MAPQ -- GLF wants MAPQ to be the RMS of the MAPQs that
		// went into a contig.
		int mapq_accum_ ;

		// needed for probability calculations
		adna_parblock adna_ ;

		void flush_contig() ;

	public:
		DuctTaper( const string& name, double hp = 0 )
			: name_(name), het_prior_( Logdom::from_float(hp) )
			, have_foot_(false), contig_start_(0), contig_end_(0)
			, nreads_(0), num_(0), mapq_accum_(0), adna_() {}

		virtual void put_header( const Header& ) ;
		virtual void put_result( const Result& ) ;
		virtual void put_footer( const Footer& ) ;
		virtual Result fetch_result() ;

	private:
		void put_result_recent(  const Result& ) ;
		void put_result_ancient( const Result& ) ;
} ;

class GlzWriter : public Stream
{
	private:
		DeflateStream gos_ ;
		Chan chan_ ;

	public:
		GlzWriter( const pair< google::protobuf::io::ZeroCopyOutputStream*, string >& p ) : gos_( p.first ) {}
		virtual void put_result( const Result& ) ;
} ;

//! \brief writes consensuus in text format
//! Format as agreed upon internally:  
//! > "hg18" chromosome ±start
//! ; "pt2" chromosome ±start
//! [repeat as necessary?]
//! <hg18-base> <pt2-base> <nt-majority-base (A,C,G,T)>
//!   <Q-score> <depth> <#gaps> <#A> <#C> <#G> <#T> <4 likelihood-ratios>
//! [repeat as necessary]
//!
//! \todo reconstruction of the involved whole genome alignments is not
//!       yet possible
class ThreeAlnWriter : public Stream
{
	private:
		std::auto_ptr< std::ostream > out_ ;
		std::string name_ ;
		Chan chan_ ;

	public:
		ThreeAlnWriter( const pair< ostream*, string > &p ) : out_( p.first ), name_( p.second ) {}
		virtual void put_result( const Result& ) ;
} ;

} // namespace

#endif