/usr/share/pyshared/qiime/golay.py is in qiime 1.8.0+dfsg-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | #!/usr/bin/env python
import numpy
__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Justin Kuczynski"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Justin Kuczynski"
__email__ = "justinak@gmail.com"
"""
module provides golay encoding/decoding of DNA barcodes.
may not be the same golay code as previously used.
Provides mainly decode_nt()
If you wish to assign a read DNA seq barcode to a list of known originals,
correcting for errors if necessary, I recommend you use the generic version
in barcode.py . Golay decoding assumes that the sequence can be any valid
golay sequence (4096 options), not just those you used in the study.
this module implements *a* golay (24,12,8) code. That's 24 bit codewords,
12 bits of information, min 8 bits (hamming distance) between any 2 codewords
there are 2**12 = 4096 codewords, all <= 3 bit errors are corrected, 4 bit
errors are detected, and worse errors can cause erroneously corrected codewords.
Since DNA has 4 nucleotides, each base represents 2 bits (see
DEFAULT_GOLAY_NT_TO_BITS). The default values represent A <--> C and G <--> T
as 2 bit errors, all other nucleotide swaps are 1 bit errors.
e.g.:
cw1 = AAACCCGGGTTT (24 bits, 12 nucleotides)
cw2 = AAACCCGGGTTA (bit distance of 2 from cw1)
cw3 = AAACCCGGGTTG (bit distance of 1 from cw1)
A specific golay code was chosen, as there are multiple.
bitvectors as referenced here are often just listlike objects with ints
refs:
http://www-math.mit.edu/phase2/UJM/vol1/MKANEM~1.PDF
http://cwww.ee.nctu.edu.tw/~ftchien/class/ecc08f/topic2.pdf
error correcting coding theory (Rhee)
the art of error correcting coding (Morelos-Zaragoza)
TO DOs:
* types are messy, numpy arrays, lists, tuples, all doing the same stuff
* haven't tested on all 2**24 bitvectors, could do that to be thorough
* test speed performance
"""
def get_invalid_golay_barcodes(seqs):
result = []
for e in seqs:
if len(e) != 12:
result.append(e)
elif decode(e)[1] > 0:
result.append(e)
return result
def decode(seq, nt_to_bits=None):
"""decodes a nucleotide string of 12 bases, using bitwise error checking
inputs:
- seq, a string of nucleotides
- nt_to_bits, e.g.: { "A":"11", "C":"00", "T":"10", "G":"01"}
output:
corrected_seq (str), num_bit_errors
corrected_seq is None if 4 bit error detected"""
if nt_to_bits == None:
nt_to_bits = DEFAULT_GOLAY_NT_TO_BITS
received_bits = _seq_to_bits(seq, nt_to_bits)
corrected_bits, num_errors = decode_bits(received_bits) # errors in # bits
if corrected_bits == None:
return None, num_errors
else:
# put match into nucleotide format
return _bits_to_seq(corrected_bits, nt_to_bits), num_errors
# alt name for the decode function for consistency with hamming decoding
decode_golay_12 = decode
def encode(bits, nt_to_bits=None):
""" takes any 12 bits, returns the golay 24bit codeword in nucleotide format
bits is a list/array, 12 long, e.g.: [0,0,0,0,0,0,0,0,0,1,0,0]
nt_to_bits is e.g.: {"A":"11", "C":"00", "T":"10", "G":"01"},None => default
output is e.g.: 'AGTCTATTGGCT'
"""
if nt_to_bits == None:
nt_to_bits = DEFAULT_GOLAY_NT_TO_BITS
bits = numpy.array(bits).reshape((12,1))
# cheap way to do binary xor in matrix dot
res = numpy.dot(DEFAULT_G.T, bits)
codeword = divmod(res.ravel(),2)[1]
return _bits_to_seq(codeword, nt_to_bits)
def decode_bits(received_bitvec):
""" decode a recieved 24 bit vector to a corrected 24 bit vector
uses golay defaults
input: received bitvec is 24 bits long, listlike
output: corrected_vec, num_bit_errors
corrected_vec is None iff num_errors = 4"""
rec = received_bitvec
syn = numpy.dot(DEFAULT_H, rec) % 2
try:
err = numpy.array(DEFAULT_SYNDROME_LUT[tuple(syn)])
except KeyError:
return None, 4
corrected = (rec + err) % 2 # best guess for transmitted bitvector
return corrected, numpy.sum(err)
## begin support fns
def _make_3bit_errors(veclen=24):
""" return list of all bitvectors with <= 3 bits as 1's, rest 0's
returns list of lists, each 24 bits long by default.
not included:
[0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0]
included:
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
"""
errorvecs = []
# all zeros
errorvecs.append([0]*veclen)
# one 1
for i in range(veclen):
vec = [0]*veclen
vec[i] = 1
errorvecs.append(vec)
# two 1s
for i in range(veclen):
for j in range(i+1,veclen):
vec = [0]*veclen
vec[i] = 1
vec[j] = 1
errorvecs.append(vec)
# three 1s
for i in range(veclen):
for j in range(i+1,veclen):
for k in range(j+1,veclen):
vec = [0]*veclen
vec[i] = 1
vec[j] = 1
vec[k] = 1
errorvecs.append(vec)
return errorvecs
def _seq_to_bits(seq, nt_to_bits):
""" e.g.: "AAG" -> array([0,0,0,0,1,0])
output is array of ints, 1's and 0's
nt_to_bits is e.g.: {"A":"11", "C":"00", "T":"10", "G":"01"}
"""
bitstring = ""
for nt in seq:
bitstring += nt_to_bits[nt]
bits = numpy.array(map(int,bitstring))
return bits
def _bits_to_seq(bits, nt_to_bits):
""" e.g.: array([0,0,0,0,1,0]) -> "AAG"
nt_to_bits is e.g.: {"A":"11", "C":"00", "T":"10", "G":"01"}
"""
bits_to_nt = dict(zip(nt_to_bits.values(), nt_to_bits.keys()))
seq = ""
for i in range(0,len(bits),2): #take bits in twos
bit1 = str(int(round(bits[i])))
bit2 = str(int(round(bits[i+1])))
seq += bits_to_nt[bit1+bit2]
return seq
## end support fns
## BEGIN module level constants
DEFAULT_GOLAY_NT_TO_BITS = { "A":"11", "C":"00", "T":"10", "G":"01"}
# We use this matrix as the parity submatrix P
DEFAULT_P = numpy.array([
[0,1,1,1,1,1,1,1,1,1,1,1,],
[1,1,1,0,1,1,1,0,0,0,1,0,],
[1,1,0,1,1,1,0,0,0,1,0,1,],
[1,0,1,1,1,0,0,0,1,0,1,1,],
[1,1,1,1,0,0,0,1,0,1,1,0,],
[1,1,1,0,0,0,1,0,1,1,0,1,],
[1,1,0,0,0,1,0,1,1,0,1,1,],
[1,0,0,0,1,0,1,1,0,1,1,1,],
[1,0,0,1,0,1,1,0,1,1,1,0,],
[1,0,1,0,1,1,0,1,1,1,0,0,],
[1,1,0,1,1,0,1,1,1,0,0,0,],
[1,0,1,1,0,1,1,1,0,0,0,1,],
],dtype='int') #from http://courses.csusm.edu/math540ak/codes.pdf
# generator mtx G, where transmitted codewords (24bits) are
# G.T dot msg or (msg dot G) (msg is 12 bit message)
# 2**12 = 4096 total codewords, one for each msg
# (all mod 2 arithmetic)
# other G matrices give golay (24,12,8) codes, but this one
# matches existing codes from pre 2010 used in knight lab
DEFAULT_G = numpy.concatenate((DEFAULT_P, numpy.eye(12,dtype="int")),axis=1)
# pairity check matrix H satisfies G dot H.T = zeros (mod 2 arithmetic)
# also satisfies syn = H dot rec = H dot err (rec is recieved 24 bits,
# err is 24 bit error string added to transmitted 24 bit vec)
# (all mod 2 arithmetic)
DEFAULT_H = numpy.concatenate((numpy.eye(12,dtype="int"),DEFAULT_P.T),axis=1)
_ALL_3BIT_ERRORS = _make_3bit_errors()
# len = 2325. (1 (all zeros) + 24 (one 1) + 276 (two 1s) + 2024)
# syndrome lookup table is the key to (fast, syndrome) decoding
# decode() uses syndrome lookup table
DEFAULT_SYNDROME_LUT = {}
# key: syndrome (12 bits). Val: 24 bit err for that syn
# we include the all zeros error (key = all zeros syndrome)
# build syndrome lookup table
for errvec in _ALL_3BIT_ERRORS:
syn = tuple( numpy.dot(DEFAULT_H, errvec) % 2)
DEFAULT_SYNDROME_LUT[syn] = ( errvec)
## END module level constants
|