/usr/lib/python2.7/dist-packages/cogent/core/genetic_code.py is in python-cogent 1.9-9.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 | #!/usr/bin/env python
"""Translates RNA or DNA string to amino acid sequence.
NOTE: * is used to denote termination (as per NCBI standard).
NOTE: Although the genetic code objects convert DNA to RNA and vice
versa, lists of codons that they produce will be provided in DNA format.
"""
from string import maketrans
import re
__author__ = "Greg Caporaso and Rob Knight"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Greg Caporaso", "Rob Knight", "Peter Maxwell"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Greg Caporaso"
__email__ = "caporaso@colorado.edu"
__status__ = "Production"
class GeneticCodeError(Exception):
pass
class GeneticCodeInitError(ValueError, GeneticCodeError):
pass
class InvalidCodonError(KeyError, GeneticCodeError):
pass
_dna_trans = maketrans('TCAG','AGTC')
def _simple_rc(seq):
"""simple reverse-complement: works only on unambiguous uppercase DNA"""
return seq.translate(_dna_trans)[::-1]
class GeneticCode(object):
"""Holds codon to amino acid mapping, and vice versa.
Usage: gc = GeneticCode(CodeSequence)
sgc = GeneticCode(
'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG')
sgc['UUU'] == 'F'
sgc['TTT'] == 'F'
sgc['F'] == ['TTT', 'TTC'] #in arbitrary order
sgc['*'] == ['TAA', 'TAG', 'TGA'] #in arbitrary order
CodeSequence : 64 character string containing NCBI genetic code translation
GeneticCode is immutable once created.
"""
#class data: need the bases, the list of codons in UUU -> GGG order, and
#a mapping from positions in the list back to codons. These should be the
#same for all GeneticCode instances, and are immutable (therefore private).
_nt = "TCAG"
_codons = [a+b+c for a in _nt for b in _nt for c in _nt]
def __init__(self, CodeSequence, ID=None, Name=None, StartCodonSequence=None):
"""Returns new GeneticCode object.
CodeSequence : 64-character string containing NCBI representation
of the genetic code. Raises GeneticCodeInitError if length != 64.
"""
if (len(CodeSequence) != 64):
raise GeneticCodeInitError,\
"CodeSequence: %s has length %d, but expected 64"\
% (CodeSequence, len(CodeSequence))
self.CodeSequence = CodeSequence
self.ID = ID
self.Name = Name
self.StartCodonSequence = StartCodonSequence
start_codons = {}
if StartCodonSequence:
for codon, aa in zip(self._codons, StartCodonSequence):
if aa != '-':
start_codons[codon] = aa
self.StartCodons = start_codons
codon_lookup = dict(zip(self._codons, CodeSequence))
self.Codons = codon_lookup
#create synonyms for each aa
aa_lookup = {}
for codon in self._codons:
aa = codon_lookup[codon]
if aa not in aa_lookup:
aa_lookup[aa] = [codon]
else:
aa_lookup[aa].append(codon)
self.Synonyms = aa_lookup
sense_codons = codon_lookup.copy()
#create sense codons
stop_codons = self['*']
for c in stop_codons:
del sense_codons[c]
self.SenseCodons = sense_codons
#create anticodons
ac = {}
for aa, codons in self.Synonyms.items():
ac[aa] = map(_simple_rc, codons)
self.Anticodons = ac
def _analyze_quartet(self, codons, aa):
"""Analyzes a quartet of codons and amino acids: returns list of lists.
Each list contains one block, splitting at R/Y if necessary.
codons should be a list of 4 codons.
aa should be a list of 4 amino acid symbols.
Possible states:
- All amino acids are the same: returns list of one quartet.
- Two groups of 2 aa: returns list of two doublets.
- One group of 2 and 2 groups of 1: list of one doublet, 2 singles.
- 4 groups of 1: four singles.
Note: codon blocks like Ile in the standard code (AUU, AUC, AUA) will
be split when they cross the R/Y boundary, so [[AUU, AUC], [AUA]]. This
would also apply to a block like AUC AUA AUG -> [[AUC],[AUA,AUG]],
although this latter pattern is not observed in the standard code.
"""
if aa[0] == aa[1]:
first_doublet = True
else:
first_doublet = False
if aa[2] == aa[3]:
second_doublet = True
else:
second_doublet = False
if first_doublet and second_doublet and aa[1] == aa[2]:
return [codons]
else:
blocks = []
if first_doublet:
blocks.append(codons[:2])
else:
blocks.extend([[codons[0]],[codons[1]]])
if second_doublet:
blocks.append(codons[2:])
else:
blocks.extend([[codons[2]],[codons[3]]])
return blocks
def _get_blocks(self):
"""Returns list of lists of codon blocks in the genetic code.
A codon block can be:
- a quartet, if all 4 XYn codons have the same amino acid.
- a doublet, if XYt and XYc or XYa and XYg have the same aa.
- a singlet, otherwise.
Returns a list of the quartets, doublets, and singlets in the order
UUU -> GGG.
Note that a doublet cannot span the purine/pyrimidine boundary, and
a quartet cannot span the boundary between two codon blocks whose first
two bases differ.
"""
if hasattr(self, '_blocks'):
return self._blocks
else:
blocks = []
curr_codons = []
curr_aa = []
for index, codon, aa in zip(range(64),self._codons,self.CodeSequence):
#we're in a new block if it's a new quartet or a different aa
new_quartet = not index % 4
if new_quartet and curr_codons:
blocks.extend(self._analyze_quartet(curr_codons, curr_aa))
curr_codons = []
curr_aa = []
curr_codons.append(codon)
curr_aa.append(aa)
#don't forget to append last block
if curr_codons:
blocks.extend(self._analyze_quartet(curr_codons, curr_aa))
self._blocks = blocks
return self._blocks
Blocks = property(_get_blocks)
def __str__(self):
"""Returns CodeSequence that constructs the GeneticCode."""
return self.CodeSequence
def __repr__(self):
"""Returns reconstructable representation of the GeneticCode."""
return 'GeneticCode(%s)' % str(self)
def __cmp__(self, other):
""" Allows two GeneticCode objects to be compared to each other.
Two GeneticCode objects are equal if they have equal CodeSequences.
"""
return cmp(str(self), str(other))
def __getitem__(self, item):
"""Returns amino acid corresponding to codon, or codons for an aa.
Returns [] for empty list of codons, 'X' for unknown amino acid.
"""
item = str(item)
if len(item) == 1: #amino acid
return self.Synonyms.get(item, [])
elif len(item) == 3: #codon
key = item.upper()
key = key.replace('U', 'T')
return self.Codons.get(key, 'X')
else:
raise InvalidCodonError, "Codon or aa %s has wrong length" % item
def translate(self, dna, start=0):
""" Translates DNA to protein with current GeneticCode.
dna = a string of nucleotides
start = position to begin translation (used to implement frames)
Returns string containing amino acid sequence. Translates the entire
sequence: it is the caller's responsibility to find open reading frames.
NOTE: should return Protein object when we have a class for it.
"""
if not dna:
return ''
if start + 1 > len(dna):
raise ValueError, "Translation starts after end of RNA"
return ''.join([self[dna[i:i+3]] for i in range(start, len(dna)-2, 3)])
def getStopIndices(self, dna, start=0):
"""returns indexes for stop codons in the specified frame"""
stops = self['*']
stop_pattern = '(%s)' % '|'.join(stops)
stop_pattern = re.compile(stop_pattern)
seq = str(dna)
found = [hit.start() for hit in stop_pattern.finditer(seq)]
found = [index for index in found if index % 3 == start]
return found
def sixframes(self, dna):
"""Returns six-frame translation as dict containing {frame:translation}
"""
reverse = dna.rc()
return [self.translate(dna, start) for start in range(3)] + \
[self.translate(reverse, start) for start in range(3)]
def isStart(self, codon):
"""Returns True if codon is a start codon, False otherwise."""
fixed_codon = codon.upper().replace('U','T')
return fixed_codon in self.StartCodons
def isStop(self, codon):
"""Returns True if codon is a stop codon, False otherwise."""
return self[codon] == '*'
def changes(self, other):
"""Returns dict of {codon:'XY'} for codons that differ.
X is the string representation of the amino acid in self, Y is the
string representation of the amino acid in other. Always returns a
2-character string.
"""
changes = {}
try:
other_code = other.CodeSequence
except AttributeError: #try using other directly as sequence
other_code = other
for codon, old, new in zip(self._codons, self.CodeSequence, other_code):
if old != new:
changes[codon] = old+new
return changes
NcbiGeneticCodeData = [GeneticCode(*data) for data in [
[
'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
1,
'Standard Nuclear',
'---M---------------M---------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG',
2,
'Vertebrate Mitochondrial',
'--------------------------------MMMM---------------M------------',
],
[
'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
3,
'Yeast Mitochondrial',
'----------------------------------MM----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
4,
'Mold, Protozoan, and Coelenterate Mitochondrial, and Mycoplasma/Spiroplasma Nuclear',
'--MM---------------M------------MMMM---------------M------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG',
5,
'Invertebrate Mitochondrial',
'---M----------------------------MMMM---------------M------------',
],
[
'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
6,
'Ciliate, Dasycladacean and Hexamita Nuclear',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
9,
'Echinoderm and Flatworm Mitochondrial',
'-----------------------------------M---------------M------------',
],
[
'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
10,
'Euplotid Nuclear',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
11,
'Bacterial Nuclear and Plant Plastid',
'---M---------------M------------MMMM---------------M------------',
],
[
'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
12,
'Alternative Yeast Nuclear',
'-------------------M---------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG',
13,
'Ascidian Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
14,
'Alternative Flatworm Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
15,
'Blepharisma Nuclear',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
16,
'Chlorophycean Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG',
20,
'Trematode Mitochondrial',
'-----------------------------------M---------------M------------',
],
[
'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
22,
'Scenedesmus obliquus Mitochondrial',
'-----------------------------------M----------------------------',
],
[
'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
23,
'Thraustochytrium Mitochondrial',
],
]]
#build dict of GeneticCodes keyed by ID (as int, not str)
GeneticCodes = dict([(i.ID, i) for i in NcbiGeneticCodeData])
#add str versions for convenience
for key, value in GeneticCodes.items():
GeneticCodes[str(key)] = value
DEFAULT = GeneticCodes[1]
|