/usr/lib/python2.7/dist-packages/cogent/util/recode_alignment.py is in python-cogent 1.9-9.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | #!/usr/bin/env python
# Author: Greg Caporaso (gregcaporaso@gmail.com)
# recode_alignment.py
"""This file contains functions for recoding alignment objects with
reduced-state alphabets, and also defines some reduced-state alphabets.
Example alphabet definitions:
charge_his_3: Three states for +/-/no charge, with histidine counted as
a charged residue.
size_2: Two states for large/small. Splits were manually derived by
ordering the Mr of the residues and finding a natural split in the
middle of the data set (residues with Mr <= 133 -> small;
Mr >= 146 -> large).
Ordered Mr: [75,89,105,115,117,119,121,131,131,132,133,146,146,147,
149,155,165,174,181,204]
Differences between neighboring Mr values: [14,16,10,2,2,2,10,0,1,1,13,
0,1,2,6,10,9,7,23]
The difference between 133 and 146 (or D and K/Q) seems like the most
natural split.
Alphabet naming convention: standard alphabets (those defined in this file)
are named based on an identifier (e.g., charge_his) followed by an underscore,
followed by the number of states in the reduced alphabet (e.g., 3).
Alphabet definition convention: When defining reduced alphabets, a reduced
state should be represented by the first char listed in that state. This
allows for alignments to not have to support new characeters (by changing
the MolType of the alignment) and it allows for easier interpretation of the
alignment states.
Many of the alphabets presented here are discussed in:
Detecting Coevolution by Disregarding Evolution? Tree-Ignorant Metrics of
Coevolution Perform As Well As Tree-Aware Metrics; J. Gregory Caporaso,
Sandra Smit, Brett C. Easton, Lawrence Hunter, Gavin A. Huttley, and
Rob Knight. BMC Evolutionary Biology, 2008.
"""
from __future__ import division
from optparse import OptionParser
from numpy import take, array, zeros
from cogent.core.alignment import DenseAlignment, Alignment
from cogent.evolve.models import DSO78_matrix, DSO78_freqs
from cogent import PROTEIN
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Beta"
class RecodeError(Exception):
""" A generic error to be raised when errors occur in recoding """
pass
alphabets = {\
# Note: 3-STATE CHARGE ALPHAS ASSIGN B/Z TO UNCHARGED -- I FIGURE THAT'S
# SAFER THAT ASSIGNING THEM TO EITHER +/-, BUT MAYBE THAT'S NOT ACCURATE.
# AN ALTERNATIVE WOULD BE TO ASSIGN THEM TO WHATEVER THEIR MORE COMMON
# VALUE WAS IN (e.g.) ALL PROTEINS
'charge_2': [('K','KRDEBZ'), ('A','ACFGHILMNPQSTVWYX')],\
'charge_3': [('K','KR'), ('D','DE'), ('A','ACFGHILMNPQSTVWYBXZ')],\
'charge_his_2': [('K','KRDEHBZ'), ('A','ACFGILMNPQSTVWYX')],\
'charge_his_3': [('K','KRH'), ('D','DE'), ('A','ACFGILMNPQSTVWYBXZ')],\
'size_2': [('G','GAVLISPTCNDXB'),('M','MFYWQKHREZ')],\
'hydropathy_3':[('R','RKDENQHBZ'),('Y','YWSTG'), ('P','PAMCFLVIX')],\
# B/Z are assigned to the acidic residues, since D/E are more common --
# need to figure out if this is the best way to handle them
'polarity_his_4':[('D','DEBZ'),('R','RHK'),\
('A','AILMFPWV'),('G','GSTCYNQ')],
# This is a modified A1_4 alphabet to capture natural breaks in the metric
'a1_4m':[('C','CVILF'),('M','MWA'),('T','GSTPYH'),('N','QNDERK')],
'a1_2':[('C','CVILFMWAGS'),('T','TPYHQNDERK')],
'a1_3':[('C','CVILFMW'),('A','AGSTPY'),('H','HQNDERK')],
'a1_4':[('C','CVILF'),('M','MWAGS'),('T','TPYHQ'),('N','NDERK')],
'a1_5':[('C','CVIL'),('F','FMWA'),('G','GSTP'),('Y','YHQN'),('D','DERK')],
'a1_6':[('C','CVI'),('L','LFMW'),('A','AGS'),('T','TPY'),('H','HQND'),\
('E','ERK')],
'a1_7':[('C','CVI'),('L','LFM'),('W','WAG'),('S','ST'),('P','PYH'),\
('Q','QND'),('E','ERK')],
'a1_8':[('C','CVI'),('L','LF'),('M','MWA'),('G','GS'),('T','TPY'),\
('H','HQ'),('N','NDE'),('R','RK')],
'a1_9':[('C','CV'),('I','IL'),('F','FMW'),('A','AG'),\
('S','ST'),('P','PY'),\
('H','HQN'),('D','DE'),('R','RK')],
'a1_10':[('C','CV'),('I','IL'),('F','FM'),\
('W','WA'),('G','GS'),('T','TP'),\
('Y','YH'),('Q','QN'),('D','DE'),('R','RK')],
'a2_2':[('M','MEALFKIHVQ'),('R','RWDTCNYSGP')],
'a2_3':[('M','MEALFKI'),('H','HVQRWD'),('T','TCNYSGP')],
'a2_4':[('M','MEALF'),('K','KIHVQ'),('R','RWDTC'),('N','NYSGP')],
'a2_5':[('M','MEAL'),('F','FKIH'),('V','VQRW'),('D','DTCN'),('Y','YSGP')],
'a2_6':[('M','MEA'),('L','LFKI'),('H','HVQ'),('R','RWD'),('T','TCNY'),\
('S','SGP')],
'a2_7':[('M','MEA'),('L','LFK'),('I','IHV'),('Q','QR'),('W','WDT'),\
('C','CNY'),('S','SGP')],
'a2_8':[('M','MEA'),('L','LF'),('K','KIH'),('V','VQ'),('R','RWD'),\
('T','TC'),('N','NYS'),('G','GP')],
'a2_9':[('M','ME'),('A','AL'),('F','FKI'),('H','HV'),('Q','QR'),\
('W','WD'),('T','TCN'),('Y','YS'),('G','GP')],
'a2_10':[('M','ME'),('A','AL'),('F','FK'),('I','IH'),('V','VQ'),\
('R','RW'),('D','DT'),('C','CN'),('Y','YS'),('G','GP')],
'a3_2':[('S','SDQHPLCAVK'),('W','WNGERFITMY')],
'a3_3':[('S','SDQHPLC'),('A','AVKWNG'),('E','ERFITMY')],
'a3_4':[('S','SDQHP'),('L','LCAVK'),('W','WNGER'),('F','FITMY')],
'a3_5':[('S','SDQH'),('P','PLCA'),('V','VKWN'),('G','GERF'),('I','ITMY')],
'a3_6':[('S','SDQ'),('H','HPLC'),('A','AVK'),('W','WNG'),('E','ERFI'),\
('T','TMY')],
'a3_7':[('S','SDQ'),('H','HPL'),('C','CAV'),('K','KW'),('N','NGE'),\
('R','RFI'),('T','TMY')],
'a3_8':[('S','SDQ'),('H','HP'),('L','LCA'),('V','VK'),('W','WNG'),\
('E','ER'),('F','FIT'),('M','MY')],
'a3_9':[('S','SD'),('Q','QH'),('P','PLC'),('A','AV'),('K','KW'),\
('N','NG'),('E','ERF'),('I','IT'),('M','MY')],
'a3_10':[('S','SD'),('Q','QH'),('P','PL'),('C','CA'),('V','VK'),\
('W','WN'),('G','GE'),('R','RF'),('I','IT'),('M','MY')],
'a4_2':[('W','WHCMYQFKDN'),('E','EIPRSTGVLA')],
'a4_3':[('W','WHCMYQF'),('K','KDNEIP'),('R','RSTGVLA')],
'a4_4':[('W','WHCMY'),('Q','QFKDN'),('E','EIPRS'),('T','TGVLA')],
'a4_5':[('W','WHCM'),('Y','YQFK'),('D','DNEI'),('P','PRST'),('G','GVLA')],
'a4_6':[('W','WHC'),('M','MYQF'),('K','KDN'),('E','EIP'),('R','RSTG'),\
('V','VLA')],
'a4_7':[('W','WHC'),('M','MYQ'),('F','FKD'),('N','NE'),('I','IPR'),\
('S','STG'),('V','VLA')],
'a4_8':[('W','WHC'),('M','MY'),('Q','QFK'),('D','DN'),('E','EIP'),\
('R','RS'),('T','TGV'),('L','LA')],
'a4_9':[('W','WH'),('C','CM'),('Y','YQF'),('K','KD'),('N','NE'),\
('I','IP'),('R','RST'),('G','GV'),('L','LA')],
'a4_10':[('W','WH'),('C','CM'),('Y','YQ'),('F','FK'),('D','DN'),\
('E','EI'),('P','PR'),('S','ST'),('G','GV'),('L','LA')],
'a5_2':[('D','DSQPVLECWA'),('H','HFINMTYKGR')],
'a5_3':[('D','DSQPVLE'),('C','CWAHFI'),('N','NMTYKGR')],
'a5_4':[('D','DSQPV'),('L','LECWA'),('H','HFINM'),('T','TYKGR')],
'a5_5':[('D','DSQP'),('V','VLEC'),('W','WAHF'),('I','INMT'),('Y','YKGR')],
'a5_6':[('D','DSQ'),('P','PVLE'),('C','CWA'),('H','HFI'),('N','NMTY'),\
('K','KGR')],
'a5_7':[('D','DSQ'),('P','PVL'),('E','ECW'),('A','AH'),('F','FIN'),\
('M','MTY'),('K','KGR')],
'a5_8':[('D','DSQ'),('P','PV'),('L','LEC'),('W','WA'),('H','HFI'),\
('N','NM'),('T','TYK'),('G','GR')],
'a5_9':[('D','DS'),('Q','QP'),('V','VLE'),('C','CW'),('A','AH'),\
('F','FI'),('N','NMT'),('Y','YK'),('G','GR')],
'a5_10':[('D','DS'),('Q','QP'),('V','VL'),('E','EC'),('W','WA'),\
('H','HF'),('I','IN'),('M','MT'),('Y','YK'),('G','GR')],
# orig does no recoding, but is provided for convenience so if you want to
# iterate over all reduced alphabets and the full alphabet, you can do that
# without having specify the original alphabet differently.
'orig':zip('ACDEFGHIKLMNPQRSTVWY','ACDEFGHIKLMNPQRSTVWY')}
def build_alphabet_map(alphabet_id=None,alphabet_def=None):
""" return dict mapping old alphabet chars to new alphabet chars
alphabet_id: string identifying an alphabet in
cogent.util.recode_alignment.alphabets.
(See cogent.util.recode_alignment.alphabets.keys()
for valid alphabet_ids.)
alphabet_def: list of two-element tuples where first element is
the new alphabet character and the second elements is an iterable
object containing the old alphabet chars which should be mapped to
the new char.
e.g., [('A','CVILFMWAGSTPYH'),('B','QNDERKBZ')]
(See cogent.util.recode_alignment.alphabets.values()
for more examples.)
NOTE: Only one of the two parameters should be provided -- you either
provide the alphabet, or it is looked up. If you do provide both,
the alphabet_id is ignored.
"""
try:
alphabet_def = alphabet_def or alphabets[alphabet_id]
except KeyError:
if not alphabet_id:
raise ValueError,\
"Must provide an alphabet_id or alphabet definiton."
raise ValueError, "Invalid alphabet id."
result = {}
for new, old in alphabet_def:
for old_c in old:
result[old_c] = new
return result
def recode_dense_alignment(aln,alphabet_id=None,alphabet_def=None):
"""Return new DenseAlignment recoded in the provided reduced-state alphabet
aln: the DenseAlignment object to be recoded
alphabet_id: string identifying an alphabet in
cogent.util.recode_alignment.alphabets.
(See cogent.util.recode_alignment.alphabets.keys()
for valid alphabet_ids.)
alphabet_def: list of two-element tuples where first element is
the new alphabet character and the second elements is an iterable
object containing the old alphabet chars which should be mapped to
the new char.
e.g., [('A','CVILFMWAGSTPYH'),('B','QNDERKBZ')]
(See cogent.util.recode_alignment.alphabets.values()
for more examples.)
Note: either alphabet_id OR alphabet_def must be passed. Either
provide the alphabet, or have it is looked up. If both are provided
the alphabet_id is ignored.
"""
# Construct a dict mapping from UInt8s in alignment to their
# associated characters. This dict is then used for looking
# up chars in the new and old alphabets.
byte_map = dict(zip(aln.Alphabet,range(len(aln.Alphabet))))
# Construct a dict mapping old characters to new characters.
alphabet_map = build_alphabet_map(alphabet_id=alphabet_id,\
alphabet_def=alphabet_def)
# Create the recoded version of seqs.Alphabet
new_indices = range(len(aln.Alphabet))
for old, new in alphabet_map.items():
new_indices[byte_map[old]] = byte_map[new]
# Map the old alphabet onto the new alphabet. Note: characters that
# that are not mapped are ignored. Returns a new DenseAlignment.
return DenseAlignment(take(new_indices,aln.ArraySeqs).transpose(),\
aln.Names[:],MolType=aln.MolType)
def recode_alignment(aln,alphabet_id=None,alphabet_def=None):
raise NotImplementedError
def recode_freq_vector(alphabet_def,freqs,ignores='BXZ'):
""" recode the bg_freqs to reflect the recoding defined in alphabet_def
alphabet_def: list of tuples where new char is first tuple element
and sequence of old chars is second tuple element. (For examples,
see cogent.util.recode_alignment.alphabets.values())
freqs: dict mapping chars to their frequencies
ignores: the degenerate characters -- we don't want to include these
in the new freqs, b/c they'll be counted multiple times. Also,
isn't clear what should be done if an alphabet were to split them
apart.
Note: there is no error-checking here, so you need to be sure that
the alphabet and the frequencies are compatible (i.e., freqs and the
old characters must overlap perfectly, with the exception of the
degenerate characters, which are ignored by default).
"""
result = {}
for new,olds in alphabet_def:
for old in olds:
if old in ignores: continue
try:
result[new] += freqs[old]
except KeyError:
result[new] = freqs[old]
return result
## The following code is for recoding substitution matrices
def square_matrix_to_dict(matrix,key_order='ACDEFGHIKLMNPQRSTVWY'):
result = {}
for c,row in zip(key_order,matrix):
result[c] = dict(zip(key_order,row))
return result
def recode_count_matrix(alphabet,count_matrix,aa_order):
"""Recodes a subsitution count matrix
alphabet: the alphabet to be used for recoding the matrix
(see cogent.util.recode_alignment.alphabets.values()) for
examples
count_matrix: matrix to be recoded (e.g.,
cogent.evolve.models.DSO78_matrix)
aa_order: the order of the rows/cols in the matrix as a string
(for cogent.evolve.models.DSO78_matrix this would be
'ACDEFGHIKLMNPQRSTVWY')
"""
m = square_matrix_to_dict(count_matrix,aa_order)
result = zeros(len(aa_order)**2).reshape(len(aa_order),len(aa_order))
result = square_matrix_to_dict(result,aa_order)
for row_new,row_olds in alphabet:
for col_new,col_olds in alphabet:
if row_new not in col_olds:
new_count = 0.
for row_old in row_olds:
for col_old in col_olds:
try:
new_count += m[row_old][col_old]
except KeyError:
# hit a char that's not in the sub matrix --
# probablyan ambiguous residue (i.e., B, X, or Z)
pass
result[row_new][col_new] = new_count
cm = []
for row_c in aa_order:
r = []
for col_c in aa_order:
r.append(result[row_c][col_c])
cm.append(r)
return array(cm)
def recode_counts_and_freqs(alphabet,count_matrix=DSO78_matrix,\
freqs=DSO78_freqs,aa_order='ACDEFGHIKLMNPQRSTVWY'):
""" recode a substituion count matrix and a vector of character freqs
"""
recoded_freqs = recode_freq_vector(alphabet,freqs)
for aa in aa_order:
if aa not in recoded_freqs:
recoded_freqs[aa] = 0.0
recoded_counts = recode_count_matrix(alphabet,count_matrix,aa_order)
return recoded_counts,recoded_freqs
|