This file is indexed.

/usr/lib/python2.7/dist-packages/cogent/util/recode_alignment.py is in python-cogent 1.9-9.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#!/usr/bin/env python
# Author: Greg Caporaso (gregcaporaso@gmail.com)
# recode_alignment.py

"""This file contains functions for recoding alignment objects with
 reduced-state alphabets, and also defines some reduced-state alphabets.

Example alphabet definitions:
charge_his_3: Three states for +/-/no charge, with histidine counted as 
 a charged residue. 
size_2: Two states for large/small. Splits were manually derived by 
    ordering the Mr of the residues and finding a natural split in the 
    middle of the data set (residues with Mr <= 133 -> small; 
    Mr >= 146 -> large).
    Ordered Mr: [75,89,105,115,117,119,121,131,131,132,133,146,146,147,
                 149,155,165,174,181,204]
    Differences between neighboring Mr values: [14,16,10,2,2,2,10,0,1,1,13,
                                                0,1,2,6,10,9,7,23] 
    The difference between 133 and 146 (or D and K/Q) seems like the most 
        natural split.

Alphabet naming convention: standard alphabets (those defined in this file)
 are named based on an identifier (e.g., charge_his) followed by an underscore,
 followed by the number of states in the reduced alphabet (e.g., 3). 

Alphabet definition convention: When defining reduced alphabets, a reduced 
 state should be represented by the first char listed in that state. This 
 allows for alignments to not have to support new characeters (by changing 
 the MolType of the alignment) and it allows for easier interpretation of the 
 alignment states.
 
Many of the alphabets presented here are discussed in:
Detecting Coevolution by Disregarding Evolution? Tree-Ignorant Metrics of 
Coevolution Perform As Well As Tree-Aware Metrics; J. Gregory Caporaso, 
Sandra Smit, Brett C. Easton, Lawrence Hunter, Gavin A. Huttley, and 
Rob Knight. BMC Evolutionary Biology, 2008. 


"""
from __future__ import division
from optparse import OptionParser
from numpy import take, array, zeros
from cogent.core.alignment import DenseAlignment, Alignment
from cogent.evolve.models import DSO78_matrix, DSO78_freqs
from cogent import PROTEIN

__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Beta"

class RecodeError(Exception):
    """ A generic error to be raised when errors occur in recoding """
    pass

alphabets = {\
    # Note: 3-STATE CHARGE ALPHAS ASSIGN B/Z TO UNCHARGED -- I FIGURE THAT'S 
    # SAFER THAT ASSIGNING THEM TO EITHER +/-, BUT MAYBE THAT'S NOT ACCURATE.
    # AN ALTERNATIVE WOULD BE TO ASSIGN THEM TO WHATEVER THEIR MORE COMMON
    # VALUE WAS IN (e.g.) ALL PROTEINS
    'charge_2': [('K','KRDEBZ'), ('A','ACFGHILMNPQSTVWYX')],\
    'charge_3': [('K','KR'), ('D','DE'), ('A','ACFGHILMNPQSTVWYBXZ')],\
    'charge_his_2': [('K','KRDEHBZ'), ('A','ACFGILMNPQSTVWYX')],\
    'charge_his_3': [('K','KRH'), ('D','DE'), ('A','ACFGILMNPQSTVWYBXZ')],\
    'size_2': [('G','GAVLISPTCNDXB'),('M','MFYWQKHREZ')],\
    'hydropathy_3':[('R','RKDENQHBZ'),('Y','YWSTG'), ('P','PAMCFLVIX')],\
    # B/Z are assigned to the acidic residues, since D/E are more common -- 
    # need to figure out if this is the best way to handle them
    'polarity_his_4':[('D','DEBZ'),('R','RHK'),\
     ('A','AILMFPWV'),('G','GSTCYNQ')],
    # This is a modified A1_4 alphabet to capture natural breaks in the metric
    'a1_4m':[('C','CVILF'),('M','MWA'),('T','GSTPYH'),('N','QNDERK')],

    'a1_2':[('C','CVILFMWAGS'),('T','TPYHQNDERK')],
    'a1_3':[('C','CVILFMW'),('A','AGSTPY'),('H','HQNDERK')],
    'a1_4':[('C','CVILF'),('M','MWAGS'),('T','TPYHQ'),('N','NDERK')],
    'a1_5':[('C','CVIL'),('F','FMWA'),('G','GSTP'),('Y','YHQN'),('D','DERK')],
    'a1_6':[('C','CVI'),('L','LFMW'),('A','AGS'),('T','TPY'),('H','HQND'),\
        ('E','ERK')],
    'a1_7':[('C','CVI'),('L','LFM'),('W','WAG'),('S','ST'),('P','PYH'),\
        ('Q','QND'),('E','ERK')],
    'a1_8':[('C','CVI'),('L','LF'),('M','MWA'),('G','GS'),('T','TPY'),\
        ('H','HQ'),('N','NDE'),('R','RK')],
    'a1_9':[('C','CV'),('I','IL'),('F','FMW'),('A','AG'),\
     ('S','ST'),('P','PY'),\
     ('H','HQN'),('D','DE'),('R','RK')],
    'a1_10':[('C','CV'),('I','IL'),('F','FM'),\
     ('W','WA'),('G','GS'),('T','TP'),\
     ('Y','YH'),('Q','QN'),('D','DE'),('R','RK')],
    'a2_2':[('M','MEALFKIHVQ'),('R','RWDTCNYSGP')],
    'a2_3':[('M','MEALFKI'),('H','HVQRWD'),('T','TCNYSGP')],
    'a2_4':[('M','MEALF'),('K','KIHVQ'),('R','RWDTC'),('N','NYSGP')],
    'a2_5':[('M','MEAL'),('F','FKIH'),('V','VQRW'),('D','DTCN'),('Y','YSGP')],
    'a2_6':[('M','MEA'),('L','LFKI'),('H','HVQ'),('R','RWD'),('T','TCNY'),\
        ('S','SGP')],
    'a2_7':[('M','MEA'),('L','LFK'),('I','IHV'),('Q','QR'),('W','WDT'),\
        ('C','CNY'),('S','SGP')],
    'a2_8':[('M','MEA'),('L','LF'),('K','KIH'),('V','VQ'),('R','RWD'),\
        ('T','TC'),('N','NYS'),('G','GP')],
    'a2_9':[('M','ME'),('A','AL'),('F','FKI'),('H','HV'),('Q','QR'),\
        ('W','WD'),('T','TCN'),('Y','YS'),('G','GP')],
    'a2_10':[('M','ME'),('A','AL'),('F','FK'),('I','IH'),('V','VQ'),\
        ('R','RW'),('D','DT'),('C','CN'),('Y','YS'),('G','GP')],
    'a3_2':[('S','SDQHPLCAVK'),('W','WNGERFITMY')],
    'a3_3':[('S','SDQHPLC'),('A','AVKWNG'),('E','ERFITMY')],
    'a3_4':[('S','SDQHP'),('L','LCAVK'),('W','WNGER'),('F','FITMY')],
    'a3_5':[('S','SDQH'),('P','PLCA'),('V','VKWN'),('G','GERF'),('I','ITMY')],
    'a3_6':[('S','SDQ'),('H','HPLC'),('A','AVK'),('W','WNG'),('E','ERFI'),\
        ('T','TMY')],
    'a3_7':[('S','SDQ'),('H','HPL'),('C','CAV'),('K','KW'),('N','NGE'),\
        ('R','RFI'),('T','TMY')],
    'a3_8':[('S','SDQ'),('H','HP'),('L','LCA'),('V','VK'),('W','WNG'),\
        ('E','ER'),('F','FIT'),('M','MY')],
    'a3_9':[('S','SD'),('Q','QH'),('P','PLC'),('A','AV'),('K','KW'),\
        ('N','NG'),('E','ERF'),('I','IT'),('M','MY')],
    'a3_10':[('S','SD'),('Q','QH'),('P','PL'),('C','CA'),('V','VK'),\
        ('W','WN'),('G','GE'),('R','RF'),('I','IT'),('M','MY')],
    'a4_2':[('W','WHCMYQFKDN'),('E','EIPRSTGVLA')],
    'a4_3':[('W','WHCMYQF'),('K','KDNEIP'),('R','RSTGVLA')],
    'a4_4':[('W','WHCMY'),('Q','QFKDN'),('E','EIPRS'),('T','TGVLA')],
    'a4_5':[('W','WHCM'),('Y','YQFK'),('D','DNEI'),('P','PRST'),('G','GVLA')],
    'a4_6':[('W','WHC'),('M','MYQF'),('K','KDN'),('E','EIP'),('R','RSTG'),\
        ('V','VLA')],
    'a4_7':[('W','WHC'),('M','MYQ'),('F','FKD'),('N','NE'),('I','IPR'),\
        ('S','STG'),('V','VLA')],
    'a4_8':[('W','WHC'),('M','MY'),('Q','QFK'),('D','DN'),('E','EIP'),\
        ('R','RS'),('T','TGV'),('L','LA')],
    'a4_9':[('W','WH'),('C','CM'),('Y','YQF'),('K','KD'),('N','NE'),\
        ('I','IP'),('R','RST'),('G','GV'),('L','LA')],
    'a4_10':[('W','WH'),('C','CM'),('Y','YQ'),('F','FK'),('D','DN'),\
        ('E','EI'),('P','PR'),('S','ST'),('G','GV'),('L','LA')],
    'a5_2':[('D','DSQPVLECWA'),('H','HFINMTYKGR')],
    'a5_3':[('D','DSQPVLE'),('C','CWAHFI'),('N','NMTYKGR')],
    'a5_4':[('D','DSQPV'),('L','LECWA'),('H','HFINM'),('T','TYKGR')],
    'a5_5':[('D','DSQP'),('V','VLEC'),('W','WAHF'),('I','INMT'),('Y','YKGR')],
    'a5_6':[('D','DSQ'),('P','PVLE'),('C','CWA'),('H','HFI'),('N','NMTY'),\
        ('K','KGR')],
    'a5_7':[('D','DSQ'),('P','PVL'),('E','ECW'),('A','AH'),('F','FIN'),\
        ('M','MTY'),('K','KGR')],
    'a5_8':[('D','DSQ'),('P','PV'),('L','LEC'),('W','WA'),('H','HFI'),\
        ('N','NM'),('T','TYK'),('G','GR')],
    'a5_9':[('D','DS'),('Q','QP'),('V','VLE'),('C','CW'),('A','AH'),\
        ('F','FI'),('N','NMT'),('Y','YK'),('G','GR')],
    'a5_10':[('D','DS'),('Q','QP'),('V','VL'),('E','EC'),('W','WA'),\
        ('H','HF'),('I','IN'),('M','MT'),('Y','YK'),('G','GR')],
    # orig does no recoding, but is provided for convenience so if you want to 
    # iterate over all reduced alphabets and the full alphabet, you can do that
    # without having specify the original alphabet differently.
    'orig':zip('ACDEFGHIKLMNPQRSTVWY','ACDEFGHIKLMNPQRSTVWY')} 

def build_alphabet_map(alphabet_id=None,alphabet_def=None):
    """ return dict mapping old alphabet chars to new alphabet chars

        alphabet_id: string identifying an alphabet in 
            cogent.util.recode_alignment.alphabets. 
            (See cogent.util.recode_alignment.alphabets.keys()
            for valid alphabet_ids.)
        alphabet_def: list of two-element tuples where first element is 
            the new alphabet character and the second elements is an iterable 
            object containing the old alphabet chars which should be mapped to
            the new char. 
            e.g., [('A','CVILFMWAGSTPYH'),('B','QNDERKBZ')] 
            (See cogent.util.recode_alignment.alphabets.values() 
            for more examples.)  

        NOTE: Only one of the two parameters should be provided -- you either
            provide the alphabet, or it is looked up. If you do provide both,
            the alphabet_id is ignored.

    """
    try:
        alphabet_def = alphabet_def or alphabets[alphabet_id]
    except KeyError:
        if not alphabet_id:
            raise ValueError,\
                "Must provide an alphabet_id or alphabet definiton."
        raise ValueError, "Invalid alphabet id."
        
    result = {}
    for new, old in alphabet_def:
        for old_c in old:
            result[old_c] = new

    return result

def recode_dense_alignment(aln,alphabet_id=None,alphabet_def=None):
    """Return new DenseAlignment recoded in the provided reduced-state alphabet
    
        aln: the DenseAlignment object to be recoded
        alphabet_id: string identifying an alphabet in 
            cogent.util.recode_alignment.alphabets. 
            (See cogent.util.recode_alignment.alphabets.keys()
            for valid alphabet_ids.)
        alphabet_def: list of two-element tuples where first element is 
            the new alphabet character and the second elements is an iterable 
            object containing the old alphabet chars which should be mapped to
            the new char. 
            e.g., [('A','CVILFMWAGSTPYH'),('B','QNDERKBZ')] 
            (See cogent.util.recode_alignment.alphabets.values() 
            for more examples.)  
         
        Note: either alphabet_id OR alphabet_def must be passed. Either
            provide the alphabet, or have it is looked up. If both are provided
            the alphabet_id is ignored.
         
    """

    # Construct a dict mapping from UInt8s in alignment to their 
    # associated characters. This dict is then used for looking
    # up chars in the new and old alphabets.
    byte_map = dict(zip(aln.Alphabet,range(len(aln.Alphabet))))

    # Construct a dict mapping old characters to new characters.
    alphabet_map = build_alphabet_map(alphabet_id=alphabet_id,\
        alphabet_def=alphabet_def)
   
    # Create the recoded version of seqs.Alphabet 
    new_indices = range(len(aln.Alphabet))
    for old, new in alphabet_map.items():
        new_indices[byte_map[old]] = byte_map[new]    

    # Map the old alphabet onto the new alphabet. Note: characters that
    # that are not mapped are ignored. Returns a new DenseAlignment.
    return DenseAlignment(take(new_indices,aln.ArraySeqs).transpose(),\
        aln.Names[:],MolType=aln.MolType)
        
def recode_alignment(aln,alphabet_id=None,alphabet_def=None):
    raise NotImplementedError

def recode_freq_vector(alphabet_def,freqs,ignores='BXZ'):
    """ recode the bg_freqs to reflect the recoding defined in alphabet_def
        
        alphabet_def: list of tuples where new char is first tuple element
            and sequence of old chars is second tuple element. (For examples,
            see cogent.util.recode_alignment.alphabets.values())
        freqs: dict mapping chars to their frequencies
        ignores: the degenerate characters -- we don't want to include these
            in the new freqs, b/c they'll be counted multiple times. Also,
            isn't clear what should be done if an alphabet were to split them
            apart.

        Note: there is no error-checking here, so you need to be sure that
         the alphabet and the frequencies are compatible (i.e., freqs and the
         old characters must overlap perfectly, with the exception of the
         degenerate characters, which are ignored by default).
    """
    result = {}
    for new,olds in alphabet_def:
        for old in olds:
            if old in ignores: continue
            try:
                result[new] += freqs[old]
            except KeyError:
                result[new] = freqs[old]
    return result

## The following code is for recoding substitution matrices 
def square_matrix_to_dict(matrix,key_order='ACDEFGHIKLMNPQRSTVWY'):
    result = {}
    for c,row in zip(key_order,matrix):
        result[c] = dict(zip(key_order,row))
    return result

def recode_count_matrix(alphabet,count_matrix,aa_order):
    """Recodes a subsitution count matrix 
    
        alphabet: the alphabet to be used for recoding the matrix
         (see cogent.util.recode_alignment.alphabets.values()) for
         examples
        count_matrix: matrix to be recoded (e.g., 
         cogent.evolve.models.DSO78_matrix) 
        aa_order: the order of the rows/cols in the matrix as a string
         (for cogent.evolve.models.DSO78_matrix this would be
         'ACDEFGHIKLMNPQRSTVWY')
        
    """
    m = square_matrix_to_dict(count_matrix,aa_order)
    result = zeros(len(aa_order)**2).reshape(len(aa_order),len(aa_order))
    result = square_matrix_to_dict(result,aa_order)
    for row_new,row_olds in alphabet:
        for col_new,col_olds in alphabet:
            if row_new not in col_olds:
                new_count = 0.
                for row_old in row_olds:
                    for col_old in col_olds:
                        try:
                            new_count += m[row_old][col_old]
                        except KeyError:
                            # hit a char that's not in the sub matrix --
                            # probablyan ambiguous residue (i.e., B, X, or Z)
                            pass
                result[row_new][col_new] = new_count
    cm = []
    for row_c in aa_order:
        r = [] 
        for col_c in aa_order:
            r.append(result[row_c][col_c])
        cm.append(r)
    return array(cm)
 
def recode_counts_and_freqs(alphabet,count_matrix=DSO78_matrix,\
    freqs=DSO78_freqs,aa_order='ACDEFGHIKLMNPQRSTVWY'):
    """ recode a substituion count matrix and a vector of character freqs
    """

    recoded_freqs = recode_freq_vector(alphabet,freqs)
    for aa in aa_order:
        if aa not in recoded_freqs:
            recoded_freqs[aa] = 0.0
    recoded_counts = recode_count_matrix(alphabet,count_matrix,aa_order)
    return recoded_counts,recoded_freqs