/usr/lib/python2.7/dist-packages/cogent/evolve/best_likelihood.py is in python-cogent 1.9-9.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | #!/usr/bin/env python
"""
Returns the likelihood resulting from a model in which motif probabilities
are assumed to be equal to the observed motif frequencies, as described by
Goldman (1993). This model is not informative for inferring the evolutionary
process, but its likelihood indicates the maximum possible likelihood value
for a site-independent evolutionary process.
"""
from __future__ import division
from numpy import log
from cogent import LoadSeqs
__author__ = "Helen Lindsay, Gavin Huttley"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Helen Lindsay", "Gavin Huttley", "Daniel McDonald"]
cite = "Goldman, N. (1993). Statistical tests of models of DNA substitution. J Mol Evol, 36: 182-98"
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Gavin Huttley"
__email__ = "gavin.huttley@anu.edu.au"
__status__ = "Production"
def _transpose(array):
new_array = []
num_cols = len(array[0])
for col in range(num_cols):
new_row = []
for row in array:
new_row += [row[col]]
new_array.append(new_row)
return new_array
def _take(array, indices):
new_array = []
for index in indices:
new_array.append(array[index])
return new_array
def aligned_columns_to_rows(aln, motif_len, exclude_chars = None, allowed_chars='ACGT'):
"""return alignment broken into motifs as a transposed list with
sequences as columns and aligned columns as rows
Arguments:
exclude_chars: columns containing these characters will be excluded"""
if exclude_chars:
exclude_chars = set(exclude_chars)
exclude_func = exclude_chars.intersection
else:
allowed_chars = set(allowed_chars)
exclude_func = lambda x: not allowed_chars.issuperset(x)
exclude_indices = set()
array = []
for name in aln.Names:
motifs = list(aln.getGappedSeq(name).getInMotifSize(motif_len))
array.append(motifs)
for motif_index, motif in enumerate(motifs):
if exclude_func(motif):
exclude_indices.update([motif_index])
include_indices = set(range(len(array[0]))).difference(exclude_indices)
include_indices = list(include_indices)
include_indices.sort()
array = _transpose(array)
array = _take(array, include_indices)
return array
def count_column_freqs(columns_list):
"""return the frequency of columns"""
col_freq_dict = {}
for column in columns_list:
column = ' '.join(column)
col_freq_dict[column] = col_freq_dict.get(column, 0) + 1
return col_freq_dict
def get_ML_probs(columns_list, with_patterns=False):
"""returns the column log-likelihoods and frequencies
Argument:
- with_patterns: the site patterns are returned"""
n = len(columns_list)
col_freq_dict = count_column_freqs(columns_list)
col_lnL_freqs = []
for column_pattern, freq in col_freq_dict.items():
# note, the behaviour of / is changed due to the __future__ import
if with_patterns:
row = [column_pattern, freq/n, freq]
else:
row = [freq/n, freq]
col_lnL_freqs.append(row)
return col_lnL_freqs
def get_G93_lnL_from_array(columns_list):
"""return the best log likelihood for a list of aligned columns"""
col_stats = get_ML_probs(columns_list)
log_likelihood = 0
for freq, num in col_stats:
pattern_lnL = log(freq)*num
log_likelihood += pattern_lnL
return log_likelihood
def BestLogLikelihood(aln, alphabet=None, exclude_chars = None,
allowed_chars='ACGT', motif_length=None, return_length=False):
"""returns the best log-likelihood according to Goldman 1993.
Arguments:
- alphabet: a sequence alphabet object.
- motif_length: 1 for nucleotide, 2 for dinucleotide, etc ..
- exclude_chars: a series of characters used to exclude motifs
- allowed_chars: only motifs that contain a subset of these are
allowed
- return_length: whether to also return the number of alignment columns
"""
assert alphabet or motif_length, "Must provide either an alphabet or a"\
" motif_length"
# need to use the alphabet, so we can enforce character compliance
if alphabet:
kwargs = dict(moltype=alphabet.MolType)
motif_length = alphabet.getMotifLen()
else:
kwargs = {}
aln = LoadSeqs(data=aln.todict(), **kwargs)
columns = aligned_columns_to_rows(aln, motif_length, exclude_chars,
allowed_chars)
num_cols = len(columns)
log_likelihood = get_G93_lnL_from_array(columns)
if return_length:
return log_likelihood, num_cols
return log_likelihood
|