/usr/lib/python2.7/dist-packages/cogent/align/weights/util.py is in python-cogent 1.9-9.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 | #!/usr/bin/env python
"""Provides utility methods for sequence weighting
"""
from __future__ import division
from numpy import array, zeros, dot as matrixmultiply, ones, identity, take,\
asarray, uint8 as UInt8, add, sqrt
from random import choice
from cogent.util.array import hamming_distance
from cogent.core.profile import Profile, CharMeaningProfile
from cogent.core.moltype import DNA, RNA, PROTEIN
from cogent.core.alignment import Alignment
__author__ = "Sandra Smit"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Sandra Smit", "Rob Knight", "Gavin Huttley", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Sandra Smit"
__email__ = "sandra.smit@colorado.edu"
__status__ = "Development"
PROTEIN_ORDER = ''.join(PROTEIN.Alphabet) + '-'
DNA_ORDER = ''.join(DNA.Alphabet) + '-'
RNA_ORDER = ''.join(RNA.Alphabet) + '-'
class Weights(dict):
"""Dictionary seq_ids: seq weight
"""
def normalize(self):
"""Normalizes to one. Works in place!"""
total = sum(self.values(), 0)
for k, v in self.items():
self[k] = v/total
def number_of_pseudo_seqs(alignment):
"""Returns the # of unique randomized sequences that can be generated
from the alignment.
alignment: Alignment object
A random sequence is created by randomly choosing at each position one
of the residues observed at that position (column) in the alighment.
A single occurrence of that residue type is sufficient to make it an
option and the choice is with equal likelihood from any of the observed
characters. (See Implementation Notes for more details).
"""
return reduce(lambda x, y: x*y, map(len,alignment.columnProbs()))
def pseudo_seqs_exact(alignment,n=None):
"""Returns all possible pseudo sequences (generated from the alignment)
alignment: Alignment object
n: has NO FUNCTION, except to make the API for all functions that generate
pseudo sequences the same.
This function is used by the VOR method (only when the number of unique
pseudo sequences is lower than 1000).
The original sequences will be generated in this list of pseudo sequences.
Duplications of sequences in the alignment don't cause duplications
in the list of pseudo sequences.
Example:
AA, AA, BB in the alignment
generates pseudo sequences: AA,BB,AB,BA
ABC, BCC, BAC in the alignment
generates pseudo sequences: AAC, ABC, ACC, BAC, BBC, and BCC
"""
counts = alignment.columnFreqs()
results = [[]]
for col in counts:
new_results = []
for item in results:
for option in col:
new_results.append(item + [option])
results = new_results
return [''.join(i) for i in results]
def pseudo_seqs_monte_carlo(alignment,n=1000):
"""Yields sample of possible pseudo sequences (generated from alignment)
alignment: Alignment object
n = number of pseudo sequences generated (=sample size)
This function is used by the VOR method, usually when the number of
possible pseudo sequences exceeds 1000.
To see how the pseudo sequences are generated read the Implementation
Notes at the top of this file.
"""
freqs = alignment.columnFreqs()
for x in range(n):
seq = []
for i in freqs:
seq.append(choice(i.keys()))
yield ''.join(seq)
def row_to_vote(row):
"""Changes distances to votes.
There's one vote in total. The sequence with the lowest distance
gets the vote. If there are multiple sequences equidistant at the
minimum distance, the vote is split over all the sequences at the
minimum distance.
Example:
[4,2,3,7] -> [0,1,0,0]
[1,3,2,1] -> [.5,0,0,.5]
[1,1,1,1] -> [.25,.25,.25,.25]
"""
result = array(row) - min(row) == 0
return result/sum(result, 0)
def distance_matrix(alignment, distance_method=hamming_distance):
"""Returns distance matrix for seqs in the alignment.
Order is either the Names in the alignment or the order in which
is iterated over the rows.
Distance is the Hamming distance between two sequences
"""
#change sequences into arrays
alignment = [array(str(alignment.NamedSeqs[k]), 'c') for k in\
alignment.Names]
nr_of_seqs = len(alignment)
distances = zeros([nr_of_seqs,nr_of_seqs])
for i, seq_a in enumerate(alignment):
for j, seq_b in enumerate(alignment):
if i < j:
dist = distance_method(seq_a,seq_b)
distances[i,j] = dist
distances[j,i] = dist
else: # i==j (diagonal) or i>j (lower left half)
continue
return distances
def eigenvector_for_largest_eigenvalue(matrix):
"""Returns eigenvector corresponding to largest eigenvalue of matrix.
Implements a numerical method for finding an eigenvector by repeated
application of the matrix to a starting vector. For a matrix A the
process w(k) <-- A*w(k-1) converges to eigenvector w with the largest
eigenvalue. Because distance matrix D has all entries >= 0, a theorem
due to Perron and Frobenius on nonnegative matrices guarantees good
behavior of this method, excepting degenerate cases where the
iteration oscillates. For distance matrices a remedy is to add the
identity matrix to A, permitting the iteration to converge to the
eigenvector. (From Sander and Schneider (1991), and Vingron and
Sibbald (1993))
Note: Only works on square matrices.
"""
#always add the identity matrix to avoid oscillating behavior
matrix = matrix + identity(len(matrix))
#v is a random vector (chosen as the normalized vector of ones)
v = ones(len(matrix))/len(matrix)
#iterate until convergence
for i in range(1000):
new_v = matrixmultiply(matrix,v)
new_v = new_v/sum(new_v, 0) #normalize
if sum(map(abs,new_v-v), 0) > 1e-9:
v = new_v #not converged yet
continue
else: #converged
break
return new_v
def distance_to_closest(alignment, distance_method=hamming_distance):
"""Returns vector of distances to closest neighbor for each s in alignment
alignment: Alignment object
distance_method: function used to calculate the distance between two seqs
Function returns the closest distances according to the Names in the
alignment
example:
Alignment({1:'ABCD',2:'ABCC',3:'CBDD',4:'ACAA'},Names=[3,2,1,4])
[2,1,1,3]
"""
names = alignment.Names
seqs = [array(str(alignment.NamedSeqs[n]), 'c') for n in names]
closest = []
for i, key in enumerate(names):
seq = seqs[i]
dist = None
for j, other_key in enumerate(names):
if key == other_key:
continue
d = distance_method(seq,seqs[j])
if dist:
if d < dist:
dist = d
else:
dist = d
closest.append(dist)
return array(closest)
def SeqToProfile(seq, alphabet=None, char_order=None,\
split_degenerates=False):
"""Generates a Profile object from a Sequence object.
seq: Sequence object
alphabet (optional): Alphabet object (if you want to split
degenerate symbols, the alphabet object should have a
Degenerates property. Default is the Alphabet associated with
the Sequence object.
char_order (optional): The order the characters occur in the Profile.
Default is the list(alphabet)
split_degenerates (optional): Whether you want the counts for the
degenerate symbols to be divided over the non-degenerate symbols they
code for.
A Profile is a position x character matrix describing which characters
occur at each position. In a sequence (as opposed to an alignment) only
one character occurs at each position. In general a sequence profile
will only contain ones and zeros. However, you have the possibility of
splitting degenerate characters. For example, if a position is R, it
means that you have 50/50% chance of A and G. It's also possible to
ignore characters, which in a sequence profile will lead to positions
(rows) containing only zeros.
Example:
Sequence = ACGU
Profile(seq, CharOrder=UCAG):
U C A G
0 0 1 0 first pos
0 1 0 0 second pos
0 0 0 1 third pos
1 0 0 0 fourth pos
Sequence= GURY
Profile(seq,CharOrder=UCAG, split_degenerates=True)
U C A G
0 0 0 1 first pos
1 0 0 0 second pos
0 0 .5 .5 third pos
.5 .5 0 0 fourth pos
Characters can also be ignored
Sequence = ACN-
Profile(seq, CharOrder=UCAGN, split_degenerates=True)
U C A G
0 0 1 0 first pos
0 1 0 0 second pos
.25 .25 .25 .25 third pos
0 0 0 0 fourth pos <--contains only zeros
"""
if alphabet is None:
alphabet = seq.MolType
if char_order is None:
char_order = list(alphabet)
#Determine the meaning of each character based on the alphabet, the
#character order, and the option to split degenerates
char_meaning = CharMeaningProfile(alphabet, char_order,\
split_degenerates)
#construct profile data
idxs = array(str(seq).upper(), 'c').view(UInt8)
result_data = char_meaning.Data[idxs]
#result_data = take(char_meaning.Data, asarray(str(seq).upper(), UInt8), axis=0)
return Profile(result_data, alphabet, char_order)
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,\
weights=None):
"""Generates a Profile object from an Alignment.
aln: Alignment object
alphabet (optional): an Alphabet object (or list of chars, but if you
want to split degenerate symbols, the alphabet must have a
Degenerates property. Default is the alphabet of the first seq in
the alignment.
char_order (optional): order of the characters in the profile. Default
is list(alphabet)
split_degenerates (optional): Whether you want the counts for the
degenerate symbols to be divided over the non-degenerate symbols they
code for.
weights (optional): dictionary of seq_id: weight. If not entered all seqs
are weighted equally
A Profile is a position x character matrix describing which characters
occur at each position of an alignment. The Profile is always normalized,
so it gives the probabilities of each character at each position.
Ignoring chars: you can ignore characters in the alignment by not putting
the char in the CharOrder. If you ignore all characters at a particular
position, an error will be raised, because the profile can't be normalized.
Splitting degenerates: you can split degenerate characters over the
non-degenerate characters they code for. For example: R = A or G. So,
an R at a position counts for 0.5 A and 0.5 G.
Example:
seq1 TCAG weight: 0.5
seq2 TAR- weight: 0.25
seq3 YAG- weight: 0.25
Profile(aln,alphabet=DNA,char_order="TACG",weights=w,
split_degenerates=True)
Profile:
T A C G
[[ 0.875 0. 0.125 0. ]
[ 0. 0.5 0.5 0. ]
[ 0. 0.625 0. 0.375]
[ 0. 0. 0. 1. ]]
"""
if alphabet is None:
alphabet = aln.values()[0].MolType
if char_order is None:
char_order = list(alphabet)
if weights is None:
weights = dict.fromkeys(aln.keys(),1/len(aln))
char_meaning = CharMeaningProfile(alphabet, char_order,\
split_degenerates)
profiles = []
for k,v in aln.items():
idxs = array(str(v).upper(), 'c').view(UInt8)
profiles.append(char_meaning.Data[idxs] * weights[k])
s = reduce(add,profiles)
result = Profile(s,alphabet, char_order)
try:
result.normalizePositions()
except Exception, e:
raise ValueError,e
#"Probably one of the rows in your profile adds up to zero,\n "+\
#"because you are ignoring all of the characters in the "+\
#"corresponding\n column in the alignment"
return result
|