/usr/lib/python2.7/dist-packages/biotools/sequence.py is in python-biotools 1.2.12-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | from biotools.annotation import Annotation
def chop(seq, length=70):
'''
Yields a chunk of a sequence of no more than `length` characters,
it is meant to be used to print fasta files.
'''
while seq:
try:
piece, seq = seq[:length], seq[length:]
except IndexError:
piece, seq = seq, ''
yield piece
raise StopIteration()
def isprot(seq, nucleotides='ATUCGNYRatucgnyr- '):
'''
Check whether the current sequence is a protein or nucleotide sequence.
'''
for c in seq:
if c not in nucleotides:
return True
else:
return False
class Sequence(object):
'''
A wrapper class for sequences.
'''
def __init__(self, name, seq, **kwargs):
'''
Instantiates a Sequence object with a given sequence. Some other
useful parameters that the `Sequence` constructor can handle are:
* `qual` => the quality scores (an array of integers) of the
sequence,
* `type` => the type of the sequence, either prot or nucl,
* `start` => the starting position of the sequence within a
supersequence,
* `end` => the ending position of the sequnece within a
supersequence,
* `step` => the 'step' of the sequence, usually +1 for top-strand
sequences, and -1 for bottom-strand sequences, but can handle
other values as well,
* `original` => the original `Sequence` object from which this one
derives,
* `defline` => the definition line for this sequnce from a fasta
file.
If one of these are not given, they will default to the most logical
value that can be determined from the other values and sequence (e.g.,
if `end < start`, then `step` is probably -1).
'''
self.name = name
self.seq = seq
self.qual = kwargs.get('qual', None)
if 'type' in kwargs:
self.type = kwargs['type']
self.start = kwargs.get('start', 1)
self.end = kwargs.get('end', self.start - 1 + len(seq))
self.step = kwargs.get('step', -1 if self.start > self.end else 1)
self.original = kwargs.get('original', self)
self.defline = kwargs.get('defline', '')
def __getattr__(self, attr):
if attr == 'type':
self.type = 'prot' if isprot(self.seq) else 'nucl'
return self.type
raise AttributeError('%r object has no attribute %r' %
(self.__class__.__name__, attr))
def __getitem__(self, key):
'''
sequence[i] or sequence[start:end] or sequence[start:end:step]
constructs a new Sequence that is a subsequence as described by the
way this function is called. This function will automatically fill in
the name, sequence, start, stop, step, original, and type of the
subsequence. It also tries to fill in the annotations, but annotations
are handled pretty poorly right now, so it's probably best not to
worry about those, but it will work if you really want to.
'''
try:
start, stop, step = key.indices(len(self.seq))
except AttributeError:
start, stop, step = key, key + 1, 1
order = abs(self.step) / self.step
r = stop - (stop - start) % step - step
seq = ''.join(self.seq[x] for x in xrange(start, stop, step))
qual = self.qual and [self.qual[x] for x in xrange(start, stop, step)]
info = (self.name, start, stop, step)
self.type
return Sequence("subsequence(%s, %d, %d, %d)" % info, seq,
qual=qual, original=self.original, type=self.type,
start=self.start + start * order,
end=self.start + r * order,
step=step * self.step)
'''
Some other things you can do with a Sequence object:
* len(sequence) => gives the length of the sequence.
* for character in sequence: => allows you to loop over each character in
the sequence.
* dictionary[sequence] => allows sequences to be used as keys for
dictionaries and allows you to have sequences in sets. This relies on
the test seqA == seqB, described next.
* seqA == seqB => compare two sequences. The sequences are the same if
they have the same sequence AND name. Therefore, two sequences with
different names are treated as separate items in a set and separate
keys in a dictionary. If you need to match only the sequence, use
seqA.seq == seqB.seq.
* print sequence => print a fasta / fastq (depending on whether there are
any quality scores) representation of the sequence. Sequence objects
in any other data structure (e.g., list, dictionary) are printed as
(e.g., <Sequence 0x000000>). If you want to change that, you can do:
def __repr__(self):
return self.__str__()
'''
def upper(self):
return Sequence(self.name, self.seq.upper(), type=self.type,
qual=self.qual, original=self.original,
defline=self.defline, start=self.start, step=self.step,
end=self.end)
def __iter__(self):
for c in self.seq:
yield c
raise StopIteration()
def __len__(self):
return len(self.seq)
def __hash__(self):
return hash(self.seq)
def __eq__(self, other):
try:
return self.seq == other.seq and self.name == other.name
except AttributeError:
return (self.seq == other)
def __str__(self):
if self.qual:
return '@%s\n%s\n+\n%s' % (self.name, self.seq,
''.join(chr(ord('A') - 1 + q)
for q in self.qual))
else:
return '>%s %s\n%s' % (self.name, self.defline,
'\n'.join(chop(self.seq, 70)))
def annotation(seq, source, type, **kwargs):
'''
Creates an `Annotation` object for the given sequence from a source
(e.g., "phytozome7.0") of a particular type (e.g., "gene").
'''
try:
sname = source.name
except AttributeError:
sname = source
start, end = min(seq.start, seq.end), max(seq.start, seq.end)
strand = '+' if seq.step == 1 else '-'
attrs = ';'.join('%s=%s' % (key, str(kwargs[key])) for key in kwargs)
return Annotation(seq.original.name, sname, type, start, end, '.',
strand, start % 3, attrs)
|