1
2
3
4
5
6
7
8 import string, array
9
10 import Alphabet
11 from Alphabet import IUPAC
12 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
13 from Bio.Data import CodonTable
14
23
25 """Returns a (truncated) representation of the sequence for debugging."""
26 if len(self) > 60 :
27
28
29
30 return "%s('%s...%s', %s)" % (self.__class__.__name__,
31 self.data[:54], self.data[-3:],
32 repr(self.alphabet))
33 else :
34 return "%s(%s, %s)" % (self.__class__.__name__,
35 repr(self.data),
36 repr(self.alphabet))
38 """Returns the full sequence as a python string.
39
40 Note that Biopython 1.44 and earlier would give a truncated
41 version of repr(my_seq) for str(my_seq). If you are writing code
42 which need to be backwards compatible with old Biopython, you
43 should continue to use my_seq.tostring() rather than str(my_seq)
44 """
45 return self.data
46
47
48
49
50
51
52
53
55
57
58
59
60 if isinstance(index, int) :
61
62 return self.data[index]
63 else :
64
65 return Seq(self.data[index], self.alphabet)
66
77
86
87
89 """Returns the full sequence as a python string.
90
91 Although not formally deprecated, you are now encouraged to use
92 str(my_seq) instead of my_seq.tostring()."""
93 return self.data
94
96 return MutableSeq(self.data, self.alphabet)
97
98 - def count(self, sub, start=None, end=None):
99 """Count method, like that of a python string.
100
101 Return an integer, the number of occurrences of substring
102 argument sub in the (sub)sequence given by [start:end].
103 Optional arguments start and end are interpreted as in slice
104 notation.
105
106 sub - a string or another Seq object to look for
107 start - optional integer, slice start
108 end - optional integer, slice end
109
110 e.g.
111 from Bio.Seq import Seq
112 my_seq = Seq("AAAATGA")
113 print my_seq.count("A")
114 print my_seq.count("ATG")
115 print my_seq.count(Seq("AT"))
116 print my_seq.count("AT", 2, -1)
117 """
118 try :
119
120
121
122 search = sub.tostring()
123 except AttributeError:
124
125 search = sub
126
127
128 if start is None and end is None :
129 return self.data.count(search)
130 elif end is None :
131 return self.data.count(search, start)
132 else :
133 return self.data.count(search, start, end)
134
136 """Seq.__maketrans(alphabet) -> translation table.
137
138 Return a translation table for use with complement()
139 and reverse_complement().
140
141 Compatible with lower case and upper case sequences.
142
143 alphabet is a dictionary as implement in Data.IUPACData
144
145 For internal use only.
146 """
147 before = ''.join(alphabet.keys())
148 after = ''.join(alphabet.values())
149 before = before + before.lower()
150 after = after + after.lower()
151 return string.maketrans(before, after)
152
171
173 """Returns the reverse complement sequence. New Seq object.
174 """
175
176 return self.complement()[::-1]
177
186 """Returns a (truncated) representation of the sequence for debugging."""
187 if len(self) > 60 :
188
189
190
191 return "%s('%s...%s', %s)" % (self.__class__.__name__,
192 str(self[:54]), str(self[-3:]),
193 repr(self.alphabet))
194 else :
195 return "%s('%s', %s)" % (self.__class__.__name__,
196 str(self),
197 repr(self.alphabet))
198
200 """Returns the full sequence as a python string.
201
202 Note that Biopython 1.44 and earlier would give a truncated
203 version of repr(my_seq) for str(my_seq). If you are writing code
204 which need to be backwards compatible with old Biopython, you
205 should continue to use my_seq.tostring() rather than str(my_seq).
206 """
207 return "".join(self.data)
208
224
226
237
253
261
278
283 - def pop(self, i = (-1)):
284 c = self.data[i]
285 del self.data[i]
286 return c
288 for i in range(len(self.data)):
289 if self.data[i] == item:
290 del self.data[i]
291 return
292 raise ValueError, "MutableSeq.remove(x): x not in list"
293
294 - def count(self, sub, start=None, end=None):
295 """Count method, like that of a python string.
296
297 Return an integer, the number of occurrences of substring
298 argument sub in the (sub)sequence given by [start:end].
299 Optional arguments start and end are interpreted as in slice
300 notation.
301
302 sub - a string or another Seq object to look for
303 start - optional integer, slice start
304 end - optional integer, slice end
305
306 e.g.
307 from Bio.Seq import MutableSeq
308 my_mseq = MutableSeq("AAAATGA")
309 print my_mseq.count("A")
310 print my_mseq.count("ATG")
311 print my_mseq.count(Seq("AT"))
312 print my_mseq.count("AT", 2, -1)
313 """
314 if len(sub) == 1 :
315
316 try :
317
318
319 letter = sub.tostring()
320 except AttributeError :
321 letter = sub
322
323 count = 0
324
325 if start is None and end is None :
326 for c in self.data:
327 if c == letter: count += 1
328 elif end is None :
329 for c in self.data[start:]:
330 if c == letter: count += 1
331 else :
332 for c in self.data[start:end]:
333 if c == letter: count += 1
334 return count
335 else :
336
337
338 return self.toseq().count(sub, start, end)
339
341 for i in range(len(self.data)):
342 if self.data[i] == item:
343 return i
344 raise ValueError, "MutableSeq.index(x): x not in list"
345
347 """Modify the MutableSequence to reverse itself.
348
349 No return value"""
350 self.data.reverse()
351
370
372 """Modify the MutableSequence to take on its reverse complement.
373
374 No return value"""
375 if isinstance(self.alphabet, Alphabet.ProteinAlphabet) :
376 raise ValueError, "Proteins do not have complements!"
377 self.complement()
378 self.data.reverse()
379
380
381
382
390
392 """Returns the full sequence as a python string.
393
394 Although not formally deprecated, you are now encouraged to use
395 str(my_seq) instead of my_seq.tostring()."""
396 return "".join(self.data)
397
399 """Returns the full sequence as a new immutable Seq object"""
400 return Seq("".join(self.data), self.alphabet)
401
402
403
404
405
406
408 """Transcribes a DNA sequence into RNA.
409
410 If given a string, returns a new string object.
411 Given a Seq or MutableSeq, returns a new Seq object with the same alphabet.
412 """
413 if isinstance(dna, Seq) or isinstance(dna, MutableSeq):
414 if isinstance(dna.alphabet, Alphabet.ProteinAlphabet) :
415 raise ValueError, "Proteins cannot be transcribed!"
416
417 rna = dna.tostring().replace('T','U').replace('t','u')
418 if dna.alphabet==IUPAC.unambiguous_dna:
419 alphabet = IUPAC.unambiguous_rna
420 elif dna.alphabet==IUPAC.ambiguous_dna:
421 alphabet = IUPAC.ambiguous_rna
422 else:
423 alphabet = Alphabet.generic_rna
424 return Seq(rna, alphabet)
425 else:
426 rna = dna.replace('T','U').replace('t','u')
427 return rna
428
429
431 """Back-transcribes an RNA sequence into DNA.
432
433 If given a string, returns a new string object.
434 Given a Seq or MutableSeq, returns a new Seq object with the same alphabet.
435 """
436 if isinstance(rna, Seq) or isinstance(rna, MutableSeq):
437 if isinstance(rna.alphabet, Alphabet.ProteinAlphabet) :
438 raise ValueError, "Proteins cannot be (back)transcribed!"
439
440 dna = rna.data.replace('U','T').replace('u','t')
441 if rna.alphabet==IUPAC.unambiguous_rna:
442 alphabet = IUPAC.unambiguous_dna
443 elif rna.alphabet==IUPAC.ambiguous_rna:
444 alphabet = IUPAC.ambiguous_dna
445 else:
446 alphabet = Alphabet.generic_dna
447 return Seq(dna, alphabet)
448 else:
449 dna = rna.replace('U','T').replace('u','t')
450 return dna
451
452
453 -def translate(sequence, table = "Standard", stop_symbol = "*"):
454 """Translate a nucleotide sequence into amino acids.
455
456 If given a string, returns a new string object.
457 Given a Seq or MutableSeq, returns a Seq object.
458
459 table - Which codon table to use? This can be either a name
460 (string) or an identifier (integer)
461
462 NOTE - Does NOT support ambiguous nucleotide sequences which
463 could code for a stop codon. This will throw a TranslationError.
464
465 NOTE - Does NOT support gapped sequences.
466
467 It will however translate either DNA or RNA."""
468 try:
469 id = int(table)
470 except:
471 id = None
472 if isinstance(sequence, Seq) or isinstance(sequence, MutableSeq):
473 if isinstance(sequence.alphabet, Alphabet.ProteinAlphabet) :
474 raise ValueError, "Proteins cannot be translated!"
475 if sequence.alphabet==IUPAC.unambiguous_dna:
476 if id==None:
477 table = CodonTable.unambiguous_dna_by_name[table]
478 else:
479 table = CodonTable.unambiguous_dna_by_id[id]
480 elif sequence.alphabet==IUPAC.ambiguous_dna:
481 if id==None:
482 table = CodonTable.ambiguous_dna_by_name[table]
483 else:
484 table = CodonTable.ambiguous_dna_by_id[id]
485 elif sequence.alphabet==IUPAC.unambiguous_rna:
486 if id==None:
487 table = CodonTable.unambiguous_rna_by_name[table]
488 else:
489 table = CodonTable.unambiguous_rna_by_id[id]
490 elif sequence.alphabet==IUPAC.ambiguous_rna:
491 if id==None:
492 table = CodonTable.ambiguous_rna_by_name[table]
493 else:
494 table = CodonTable.ambiguous_rna_by_id[id]
495 else:
496 if id==None:
497 table = CodonTable.ambiguous_generic_by_name[table]
498 else:
499 table = CodonTable.ambiguous_generic_by_id[id]
500 sequence = sequence.tostring().upper()
501 n = len(sequence)
502 get = table.forward_table.get
503 protein = [get(sequence[i:i+3], stop_symbol) for i in xrange(0,n-n%3,3)]
504 protein = "".join(protein)
505 if stop_symbol in protein :
506 alphabet = Alphabet.HasStopCodon(table.protein_alphabet,
507 stop_symbol = stop_symbol)
508 else :
509 alphabet = table.protein_alphabet
510 return Seq(protein, alphabet)
511 else:
512 if id==None:
513 table = CodonTable.ambiguous_generic_by_name[table]
514 else:
515 table = CodonTable.ambiguous_generic_by_id[id]
516 get = table.forward_table.get
517 sequence = sequence.upper()
518 n = len(sequence)
519 protein = [get(sequence[i:i+3], stop_symbol) for i in xrange(0,n-n%3,3)]
520 protein = "".join(protein)
521 return protein
522
523
544
545 if __name__ == "__main__" :
546 print "Quick self test"
547 from Bio.Data.IUPACData import ambiguous_dna_values, ambiguous_rna_values
548 from Bio.Alphabet import generic_dna, generic_rna
549 from sets import Set
550 print ambiguous_dna_complement
551 for ambig_char, values in ambiguous_dna_values.iteritems() :
552 compl_values = reverse_complement(values)[::-1]
553 print "%s={%s} --> {%s}=%s" % \
554 (ambig_char, values, compl_values, ambiguous_dna_complement[ambig_char])
555 assert Set(compl_values) == Set(ambiguous_dna_values[ambiguous_dna_complement[ambig_char]])
556
557 for s in ["".join(ambiguous_dna_values),
558 Seq("".join(ambiguous_dna_values)),
559 Seq("".join(ambiguous_dna_values), generic_dna),
560 "".join(ambiguous_rna_values),
561 Seq("".join(ambiguous_rna_values)),
562 Seq("".join(ambiguous_dna_values), generic_rna)]:
563 print "%s -> %s [RC]" % (repr(s), repr(reverse_complement(s)))
564 print "%s -> %s [RNA]" % (repr(s), repr(transcribe(s)))
565 print "%s -> %s [DNA]" % (repr(s), repr(back_transcribe(s)))
566
567
568 for letter in "ABCDEFGHIjklmnopqrstuvwyz" :
569 assert 1 == Seq(letter).count(letter)
570 assert 1 == MutableSeq(letter).count(letter)
571 my_str = "AAAATGACGGCGGCGGCT"
572 for my_obj in [my_str, Seq(my_str), MutableSeq(my_str)] :
573 assert 5 == my_obj.count("A")
574 assert 1 == my_obj.count("ATG")
575 assert 3 == my_obj.count("CG")
576 assert 2 == my_obj.count("A", 3, -5)
577 for my_obj in [Seq(my_str), MutableSeq(my_str)] :
578 assert 1 == my_obj.count(Seq("AT"))
579 assert 5 == my_obj.count(Seq("A"))
580 assert 3 == my_obj.count(Seq("CG"))
581 assert 2 == my_obj.count(Seq("A"), 3, -5)
582 assert 1 == my_obj.count(MutableSeq("AT"))
583 assert 5 == my_obj.count(MutableSeq("A"))
584 assert 3 == my_obj.count(MutableSeq("CG"))
585 assert 2 == my_obj.count(MutableSeq("A"), 3, -5)
586 for start in range(-len(my_str), len(my_str)) :
587 for end in range(-len(my_str), len(my_str)) :
588 c = my_str.count("A",start,end)
589 assert c == my_str[start:end].count("A")
590 assert c == my_obj.count("A",start,end)
591 assert c == my_obj[start:end].count("A")
592
593 assert my_str[start:end:-1].count("A") == my_obj[start:end:-1].count("A")
594
595 print repr(translate(Seq("GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGT",
596 IUPAC.unambiguous_dna)))
597 print repr(translate(Seq("GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG",
598 IUPAC.unambiguous_dna)))
599 print repr(translate(Seq("GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG",
600 IUPAC.unambiguous_dna), stop_symbol="@"))
601
602 ambig = Set(IUPAC.IUPACAmbiguousDNA.letters)
603 for c1 in ambig :
604 for c2 in ambig :
605 for c3 in ambig :
606 values = Set([translate(a+b+c, table=1) \
607 for a in ambiguous_dna_values[c1] \
608 for b in ambiguous_dna_values[c2] \
609 for c in ambiguous_dna_values[c3]])
610 try :
611 t = translate(c1+c2+c3)
612 except CodonTable.TranslationError :
613 assert "*" in values
614 continue
615 if t=="*" :
616 assert values == Set(["*"])
617 elif t=="X" :
618 assert len(values) > 1, \
619 "translate('%s') = '%s' not '%s'" \
620 % (c1+c2+c3, t, ",".join(values))
621 elif t=="Z" :
622 assert values == Set(("E", "Q"))
623 elif t=="B" :
624 assert values == Set(["D", "N"])
625 else :
626 assert values == Set(t)
627