Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

  1  # Copyright 2000-2002 Brad Chapman. 
  2  # Copyright 2004-2005 by M de Hoon. 
  3  # Copyright 2007 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  import string, array 
  9   
 10  import Alphabet 
 11  from Alphabet import IUPAC 
 12  from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
 13  from Bio.Data import CodonTable 
 14   
15 -class Seq:
16 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
17 # Enforce string storage 18 assert (type(data) == type("") or # must use a string 19 type(data) == type(u"")) # but can be a unicode string 20 21 self.data = data # Seq API requirement 22 self.alphabet = alphabet # Seq API requirement
23
24 - def __repr__(self):
25 return "%s(%s, %s)" % (self.__class__.__name__, 26 repr(self.data), 27 repr(self.alphabet))
28 - def __str__(self):
29 if len(self.data) > 60: 30 s = repr(self.data[:60] + " ...") 31 else: 32 s = repr(self.data) 33 return "%s(%s, %s)" % (self.__class__.__name__, s, 34 repr(self.alphabet))
35 # I don't think I like this method... 36 ## def __cmp__(self, other): 37 ## if isinstance(other, Seq): 38 ## return cmp(self.data, other.data) 39 ## else: 40 ## return cmp(self.data, other) 41
42 - def __len__(self): return len(self.data) # Seq API requirement
43
44 - def __getitem__(self, index) : # Seq API requirement
45 #Note since Python 2.0, __getslice__ is deprecated 46 #and __getitem__ is used instead. 47 #See http://docs.python.org/ref/sequence-methods.html 48 if isinstance(index, int) : 49 #Return a single letter as a string 50 return self.data[index] 51 else : 52 #Return the (sub)sequence as another Seq object 53 return Seq(self.data[index], self.alphabet)
54
55 - def __add__(self, other):
56 if type(other) == type(' '): 57 return self.__class__(self.data + other, self.alphabet) 58 elif self.alphabet.contains(other.alphabet): 59 return self.__class__(self.data + other.data, self.alphabet) 60 elif other.alphabet.contains(self.alphabet): 61 return self.__class__(self.data + other.data, other.alphabet) 62 else: 63 raise TypeError, ("incompatable alphabets", str(self.alphabet), 64 str(other.alphabet))
65
66 - def __radd__(self, other):
67 if self.alphabet.contains(other.alphabet): 68 return self.__class__(other.data + self.data, self.alphabet) 69 elif other.alphabet.contains(self.alphabet): 70 return self.__class__(other.data + self.data, other.alphabet) 71 else: 72 raise TypeError, ("incompatable alphabets", str(self.alphabet), 73 str(other.alphabet))
74 75
76 - def tostring(self): # Seq API requirement
77 return self.data 78
79 - def tomutable(self): # Needed? Or use a function?
80 return MutableSeq(self.data, self.alphabet) 81
82 - def count(self, item):
83 return len([x for x in self.data if x == item])
84
85 - def __maketrans(self, alphabet) :
86 """Seq.__maketrans(alphabet) -> translation table. 87 88 Return a translation table for use with complement() 89 and reverse_complement(). 90 91 Compatible with lower case and upper case sequences. 92 93 alphabet is a dictionary as implement in Data.IUPACData 94 95 For internal use only. 96 """ 97 before = ''.join(alphabet.keys()) 98 after = ''.join(alphabet.values()) 99 before = before + before.lower() 100 after = after + after.lower() 101 return string.maketrans(before, after)
102
103 - def complement(self):
104 """Returns the complement sequence. New Seq object. 105 """ 106 if isinstance(self.alphabet, Alphabet.ProteinAlphabet) : 107 raise ValueError, "Proteins do not have complements!" 108 if isinstance(self.alphabet, Alphabet.DNAAlphabet) : 109 d = ambiguous_dna_complement 110 elif isinstance(self.alphabet, Alphabet.RNAAlphabet) : 111 d = ambiguous_rna_complement 112 elif 'U' in self.data: 113 d = ambiguous_rna_complement 114 else: 115 d = ambiguous_dna_complement 116 ttable = self.__maketrans(d) 117 #Much faster on really long sequences than the previous loop based one. 118 #thx to Michael Palmer, University of Waterloo 119 s = self.data.translate(ttable) 120 return Seq(s, self.alphabet)
121
122 - def reverse_complement(self):
123 """Returns the reverse complement sequence. New Seq object. 124 """ 125 #Use -1 stride to reverse the complement 126 return self.complement()[::-1]
127
128 -class MutableSeq:
129 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
130 if type(data) == type(""): 131 self.data = array.array("c", data) 132 else: 133 self.data = data # assumes the input is an array 134 self.alphabet = alphabet
135 - def __repr__(self):
136 return "%s(%s, %s)" % (self.__class__.__name__, 137 repr(self.data), 138 repr(self.alphabet))
139
140 - def __str__(self):
141 if len(self.data) > 60: 142 s = repr(string.join(self.data[:60], "") + " ...") 143 else: 144 s = repr(string.join(self.data, "")) 145 return "%s(%s, %s)" % (self.__class__.__name__, s, 146 repr(self.alphabet))
147 - def __cmp__(self, other):
148 if isinstance(other, MutableSeq): 149 x = cmp(self.alphabet, other.alphabet) 150 if x == 0: 151 return cmp(self.data, other.data) 152 return x 153 elif type(other) == type(""): 154 return cmp(self.data.tostring(), other) 155 elif isinstance(other, Seq): 156 x = cmp(self.alphabet, other.alphabet) 157 if x == 0: 158 return cmp(self.data.tostring(), other.data) 159 return x 160 else: 161 return cmp(self.data, other)
162
163 - def __len__(self): return len(self.data)
164
165 - def __getitem__(self, index) :
166 #Note since Python 2.0, __getslice__ is deprecated 167 #and __getitem__ is used instead. 168 #See http://docs.python.org/ref/sequence-methods.html 169 if isinstance(index, int) : 170 #Return a single letter as a string 171 return self.data[index] 172 else : 173 #Return the (sub)sequence as another Seq object 174 return MutableSeq(self.data[index], self.alphabet)
175
176 - def __setitem__(self, index, value):
177 #Note since Python 2.0, __setslice__ is deprecated 178 #and __setitem__ is used instead. 179 #See http://docs.python.org/ref/sequence-methods.html 180 if isinstance(index, int) : 181 #Replacing a single letter with a new string 182 self.data[index] = value 183 else : 184 #Replacing a sub-sequence 185 if isinstance(value, MutableSeq): 186 self.data[index] = value.data 187 elif isinstance(value, type(self.data)): 188 self.data[index] = value 189 else: 190 self.data[index] = array.array("c", str(value))
191
192 - def __delitem__(self, index):
193 #Note since Python 2.0, __delslice__ is deprecated 194 #and __delitem__ is used instead. 195 #See http://docs.python.org/ref/sequence-methods.html 196 197 #Could be deleting a single letter, or a slice 198 del self.data[index]
199
200 - def __add__(self, other):
201 if self.alphabet.contains(other.alphabet): 202 return self.__class__(self.data + other.data, self.alphabet) 203 elif other.alphabet.contains(self.alphabet): 204 return self.__class__(self.data + other.data, other.alphabet) 205 else: 206 raise TypeError, ("incompatable alphabets", str(self.alphabet), 207 str(other.alphabet))
208 - def __radd__(self, other):
209 if self.alphabet.contains(other.alphabet): 210 return self.__class__(other.data + self.data, self.alphabet) 211 elif other.alphabet.contains(self.alphabet): 212 return self.__class__(other.data + self.data, other.alphabet) 213 else: 214 raise TypeError, ("incompatable alphabets", str(self.alphabet), 215 str(other.alphabet))
216
217 - def append(self, c):
218 self.data.append(c)
219 - def insert(self, i, c):
220 self.data.insert(i, c)
221 - def pop(self, i = (-1)):
222 c = self.data[i] 223 del self.data[i] 224 return c
225 - def remove(self, item):
226 for i in range(len(self.data)): 227 if self.data[i] == item: 228 del self.data[i] 229 return 230 raise ValueError, "MutableSeq.remove(x): x not in list"
231 - def count(self, item):
232 count = 0 233 for c in self.data: 234 if c == item: 235 count = count + 1 236 return count
237 - def index(self, item):
238 for i in range(len(self.data)): 239 if self.data[i] == item: 240 return i 241 raise ValueError, "MutableSeq.index(x): x not in list"
242
243 - def reverse(self):
244 """Modify the MutableSequence to reverse itself 245 246 No return value""" 247 self.data.reverse()
248
249 - def complement(self):
250 """Modify the MutableSequence to take on its complement 251 252 No return value""" 253 if isinstance(self.alphabet, Alphabet.ProteinAlphabet) : 254 raise ValueError, "Proteins do not have complements!" 255 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 256 d = ambiguous_dna_complement 257 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 258 d = ambiguous_rna_complement 259 elif 'U' in self.data: 260 d = ambiguous_rna_complement 261 else: 262 d = ambiguous_dna_complement 263 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()]) 264 d.update(c) 265 self.data = map(lambda c: d[c], self.data) 266 self.data = array.array('c', self.data)
267
268 - def reverse_complement(self):
269 """Modify the MutableSequence to take on its reverse complement. 270 271 No return value""" 272 if isinstance(self.alphabet, Alphabet.ProteinAlphabet) : 273 raise ValueError, "Proteins do not have complements!" 274 self.complement() 275 self.data.reverse()
276 277 ## Sorting a sequence makes no sense. 278 # def sort(self, *args): self.data.sort(*args) 279
280 - def extend(self, other):
281 if isinstance(other, MutableSeq): 282 for c in other.data: 283 self.data.append(c) 284 else: 285 for c in other: 286 self.data.append(c)
287
288 - def tostring(self):
289 return string.join(self.data, "")
290
291 - def toseq(self):
292 return Seq(string.join(self.data, ""), self.alphabet)
293 294 295 # The transcribe, backward_transcribe, and translate functions are 296 # user-friendly versions of the corresponding functions in Bio.Transcribe 297 # and Bio.Translate. The functions work both on Seq objects, and on strings. 298
299 -def transcribe(dna):
300 """Transcribes a DNA sequence into RNA. 301 302 If given a string, returns a new string object. 303 Given a Seq or MutableSeq, returns a new Seq object with the same alphabet. 304 """ 305 if isinstance(dna, Seq) or isinstance(dna, MutableSeq): 306 if isinstance(dna.alphabet, Alphabet.ProteinAlphabet) : 307 raise ValueError, "Proteins cannot be transcribed!" 308 #TODO - Raise an error if already is RNA alphabet? 309 rna = dna.tostring().replace('T','U').replace('t','u') 310 if dna.alphabet==IUPAC.unambiguous_dna: 311 alphabet = IUPAC.unambiguous_rna 312 elif dna.alphabet==IUPAC.ambiguous_dna: 313 alphabet = IUPAC.ambiguous_rna 314 else: 315 alphabet = Alphabet.generic_rna 316 return Seq(rna, alphabet) 317 else: 318 rna = dna.replace('T','U').replace('t','u') 319 return rna
320 321
322 -def back_transcribe(rna):
323 """Back-transcribes an RNA sequence into DNA. 324 325 If given a string, returns a new string object. 326 Given a Seq or MutableSeq, returns a new Seq object with the same alphabet. 327 """ 328 if isinstance(rna, Seq) or isinstance(rna, MutableSeq): 329 if isinstance(rna.alphabet, Alphabet.ProteinAlphabet) : 330 raise ValueError, "Proteins cannot be (back)transcribed!" 331 #TODO - Raise an error if already is DNA alphabet? 332 dna = rna.data.replace('U','T').replace('u','t') 333 if rna.alphabet==IUPAC.unambiguous_rna: 334 alphabet = IUPAC.unambiguous_dna 335 elif rna.alphabet==IUPAC.ambiguous_rna: 336 alphabet = IUPAC.ambiguous_dna 337 else: 338 alphabet = Alphabet.generic_dna 339 return Seq(dna, alphabet) 340 else: 341 dna = rna.replace('U','T').replace('u','t') 342 return dna
343 344
345 -def translate(sequence, table = "Standard", stop_symbol = "*"):
346 """Translate a nucleotide sequence into amino acids. 347 348 If given a string, returns a new string object. 349 Given a Seq or MutableSeq, returns a Seq object. 350 351 table - Which codon table to use? This can be either a name 352 (string) or an identifier (integer) 353 354 NOTE - Does NOT support unambiguous nucleotide sequences 355 It will however translate either DNA or RNA.""" 356 try: 357 id = int(table) 358 except: 359 id = None 360 if isinstance(sequence, Seq) or isinstance(sequence, MutableSeq): 361 if isinstance(sequence.alphabet, Alphabet.ProteinAlphabet) : 362 raise ValueError, "Proteins cannot be translated!" 363 if sequence.alphabet==IUPAC.unambiguous_dna: 364 if id==None: 365 table = CodonTable.unambiguous_dna_by_name[table] 366 else: 367 table = CodonTable.unambiguous_dna_by_id[id] 368 elif sequence.alphabet==IUPAC.ambiguous_dna: 369 if id==None: 370 table = CodonTable.ambiguous_dna_by_name[table] 371 else: 372 table = CodonTable.ambiguous_dna_by_id[id] 373 elif sequence.alphabet==IUPAC.unambiguous_rna: 374 if id==None: 375 table = CodonTable.unambiguous_rna_by_name[table] 376 else: 377 table = CodonTable.unambiguous_rna_by_id[id] 378 elif sequence.alphabet==IUPAC.ambiguous_rna: 379 if id==None: 380 table = CodonTable.ambiguous_rna_by_name[table] 381 else: 382 table = CodonTable.ambiguous_rna_by_id[id] 383 else: 384 if id==None: 385 table = CodonTable.generic_by_name[table] 386 else: 387 table = CodonTable.generic_by_id[id] 388 sequence = sequence.tostring().upper() 389 n = len(sequence) 390 get = table.forward_table.get 391 protein = [get(sequence[i:i+3], stop_symbol) for i in xrange(0,n-n%3,3)] 392 protein = "".join(protein) 393 alphabet = Alphabet.HasStopCodon(table.protein_alphabet) 394 return Seq(protein, alphabet) 395 else: 396 if id==None: 397 table = CodonTable.generic_by_name[table] 398 else: 399 table = CodonTable.generic_by_id[id] 400 get = table.forward_table.get 401 sequence = sequence.upper() 402 n = len(sequence) 403 protein = [get(sequence[i:i+3], stop_symbol) for i in xrange(0,n-n%3,3)] 404 protein = "".join(protein) 405 return protein
406 407
408 -def reverse_complement(sequence):
409 """Returns the reverse complement sequence of a nucleotide string. 410 411 If given a string, returns a new string object. 412 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 413 414 Supports unambiguous nucleotide sequences 415 """ 416 if isinstance(sequence, Seq) : 417 #Return a Seq 418 return sequence.reverse_complement() 419 elif isinstance(sequence, MutableSeq) : 420 #Return a Seq 421 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 422 return sequence.toseq().reverse_complement() 423 else : 424 #Assume its a string, turn it into a Seq, 425 #do the reverse complement, and turn this back to a string 426 #TODO - Find a more efficient way to do this without code duplication? 427 return Seq(sequence).reverse_complement().tostring()
428 429 if __name__ == "__main__" : 430 print "Quick self test" 431 from Bio.Data.IUPACData import ambiguous_dna_values, ambiguous_rna_values# 432 from Bio.Alphabet import generic_dna, generic_rna 433 from sets import Set 434 print ambiguous_dna_complement 435 for ambig_char, values in ambiguous_dna_values.iteritems() : 436 compl_values = reverse_complement(values)[::-1] 437 print "%s={%s} --> {%s}=%s" % \ 438 (ambig_char, values, compl_values, ambiguous_dna_complement[ambig_char]) 439 assert Set(compl_values) == Set(ambiguous_dna_values[ambiguous_dna_complement[ambig_char]]) 440 441 for s in ["".join(ambiguous_dna_values), 442 Seq("".join(ambiguous_dna_values)), 443 Seq("".join(ambiguous_dna_values), generic_dna), 444 "".join(ambiguous_rna_values), 445 Seq("".join(ambiguous_rna_values)), 446 Seq("".join(ambiguous_dna_values), generic_rna)]: 447 print "%s -> %s [RC]" % (repr(s), repr(reverse_complement(s))) 448 print "%s -> %s [RNA]" % (repr(s), repr(transcribe(s))) 449 print "%s -> %s [DNA]" % (repr(s), repr(back_transcribe(s))) 450