Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  import os, sys, getopt, re, time 
 11  from string import maketrans 
 12  from Bio import SeqIO 
 13  from Bio import Translate 
 14  from Bio.Seq import Seq 
 15  from Bio import Alphabet 
 16  from Bio.Alphabet import IUPAC 
 17  from Bio.Data import IUPACData, CodonTable 
 18   
 19   
 20  ###################################### 
 21  # DNA 
 22  ###################### 
 23  # {{{  
 24   
25 -def reverse(seq):
26 """Reverse the sequence. Works on string sequences. 27 """ 28 r = map(None, seq) 29 r.reverse() 30 return ''.join(r)
31
32 -def GC(seq):
33 """ calculates G+C content """ 34 # 19/8/03: Iddo: added provision for lowercase 35 # 19/8/03: Iddo: divide by the sequence's length rather than by the 36 # A+T+G+C number. In that way, make provision for N. 37 38 d = {} 39 for nt in ['A','T','G','C','a','t','g','c','S','s']: 40 d[nt] = seq.count(nt) 41 gc = d.get('G',0) + d.get('C',0) + d.get('g',0) + d.get('c',0) + \ 42 d.get('S',0) + d.get('s',0) 43 44 if gc == 0: return 0 45 # return gc*100.0/(d['A'] +d['T'] + gc) 46 return gc*100.0/len(seq)
47
48 -def GC123(seq):
49 " calculates total G+C content plus first, second and third position " 50 51 d= {} 52 for nt in ['A','T','G','C']: 53 d[nt] = [0,0,0] 54 55 for i in range(0,len(seq),3): 56 codon = seq[i:i+3] 57 if len(codon) <3: codon += ' ' 58 for pos in range(0,3): 59 for nt in ['A','T','G','C']: 60 if codon[pos] == nt or codon[pos] == nt.lower(): 61 d[nt][pos] = d[nt][pos] +1 62 63 64 gc = {} 65 gcall = 0 66 nall = 0 67 for i in range(0,3): 68 try: 69 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 70 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 71 except: 72 gc[i] = 0 73 74 gcall = gcall + d['G'][i] + d['C'][i] 75 nall = nall + n 76 77 gcall = 100.0*gcall/nall 78 return gcall, gc[0], gc[1], gc[2]
79
80 -def GC_skew(seq, window = 100):
81 """ calculates GC skew (G-C)/(G+C) """ 82 # 8/19/03: Iddo: added lowercase 83 values = [] 84 for i in range(0, len(seq), window): 85 s = seq[i: i + window] 86 g = s.count('G') + s.count('g') 87 c = s.count('C') + s.count('c') 88 skew = (g-c)/float(g+c) 89 values.append(skew) 90 return values
91 92 # 8/19/03 Iddo: moved these imports from within the function as 93 # ``import * '' is only 94 # allowed within the module 95 # Brad -- added an import check so you don't have to have Tkinter 96 # installed to use other sequtils functions. A little ugly but 97 # it really should be fixed by not using 'import *' which I'm not 98 # really excited about doing right now. 99 try: 100 from Tkinter import * 101 except ImportError: 102 pass 103 from math import pi, sin, cos, log
104 -def xGC_skew(seq, window = 1000, zoom = 100, 105 r = 300, px = 100, py = 100):
106 " calculates and plots normal and accumulated GC skew (GRAPHICS !!!) " 107 108 109 yscroll = Scrollbar(orient = VERTICAL) 110 xscroll = Scrollbar(orient = HORIZONTAL) 111 canvas = Canvas(yscrollcommand = yscroll.set, 112 xscrollcommand = xscroll.set, background = 'white') 113 win = canvas.winfo_toplevel() 114 win.geometry('700x700') 115 116 yscroll.config(command = canvas.yview) 117 xscroll.config(command = canvas.xview) 118 yscroll.pack(side = RIGHT, fill = Y) 119 xscroll.pack(side = BOTTOM, fill = X) 120 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 121 canvas.update() 122 123 X0, Y0 = r + px, r + py 124 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 125 126 ty = Y0 127 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 128 ty +=20 129 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 130 ty +=20 131 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 132 ty +=20 133 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 134 ty +=20 135 canvas.create_oval(x1,y1, x2, y2) 136 137 acc = 0 138 start = 0 139 for gc in GC_skew(seq, window): 140 r1 = r 141 acc+=gc 142 # GC skew 143 alpha = pi - (2*pi*start)/len(seq) 144 r2 = r1 - gc*zoom 145 x1 = X0 + r1 * sin(alpha) 146 y1 = Y0 + r1 * cos(alpha) 147 x2 = X0 + r2 * sin(alpha) 148 y2 = Y0 + r2 * cos(alpha) 149 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 150 # accumulated GC skew 151 r1 = r - 50 152 r2 = r1 - acc 153 x1 = X0 + r1 * sin(alpha) 154 y1 = Y0 + r1 * cos(alpha) 155 x2 = X0 + r2 * sin(alpha) 156 y2 = Y0 + r2 * cos(alpha) 157 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 158 159 canvas.update() 160 start = start + window 161 162 163 canvas.configure(scrollregion = canvas.bbox(ALL))
164
165 -def molecular_weight(seq):
166 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 167 weight_table = IUPACData.unambiguous_dna_weights 168 sum = 0 169 for x in seq: 170 sum += weight_table[x] 171 return sum
172
173 -def nt_search(seq, subseq):
174 """ search for a DNA subseq in sequence 175 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 176 searches only on forward strand 177 """ 178 pattern = '' 179 for nt in subseq: 180 value = IUPACData.ambiguous_dna_values[nt] 181 if len(value) == 1: 182 pattern += value 183 else: 184 pattern += '[%s]' % value 185 186 pos = -1 187 result = [pattern] 188 l = len(seq) 189 while 1: 190 pos+=1 191 s = seq[pos:] 192 m = re.search(pattern, s) 193 if not m: break 194 pos += int(m.start(0)) 195 result.append(pos) 196 197 return result
198 199 # }}} 200 201 ###################################### 202 # Protein 203 ###################### 204 # {{{ 205 206 # temporary hack for exception free translation of "dirty" DNA 207 # should be moved to ??? 208
209 -class ProteinX(Alphabet.ProteinAlphabet):
210 letters = IUPACData.extended_protein_letters + "X"
211 212 proteinX = ProteinX() 213
214 -class MissingTable:
215 - def __init__(self, table):
216 self._table = table
217 - def get(self, codon, stop_symbol):
218 try: 219 return self._table.get(codon, stop_symbol) 220 except CodonTable.TranslationError: 221 return 'X'
222
223 -def makeTableX(table):
224 assert table.protein_alphabet == IUPAC.extended_protein 225 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 226 MissingTable(table.forward_table), 227 table.back_table, table.start_codons, 228 table.stop_codons)
229 230 231 # end of hacks 232
233 -def seq3(seq):
234 """ 235 Method that returns the amino acid sequence as a 236 list of three letter codes. Output follows the IUPAC standard plus 'Ter' for 237 terminator. Any unknown character, including the default 238 unknown character 'X', is changed into 'Xaa'. A noncoded 239 aminoacid selenocystein is recognized (Sel, U). 240 """ 241 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 242 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 243 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 244 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 245 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 246 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 247 'U':'Sel' 248 } 249 250 return ''.join([threecode.get(aa,'Xer') for aa in seq])
251 252 253 # }}} 254 255 ###################################### 256 # Mixed ??? 257 ###################### 258 # {{{ 259
260 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
261 " translation of DNA in one of the six different reading frames " 262 if frame not in [1,2,3,-1,-2,-3]: 263 raise ValueError, 'invalid frame' 264 265 if not translator: 266 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code]) 267 translator = Translate.Translator(table) 268 269 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
270
271 -def GC_Frame(seq, genetic_code = 1):
272 " just an alias for six_frame_translations " 273 return six_frame_translations(seq, genetic_code)
274
275 -def six_frame_translations(seq, genetic_code = 1):
276 """ 277 nice looking 6 frame translation with GC content - code from xbbtools 278 similar to DNA Striders six-frame translation 279 """ 280 comp = complement(seq) 281 anti = reverse(comp) 282 length = len(seq) 283 frames = {} 284 for i in range(0,3): 285 frames[i+1] = translate(seq[i:], genetic_code) 286 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 287 288 # create header 289 if length > 20: 290 short = '%s ... %s' % (seq[:10], seq[-10:]) 291 else: 292 short = seq 293 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 294 header = 'GC_Frame: %s, ' % date 295 for nt in ['a','t','g','c']: 296 header += '%s:%d ' % (nt, seq.count(nt.upper())) 297 298 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 299 res = header 300 301 for i in range(0,length,60): 302 subseq = seq[i:i+60] 303 csubseq = comp[i:i+60] 304 p = i/3 305 res = res + '%d/%d\n' % (i+1, i/3+1) 306 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 307 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 308 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 309 # seq 310 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 311 res = res + csubseq.lower() + '\n' 312 # - frames 313 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 314 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 315 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 316 return res
317 318 # }}} 319 320 ###################################### 321 # FASTA file utilities 322 ###################### 323 # {{{ 324
325 -def fasta_uniqids(file):
326 " checks and changes the name/ID's to be unique identifiers by adding numbers " 327 dict = {} 328 txt = open(file).read() 329 entries = [] 330 for entry in txt.split('>')[1:]: 331 name, seq= entry.split('\n',1) 332 name = name.split()[0].split(',')[0] 333 334 if dict.has_key(name): 335 n = 1 336 while 1: 337 n = n + 1 338 _name = name + str(n) 339 if not dict.has_key(_name): 340 name = _name 341 break 342 343 dict[name] = seq 344 345 for name, seq in dict.items(): 346 print '>%s\n%s' % (name, seq)
347
348 -def quick_FASTA_reader(file):
349 " simple and FASTA reader, preferable to be used on large files " 350 txt = open(file).read() 351 entries = [] 352 for entry in txt.split('>')[1:]: 353 name,seq= entry.split('\n',1) 354 seq = seq.replace('\n','').replace(' ','').upper() 355 entries.append((name, seq)) 356 357 return entries
358
359 -def apply_on_multi_fasta(file, function, *args):
360 " apply function on each sequence in a multiple FASTA file " 361 try: 362 f = globals()[function] 363 except: 364 raise NotImplementedError, "%s not implemented" % function 365 366 handle = open(file, 'r') 367 records = SeqIO.parse(handle, "fasta") 368 results = [] 369 for record in records: 370 arguments = [record.sequence] 371 for arg in args: arguments.append(arg) 372 result = f(*arguments) 373 if result: 374 results.append('>%s\n%s' % (record.title, result)) 375 return results
376
377 -def quicker_apply_on_multi_fasta(file, function, *args):
378 " apply function on each sequence in a multiple FASTA file " 379 try: 380 f = globals()[function] 381 except: 382 raise NotImplementedError, "%s not implemented" % function 383 384 entries = quick_FASTA_reader(file) 385 results = [] 386 for name, seq in entries: 387 arguments = [seq] 388 for arg in args: arguments.append(arg) 389 result = f(*arguments) 390 if result: 391 results.append('>%s\n%s' % (name, result)) 392 return results
393 394 # }}} 395 396 ###################################### 397 # Main 398 ##################### 399 # {{{ 400 401 if __name__ == '__main__': 402 # crude command line options to use most functions directly on a FASTA file 403 options = {'apply_on_multi_fasta':0, 404 'quick':0, 405 'uniq_ids':0, 406 } 407 408 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 409 'help', 'quick', 'uniq_ids', 'search=']) 410 for arg in optlist: 411 if arg[0] in ['-h', '--help']: 412 pass 413 elif arg[0] in ['--describe']: 414 # get all new functions from this file 415 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 416 mol_funcs.sort() 417 print 'available functions:' 418 for f in mol_funcs: print '\t--%s' % f 419 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 420 421 sys.exit(0) 422 elif arg[0] in ['--apply_on_multi_fasta']: 423 options['apply_on_multi_fasta'] = arg[1] 424 elif arg[0] in ['--search']: 425 options['search'] = arg[1] 426 else: 427 key = re.search('-*(.+)', arg[0]).group(1) 428 options[key] = 1 429 430 431 if options.get('apply_on_multi_fasta'): 432 file = args[0] 433 function = options['apply_on_multi_fasta'] 434 arguments = [] 435 if options.get('search'): 436 arguments = options['search'] 437 if function == 'xGC_skew': 438 arguments = 1000 439 if options.get('quick'): 440 results = quicker_apply_on_multi_fasta(file, function, arguments) 441 else: 442 results = apply_on_multi_fasta(file, function, arguments) 443 for result in results: print result 444 445 elif options.get('uniq_ids'): 446 file = args[0] 447 fasta_uniqids(file) 448 449 # }}} 450