1
2
3
4
5
6
7
8
9
10 import os, sys, getopt, re, time
11 from string import maketrans
12 from Bio import SeqIO
13 from Bio import Translate
14 from Bio.Seq import Seq
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17 from Bio.Data import IUPACData, CodonTable
18
19
20
21
22
23
24
26 """Reverse the sequence. Works on string sequences.
27 """
28 r = map(None, seq)
29 r.reverse()
30 return ''.join(r)
31
33 """ calculates G+C content """
34
35
36
37
38 d = {}
39 for nt in ['A','T','G','C','a','t','g','c','S','s']:
40 d[nt] = seq.count(nt)
41 gc = d.get('G',0) + d.get('C',0) + d.get('g',0) + d.get('c',0) + \
42 d.get('S',0) + d.get('s',0)
43
44 if gc == 0: return 0
45
46 return gc*100.0/len(seq)
47
49 " calculates total G+C content plus first, second and third position "
50
51 d= {}
52 for nt in ['A','T','G','C']:
53 d[nt] = [0,0,0]
54
55 for i in range(0,len(seq),3):
56 codon = seq[i:i+3]
57 if len(codon) <3: codon += ' '
58 for pos in range(0,3):
59 for nt in ['A','T','G','C']:
60 if codon[pos] == nt or codon[pos] == nt.lower():
61 d[nt][pos] = d[nt][pos] +1
62
63
64 gc = {}
65 gcall = 0
66 nall = 0
67 for i in range(0,3):
68 try:
69 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
70 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
71 except:
72 gc[i] = 0
73
74 gcall = gcall + d['G'][i] + d['C'][i]
75 nall = nall + n
76
77 gcall = 100.0*gcall/nall
78 return gcall, gc[0], gc[1], gc[2]
79
91
92
93
94
95
96
97
98
99 try:
100 from Tkinter import *
101 except ImportError:
102 pass
103 from math import pi, sin, cos, log
104 -def xGC_skew(seq, window = 1000, zoom = 100,
105 r = 300, px = 100, py = 100):
106 " calculates and plots normal and accumulated GC skew (GRAPHICS !!!) "
107
108
109 yscroll = Scrollbar(orient = VERTICAL)
110 xscroll = Scrollbar(orient = HORIZONTAL)
111 canvas = Canvas(yscrollcommand = yscroll.set,
112 xscrollcommand = xscroll.set, background = 'white')
113 win = canvas.winfo_toplevel()
114 win.geometry('700x700')
115
116 yscroll.config(command = canvas.yview)
117 xscroll.config(command = canvas.xview)
118 yscroll.pack(side = RIGHT, fill = Y)
119 xscroll.pack(side = BOTTOM, fill = X)
120 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
121 canvas.update()
122
123 X0, Y0 = r + px, r + py
124 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
125
126 ty = Y0
127 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
128 ty +=20
129 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
130 ty +=20
131 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
132 ty +=20
133 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
134 ty +=20
135 canvas.create_oval(x1,y1, x2, y2)
136
137 acc = 0
138 start = 0
139 for gc in GC_skew(seq, window):
140 r1 = r
141 acc+=gc
142
143 alpha = pi - (2*pi*start)/len(seq)
144 r2 = r1 - gc*zoom
145 x1 = X0 + r1 * sin(alpha)
146 y1 = Y0 + r1 * cos(alpha)
147 x2 = X0 + r2 * sin(alpha)
148 y2 = Y0 + r2 * cos(alpha)
149 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
150
151 r1 = r - 50
152 r2 = r1 - acc
153 x1 = X0 + r1 * sin(alpha)
154 y1 = Y0 + r1 * cos(alpha)
155 x2 = X0 + r2 * sin(alpha)
156 y2 = Y0 + r2 * cos(alpha)
157 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
158
159 canvas.update()
160 start = start + window
161
162
163 canvas.configure(scrollregion = canvas.bbox(ALL))
164
172
198
199
200
201
202
203
204
205
206
207
208
209 -class ProteinX(Alphabet.ProteinAlphabet):
211
212 proteinX = ProteinX()
213
217 - def get(self, codon, stop_symbol):
222
229
230
231
232
234 """
235 Method that returns the amino acid sequence as a
236 list of three letter codes. Output follows the IUPAC standard plus 'Ter' for
237 terminator. Any unknown character, including the default
238 unknown character 'X', is changed into 'Xaa'. A noncoded
239 aminoacid selenocystein is recognized (Sel, U).
240 """
241 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
242 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
243 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
244 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
245 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
246 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
247 'U':'Sel'
248 }
249
250 return ''.join([threecode.get(aa,'Xer') for aa in seq])
251
252
253
254
255
256
257
258
259
260 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
270
274
276 """
277 nice looking 6 frame translation with GC content - code from xbbtools
278 similar to DNA Striders six-frame translation
279 """
280 comp = complement(seq)
281 anti = reverse(comp)
282 length = len(seq)
283 frames = {}
284 for i in range(0,3):
285 frames[i+1] = translate(seq[i:], genetic_code)
286 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
287
288
289 if length > 20:
290 short = '%s ... %s' % (seq[:10], seq[-10:])
291 else:
292 short = seq
293 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
294 header = 'GC_Frame: %s, ' % date
295 for nt in ['a','t','g','c']:
296 header += '%s:%d ' % (nt, seq.count(nt.upper()))
297
298 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
299 res = header
300
301 for i in range(0,length,60):
302 subseq = seq[i:i+60]
303 csubseq = comp[i:i+60]
304 p = i/3
305 res = res + '%d/%d\n' % (i+1, i/3+1)
306 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
307 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
308 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
309
310 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
311 res = res + csubseq.lower() + '\n'
312
313 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
314 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
315 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
316 return res
317
318
319
320
321
322
323
324
347
349 " simple and FASTA reader, preferable to be used on large files "
350 txt = open(file).read()
351 entries = []
352 for entry in txt.split('>')[1:]:
353 name,seq= entry.split('\n',1)
354 seq = seq.replace('\n','').replace(' ','').upper()
355 entries.append((name, seq))
356
357 return entries
358
360 " apply function on each sequence in a multiple FASTA file "
361 try:
362 f = globals()[function]
363 except:
364 raise NotImplementedError, "%s not implemented" % function
365
366 handle = open(file, 'r')
367 records = SeqIO.parse(handle, "fasta")
368 results = []
369 for record in records:
370 arguments = [record.sequence]
371 for arg in args: arguments.append(arg)
372 result = f(*arguments)
373 if result:
374 results.append('>%s\n%s' % (record.title, result))
375 return results
376
378 " apply function on each sequence in a multiple FASTA file "
379 try:
380 f = globals()[function]
381 except:
382 raise NotImplementedError, "%s not implemented" % function
383
384 entries = quick_FASTA_reader(file)
385 results = []
386 for name, seq in entries:
387 arguments = [seq]
388 for arg in args: arguments.append(arg)
389 result = f(*arguments)
390 if result:
391 results.append('>%s\n%s' % (name, result))
392 return results
393
394
395
396
397
398
399
400
401 if __name__ == '__main__':
402
403 options = {'apply_on_multi_fasta':0,
404 'quick':0,
405 'uniq_ids':0,
406 }
407
408 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
409 'help', 'quick', 'uniq_ids', 'search='])
410 for arg in optlist:
411 if arg[0] in ['-h', '--help']:
412 pass
413 elif arg[0] in ['--describe']:
414
415 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
416 mol_funcs.sort()
417 print 'available functions:'
418 for f in mol_funcs: print '\t--%s' % f
419 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
420
421 sys.exit(0)
422 elif arg[0] in ['--apply_on_multi_fasta']:
423 options['apply_on_multi_fasta'] = arg[1]
424 elif arg[0] in ['--search']:
425 options['search'] = arg[1]
426 else:
427 key = re.search('-*(.+)', arg[0]).group(1)
428 options[key] = 1
429
430
431 if options.get('apply_on_multi_fasta'):
432 file = args[0]
433 function = options['apply_on_multi_fasta']
434 arguments = []
435 if options.get('search'):
436 arguments = options['search']
437 if function == 'xGC_skew':
438 arguments = 1000
439 if options.get('quick'):
440 results = quicker_apply_on_multi_fasta(file, function, arguments)
441 else:
442 results = apply_on_multi_fasta(file, function, arguments)
443 for result in results: print result
444
445 elif options.get('uniq_ids'):
446 file = args[0]
447 fasta_uniqids(file)
448
449
450