1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 """Convert a serie of Rebase files into a Restriction_Dictionary.py module.
30
31 The Rebase files are in the emboss format :
32
33 emboss_e.### -> contains informations about the restriction sites.
34 emboss_r.### -> contains general informations about the enzymes.
35 emboss_s.### -> contains informations about the suppliers.
36
37 ### is a 3 digit number. The first digit is the year and the two last the month.
38 """
39
40 import sre
41 import os
42 import itertools
43 import pprint
44 import time
45 import sys
46 import site
47 import shutil
48
49 from Bio.Seq import Seq
50 from Bio.Alphabet import IUPAC
51
52 import Bio.Restriction.Restriction
53 from Bio.Restriction.Restriction import AbstractCut, RestrictionType, NoCut, OneCut,\
54 TwoCuts, Meth_Dep, Meth_Undep, Palindromic, NonPalindromic, Unknown, Blunt,\
55 Ov5, Ov3, NotDefined, Defined, Ambiguous, Commercially_available, Not_available
56
57 import Bio.Restriction.RanaConfig as config
58 from Bio.Restriction._Update.Update import RebaseUpdate
59 from Bio.Restriction.Restriction import *
60 from Bio.Restriction.DNAUtils import antiparallel
61
62 DNA=Seq
63 dna_alphabet = {'A':'A', 'C':'C', 'G':'G', 'T':'T',
64 'R':'AG', 'Y':'CT', 'W':'AT', 'S':'CG', 'M':'AC', 'K':'GT',
65 'H':'ACT', 'B':'CGT', 'V':'ACG', 'D':'AGT',
66 'N':'ACGT',
67 'a': 'a', 'c': 'c', 'g': 'g', 't': 't',
68 'r':'ag', 'y':'ct', 'w':'at', 's':'cg', 'm':'ac', 'k':'gt',
69 'h':'act', 'b':'cgt', 'v':'acg', 'd':'agt',
70 'n':'acgt'}
71
72
73 complement_alphabet = {'A':'T', 'T':'A', 'C':'G', 'G':'C','R':'Y', 'Y':'R',
74 'W':'W', 'S':'S', 'M':'K', 'K':'M', 'H':'D', 'D':'H',
75 'B':'V', 'V':'B', 'N':'N','a':'t', 'c':'g', 'g':'c',
76 't':'a', 'r':'y', 'y':'r', 'w':'w', 's':'s','m':'k',
77 'k':'m', 'h':'d', 'd':'h', 'b':'v', 'v':'b', 'n':'n'}
78 enzymedict = {}
79 suppliersdict = {}
80 classdict = {}
81 typedict = {}
82
83
85 """Exception for dealing with overhang."""
86 pass
87
89 """BaseExpand(base) -> string.
90
91 given a degenerated base, returns its meaning in IUPAC alphabet.
92
93 i.e:
94 b= 'A' -> 'A'
95 b= 'N' -> 'ACGT'
96 etc..."""
97 base = base.upper()
98 return dna_alphabet[base]
99
101 """regex(site) -> string.
102
103 Construct a regular expression from a DNA sequence.
104 i.e. :
105 site = 'ABCGN' -> 'A[CGT]CG.'"""
106 reg_ex = site
107 for base in reg_ex :
108 if base in ('A', 'T', 'C', 'G', 'a', 'c', 'g', 't') :
109 pass
110 if base in ('N', 'n') :
111 reg_ex = '.'.join(reg_ex.split('N'))
112 reg_ex = '.'.join(reg_ex.split('n'))
113 if base in ('R', 'Y', 'W', 'M', 'S', 'K', 'H', 'D', 'B', 'V') :
114 expand = '['+ str(BaseExpand(base))+']'
115 reg_ex = expand.join(reg_ex.split(base))
116 return reg_ex
117
119 """Antiparallel(sequence) -> string.
120
121 returns a string which represents the reverse complementary strand of
122 a DNA sequence."""
123 return antiparallel(sequence.tostring())
124
126 """is_palindrom(sequence) -> bool.
127
128 True is the sequence is a palindrom.
129 sequence is a DNA object."""
130 return sequence == DNA(Antiparallel(sequence))
131
133 """LocalTime() -> string.
134
135 LocalTime calculate the extension for emboss file for the current year and
136 month."""
137 t = time.gmtime()
138 year = str(t.tm_year)[-1]
139 month = str(t.tm_mon)
140 if len(month) == 1 : month = '0'+month
141 return year+month
142
143
145 """construct the attributes of the enzyme corresponding to 'name'."""
147 cls.opt_temp = 37
148 cls.inact_temp = 65
149 cls.substrat = 'DNA'
150 target = enzymedict[name]
151 cls.site = target[0]
152 cls.size = target[1]
153 cls.suppl = tuple(target[9])
154 cls.freq = target[11]
155 cls.ovhg = target[13]
156 cls.ovhgseq = target[14]
157 cls.bases = ()
158
159
160
161
162
163
164
165 if target[10] : cls.bases += ('Palindromic',)
166 else : cls.bases += ('NonPalindromic',)
167
168
169
170
171
172
173
174 if not target[2] :
175
176
177
178 cls.bases += ('NoCut','Unknown', 'NotDefined')
179 cls.fst5 = None
180 cls.fst3 = None
181 cls.scd5 = None
182 cls.scd3 = None
183 cls.ovhg = None
184 cls.ovhgseq = None
185 else :
186
187
188
189 if target[2] == 2 :
190 cls.bases += ('OneCut',)
191 cls.fst5 = target[4]
192 cls.fst3 = target[5]
193 cls.scd5 = None
194 cls.scd3 = None
195 else :
196 cls.bases += ('TwoCuts',)
197 cls.fst5 = target[4]
198 cls.fst3 = target[5]
199 cls.scd5 = target[6]
200 cls.scd3 = target[7]
201
202
203
204
205
206
207
208
209
210
211
212
213
214 if target[3] :
215
216
217
218
219 cls.bases += ('Blunt', 'Defined')
220 cls.ovhg = 0
221 elif isinstance(cls.ovhg, int) :
222
223
224
225 if cls.ovhg > 0 :
226
227
228
229
230 cls.bases += ('Ov3', 'Ambiguous')
231 elif cls.ovhg < 0 :
232
233
234
235
236 cls.bases += ('Ov5', 'Ambiguous')
237 else :
238
239
240
241 if cls.fst5 - (cls.fst3 + cls.size) < 0 :
242 cls.bases += ('Ov5', 'Defined')
243 cls.ovhg = - len(cls.ovhg)
244 else :
245 cls.bases += ('Ov3', 'Defined')
246 cls.ovhg = + len(cls.ovhg)
247
248
249
250
251
252
253
254
255 if target[8] :
256 cls.bases += ('Meth_Dep', )
257 cls.compsite = target[12]
258 else :
259 cls.bases += ('Meth_Undep',)
260 cls.compsite = target[12]
261
262
263
264
265 if cls.suppl :
266 cls.bases += ('Commercially_available', )
267 else :
268 cls.bases += ('Not_available', )
269 cls.bases += ('AbstractCut', 'RestrictionType')
270 cls.__name__ = name
271 cls.results = None
272 cls.dna = None
273 cls.__bases__ = cls.bases
274 cls.charac = (cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site)
275 if not target[2] and cls.suppl :
276 supp = ', '.join([suppliersdict[s][0] for s in cls.suppl])
277 print 'WARNING : It seems that %s is both commercially available\
278 \n\tand its characteristics are unknown. \
279 \n\tThis seems counter-intuitive.\
280 \n\tThere is certainly an error either in ranacompiler or\
281 \n\tin this REBASE release.\
282 \n\tThe supplier is : %s.' % (name, supp)
283 return
284
285
287
288 """Build the different types possible for Restriction Enzymes"""
289
291 """TypeCompiler() -> new TypeCompiler instance."""
292 pass
293
295 """TC.buildtype() -> generator.
296
297 build the new types that will be needed for constructing the
298 restriction enzymes."""
299 baT = (AbstractCut, RestrictionType)
300 cuT = (NoCut, OneCut, TwoCuts)
301 meT = (Meth_Dep, Meth_Undep)
302 paT = (Palindromic, NonPalindromic)
303 ovT = (Unknown, Blunt, Ov5, Ov3)
304 deT = (NotDefined, Defined, Ambiguous)
305 coT = (Commercially_available, Not_available)
306 All = (baT, cuT, meT, paT, ovT, deT, coT)
307
308
309
310
311
312
313 types = [(p,c,o,d,m,co,baT[0],baT[1])
314 for p in paT for c in cuT for o in ovT
315 for d in deT for m in meT for co in coT]
316 n= 1
317 for ty in types :
318 dct = {}
319 for t in ty :
320 dct.update(t.__dict__)
321
322
323
324
325
326
327 dct['results'] = []
328 dct['substrat'] = 'DNA'
329 dct['dna'] = None
330 if t == NoCut :
331 dct.update({'fst5':None,'fst3':None,
332 'scd5':None,'scd3':None,
333 'ovhg':None,'ovhgseq':None})
334 elif t == OneCut :
335 dct.update({'scd5':None, 'scd3':None})
336 class klass(type) :
337 def __new__(cls) :
338 return type.__new__(cls, 'type%i'%n,ty,dct)
339 def __init__(cls) :
340 super(klass, cls).__init__('type%i'%n,ty,dct)
341 yield klass()
342 n+=1
343
344 start = '\n\
345 #!/usr/bin/env python\n\
346 #\n\
347 # Restriction Analysis Libraries.\n\
348 # Copyright (C) 2004. Frederic Sohm.\n\
349 #\n\
350 # This code is part of the Biopython distribution and governed by its\n\
351 # license. Please see the LICENSE file that should have been included\n\
352 # as part of this package.\n\
353 #\n\
354 #\n\
355 rest_dict = \\\n'
356
358
359 - def __init__(self, e_mail='', ftp_proxy='') :
360 """DictionaryBuilder([e_mail[, ftp_proxy]) -> DictionaryBuilder instance.
361
362 If the emboss files used for the construction need to be updated this
363 class will download them if the ftp connection is correctly set.
364 either in RanaConfig.py or given at run time.
365
366 e_mail is the e-mail address used as password for the anonymous
367 ftp connection.
368
369 proxy is the ftp_proxy to use if any."""
370 self.rebase_pass = e_mail or config.Rebase_password
371 self.proxy = ftp_proxy or config.ftp_proxy
372
374 """DB.build_dict() -> None.
375
376 Construct the dictionary and build the files containing the new
377 dictionaries."""
378
379
380
381 emboss_e, emboss_r, emboss_s = self.lastrebasefile()
382
383
384
385 self.information_mixer(emboss_r, emboss_e, emboss_s)
386 emboss_r.close()
387 emboss_e.close()
388 emboss_s.close()
389
390
391
392 tdct = {}
393 for klass in TypeCompiler().buildtype() :
394 exec klass.__name__ +'= klass'
395 exec "tdct['"+klass.__name__+"'] = klass"
396
397
398
399
400
401
402
403 for name in enzymedict :
404
405
406
407 cls = newenzyme(name)
408
409
410
411 bases = cls.bases
412 clsbases = tuple([eval(x) for x in bases])
413 typestuff = ''
414 for n, t in tdct.iteritems() :
415
416
417
418
419 if t.__bases__ == clsbases :
420 typestuff = t
421 typename = t.__name__
422 continue
423
424
425
426 dct = dict(cls.__dict__)
427 del dct['bases']
428 del dct['__bases__']
429 del dct['__name__']
430 classdict[name] = dct
431
432 commonattr = ['fst5', 'fst3', 'scd5', 'scd3', 'substrat',
433 'ovhg', 'ovhgseq','results', 'dna']
434 if typename in typedict :
435 typedict[typename][1].append(name)
436 else :
437 enzlst= []
438 tydct = dict(typestuff.__dict__)
439 tydct = dict([(k,v) for k,v in tydct.iteritems() if k in commonattr])
440 enzlst.append(name)
441 typedict[typename] = (bases, enzlst)
442 for letter in cls.__dict__['suppl'] :
443 supplier = suppliersdict[letter]
444 suppliersdict[letter][1].append(name)
445 if not classdict or not suppliersdict or not typedict :
446 print 'One of the new dictionaries is empty.'
447 print 'Check the integrity of the emboss file before continuing.'
448 print 'Update aborted.'
449 sys.exit()
450
451
452
453 print '\nThe new database contains %i enzymes.\n' % len(classdict)
454
455
456
457
458
459 update = os.getcwd()
460 results = open(os.path.join(update, 'Restriction_Dictionary.py'), 'w')
461 print 'Writing the dictionary containing the new Restriction classes.\t',
462 results.write(start)
463 a = pprint.PrettyPrinter(1, 80, None, results)
464 a.pprint(classdict)
465 print 'OK.\n'
466 print 'Writing the dictionary containing the suppliers datas.\t\t',
467 results.write('suppliers = \\\n')
468 a.pprint(suppliersdict)
469 print 'OK.\n'
470 print 'Writing the dictionary containing the Restriction types.\t',
471 results.write('typedict = \\\n')
472 a.pprint(typedict)
473 print 'OK.\n'
474 results.close()
475 return
476
478 """DB.install_dict() -> None.
479
480 Install the newly created dictionary in the site-packages folder.
481
482 May need super user privilege on some architectures."""
483 print '\n ' +'*'*78 + ' \n'
484 print '\n\t\tInstalling Restriction_Dictionary.py'
485 try :
486 import Bio.Restriction.Restriction_Dictionary as rd
487 except ImportError :
488 print '\
489 \n Unable to locate the previous Restriction_Dictionary.py module\
490 \n Aborting installation.'
491 sys.exit()
492
493
494
495 old = os.path.join(os.path.split(rd.__file__)[0],
496 'Restriction_Dictionary.py')
497
498 update_folder = os.getcwd()
499 shutil.copyfile(old, os.path.join(update_folder,
500 'Restriction_Dictionary.old'))
501
502
503
504 new = os.path.join(update_folder, 'Restriction_Dictionary.py')
505 try :
506 execfile(new)
507 print '\
508 \n\tThe new file seems ok. Proceeding with the installation.'
509 except SyntaxError :
510 print '\
511 \n The new dictionary file is corrupted. Aborting the installation.'
512 return
513 try :
514 shutil.copyfile(new, old)
515 print'\n\t Everything ok. If you need it a version of the old\
516 \n\t dictionary have been saved in the Updates folder under\
517 \n\t the name Restriction_Dictionary.old.'
518 print '\n ' +'*'*78 + ' \n'
519 except IOError :
520 print '\n ' +'*'*78 + ' \n'
521 print '\
522 \n\t WARNING : Impossible to install the new dictionary.\
523 \n\t Are you sure you have write permission to the folder :\n\
524 \n\t %s ?\n\n' % os.path.split(old)[0]
525 return self.no_install()
526 return
527
529 """BD.no_install() -> None.
530
531 build the new dictionary but do not install the dictionary."""
532 print '\n ' +'*'*78 + '\n'
533
534 try :
535 import Bio.Restriction.Restriction_Dictionary as rd
536 except ImportError :
537 print '\
538 \n Unable to locate the previous Restriction_Dictionary.py module\
539 \n Aborting installation.'
540 sys.exit()
541
542
543
544 old = os.path.join(os.path.split(rd.__file__)[0],
545 'Restriction_Dictionary.py')
546 update = os.getcwd()
547 shutil.copyfile(old, os.path.join(update, 'Restriction_Dictionary.old'))
548 places = update, os.path.split(Bio.Restriction.Restriction.__file__)[0]
549 print "\t\tCompilation of the new dictionary : OK.\
550 \n\t\tInstallation : No.\n\
551 \n You will find the newly created 'Restriction_Dictionary.py' file\
552 \n in the folder : \n\
553 \n\t%s\n\
554 \n Make a copy of 'Restriction_Dictionary.py' and place it with \
555 \n the other Restriction libraries.\n\
556 \n note : \
557 \n This folder should be :\n\
558 \n\t%s\n" % places
559 print '\n ' +'*'*78 + '\n'
560 return
561
562
564 """BD.lastrebasefile() -> None.
565
566 Check the emboss files are up to date and download them if they are not.
567 """
568 embossnames = ('emboss_e', 'emboss_r', 'emboss_s')
569
570
571
572 emboss_now = ['.'.join((x,LocalTime())) for x in embossnames]
573 update_needed = False
574
575 dircontent = os.listdir(os.getcwd())
576 base = os.getcwd()
577 for name in emboss_now :
578 if name in dircontent :
579 pass
580 else :
581 update_needed = True
582
583 if not update_needed :
584
585
586
587 print '\n Using the files : %s'% ', '.join(emboss_now)
588 return tuple([open(os.path.join(base, n)) for n in emboss_now])
589 else :
590
591
592
593 print '\n The rebase files are more than one month old.\
594 \n Would you like to update them before proceeding?(y/n)'
595 r = raw_input(' update [n] >>> ')
596 if r in ['y', 'yes', 'Y', 'Yes'] :
597 updt = RebaseUpdate(self.rebase_pass, self.proxy)
598 updt.openRebase()
599 updt.getfiles()
600 updt.close()
601 print '\n Update complete. Creating the dictionaries.\n'
602 print '\n Using the files : %s'% ', '.join(emboss_now)
603 return tuple([open(os.path.join(base, n)) for n in emboss_now])
604 else :
605
606
607
608
609 class NotFoundError(Exception) :
610 pass
611 for name in embossnames :
612 try :
613 for file in dircontent :
614 if file.startswith(name) :
615 break
616 else :
617 pass
618 raise NotFoundError
619 except NotFoundError :
620 print "\nNo %s file found. Upgrade is impossible.\n"%name
621 sys.exit()
622 continue
623 pass
624
625
626
627 last = [0]
628 for file in dircontent :
629 fs = file.split('.')
630 try :
631 if fs[0] in embossnames and int(fs[1]) > int(last[-1]) :
632 if last[0] : last.append(fs[1])
633 else : last[0] = fs[1]
634 else :
635 continue
636 except ValueError :
637 continue
638 last.sort()
639 last = last[::-1]
640 if int(last[-1]) < 100 : last[0], last[-1] = last[-1], last[0]
641 for number in last :
642 files = [(name, name+'.%s'%number) for name in embossnames]
643 strmess = '\nLast EMBOSS files found are :\n'
644 try :
645 for name,file in files :
646 if os.path.isfile(os.path.join(base, file)) :
647 strmess += '\t%s.\n'%file
648 else :
649 raise ValueError
650 print strmess
651 emboss_e = open(os.path.join(base, 'emboss_e.%s'%number),'r')
652 emboss_r = open(os.path.join(base, 'emboss_r.%s'%number),'r')
653 emboss_s = open(os.path.join(base, 'emboss_s.%s'%number),'r')
654 return emboss_e, emboss_r, emboss_s
655 except ValueError :
656 continue
657
659 line = [line[0]]+[line[1].upper()]+[int(i) for i in line[2:9]]+line[9:]
660 name = line[0]
661 site = line[1]
662 dna = DNA(site)
663 size = line[2]
664
665
666
667 fst5 = line[5]
668 fst3 = line[6]
669 scd5 = line[7]
670 scd3 = line[8]
671
672
673
674
675 ovhg1 = fst5 - fst3
676 ovhg2 = scd5 - scd3
677
678
679
680
681
682
683
684 if fst5 < 0 : fst5 += 1
685 if fst3 < 0 : fst3 += 1
686 if scd5 < 0 : scd5 += 1
687 if scd3 < 0 : scd3 += 1
688
689 if ovhg2 != 0 and ovhg1 != ovhg2 :
690
691
692
693
694
695
696
697
698
699 print '\
700 \nWARNING : %s cut twice with different overhang length each time.\
701 \n\tUnable to deal with this behaviour. \
702 \n\tThis enzyme will not be included in the database. Sorry.' %name
703 print '\tChecking :',
704 raise OverhangError
705 if 0 <= fst5 <= size and 0 <= fst3 <= size :
706
707
708
709 if fst5 < fst3 :
710
711
712
713 ovhg1 = ovhgseq = site[fst5:fst3]
714 elif fst5 > fst3 :
715
716
717
718 ovhg1 = ovhgseq = site[fst3:fst5]
719 else :
720
721
722
723 ovhg1 = ovhgseq = ''
724 for base in 'NRYWMSKHDBV' :
725 if base in ovhg1 :
726
727
728
729 ovhgseq = ovhg1
730 if fst5 < fst3 : ovhg1 = - len(ovhg1)
731 else : ovhg1 = len(ovhg1)
732 break
733 else :
734 continue
735 elif 0 <= fst5 <= size :
736
737
738
739 if fst5 < fst3 :
740
741
742
743 ovhgseq = site[fst5:] + (fst3 - size) * 'N'
744 elif fst5 > fst3 :
745
746
747
748 ovhgseq = abs(fst3) * 'N' + site[:fst5]
749 else :
750
751
752
753 ovhg1 = ovhgseq = ''
754 elif 0 <= fst3 <= size :
755
756
757
758 if fst5 < fst3 :
759
760
761
762 ovhgseq = abs(fst5) * 'N' + site[:fst3]
763 elif fst5 > fst3 :
764
765
766
767 ovhgseq = site[fst3:] + (fst5 - size) * 'N'
768 else :
769
770
771
772 raise ValueError, 'Error in #1'
773 elif fst3 < 0 and size < fst5 :
774
775
776
777 ovhgseq = abs(fst3)*'N' + site + (fst5-size)*'N'
778 elif fst5 < 0 and size <fst3 :
779
780
781
782 ovhgseq = abs(fst5)*'N' + site + (fst3-size)*'N'
783 else :
784
785
786
787 ovhgseq = 'N' * abs(ovhg1)
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807 for x in (5, 7) :
808 if line[x] < 0 : line[x] += 1
809 for x in (6, 8) :
810 if line[x] > 0 : line[x] -= size
811 elif line[x] < 0 : line[x] = line[x] - size + 1
812
813
814
815
816
817
818
819 rg = ''
820 if is_palindrom(dna) :
821 line.append(True)
822 rg = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
823 else :
824 line.append(False)
825 sense = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
826 antisense = ''.join(['(?P<', name, '_as>',
827 regex(Antiparallel(dna)), ')'])
828 rg = sense + '|' + antisense
829
830
831
832 f = [4/len(dna_alphabet[l]) for l in site.upper()]
833 freq = reduce(lambda x, y : x*y, f)
834 line.append(freq)
835
836
837
838
839 line.append(rg)
840 line.append(ovhg1)
841 line.append(ovhgseq)
842 return line
843
845
846
847
848 return [l for l in itertools.dropwhile(lambda l:l.startswith('#'),file)]
849
851
852
853
854 take = itertools.takewhile
855 block = [l for l in take(lambda l :not l.startswith('//'),file[index:])]
856 index += len(block)+1
857 return block, index
858
859 - def get(self, block) :
860
861
862
863
864
865
866
867 bl3 = block[3].strip()
868 if not bl3 : bl3 = False
869 return (block[0].strip(), bl3, block[5].strip())
870
940