1
2
3
4
5
6
7
8 """
9 Bio.GFF.easy: some functions to ease the use of Biopython
10 """
11
12 from __future__ import generators
13
14 __version__ = "$Revision: 1.9 $"
15
16
17 import copy
18 import re
19 import string
20 import sys
21
22 import Bio
23 from Bio import GenBank
24 from Bio.Data import IUPACData
25 from Bio.Seq import Seq
26
27 from Bio import SeqIO
28 from Bio import SeqUtils
29
30 import GenericTools
31
33 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
34 - def __init__(self, feature_list, default=None):
48
49 -class Location(GenericTools.VerboseList):
50 """
51 this is really best interfaced through LocationFromString
52 fuzzy: < or >
53 join: {0 = no join, 1 = join, 2 = order}
54
55 >>> location = Location([Location([339]), Location([564])]) # zero-based
56 >>> location
57 Location(Location(339), Location(564))
58 >>> print location # one-based
59 340..565
60 >>> print location.five_prime()
61 340
62 >>> location_rev = Location([Location([339]), Location([564])], 1)
63 >>> print location_rev
64 complement(340..565)
65 >>> print location_rev.five_prime()
66 565
67 """
68 - def __init__(self, the_list, complement=0, seqname=None):
74
76 if self.join == 1:
77 label = 'join'
78 elif self.join == 2:
79 label = 'order'
80 return "%s(%s)" % (label, ",".join(map(str, self)))
81
83 if self.seqname:
84 format = "%s:%%s" % self.seqname
85 else:
86 format = "%s"
87
88 if self.complement:
89 format = format % "complement(%s)"
90
91 if self.join:
92 return format % self._joinstr()
93
94 elif isinstance(self[0], list):
95 return format % "%s..%s" % (str(self[0]), str(self[1]))
96 else:
97 if self.fuzzy:
98 format = format % self.fuzzy + "%s"
99 return format % str(self[0] + 1)
100
102 return "Location(%s)" % ", ".join(map(repr, self))
103
104 direction2index = {1: 0, -1: -1}
106 """
107 1: 5'
108 -1: 3'
109
110 >>> loc1 = LocationFromString("join(1..3,complement(5..6))")
111 >>> loc1.direction_and_index(1)
112 (1, 0)
113 >>> loc1.direction_and_index(-1)
114 (-1, -1)
115 >>> loc1.reverse()
116 >>> print loc1
117 complement(join(1..3,complement(5..6)))
118 >>> loc1.direction_and_index(1)
119 (-1, -1)
120 """
121 if self.complement:
122 direction = direction * -1
123 index = self.direction2index[direction]
124 return direction, index
125
127 """
128 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))")
129 >>> loc.findside(1)
130 Location(5)
131 >>> loc.findside(-1)
132 Location(0)
133 """
134 direction, index = self.direction_and_index(direction)
135 if self.join or isinstance(self[0], list):
136 return self[index].findside(direction)
137 else:
138 return self
139
141 """
142 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
143 >>> loc.findseqname_3prime()
144 'MOOCOW'
145 """
146 return self.findseqname(-1)
147
149 """
150 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
151 >>> loc.findseqname()
152 'SEQ'
153 >>> loc.findseqname(-1)
154 'MOOCOW'
155 """
156 direction, index = self.direction_and_index(direction)
157 if self.seqname:
158 return self.seqname
159 elif self.join:
160 return self[index].findseqname(direction)
161 else:
162 raise AttributeError, 'no sequence name'
163
168
170 """
171 WARNING: doesn't deal with joins!!!!
172 """
173 return self.end()-self.start()
174
176 """
177 WARNING: doesn't deal with joins!!!!
178
179 >>> location1 = LocationFromString("1..50")
180 >>> location2 = LocationFromString("25..200")
181 >>> print location1.intersection(location2)
182 25..50
183 >>> print location1.intersection(location2)
184 25..50
185 """
186 if self.start() >= other.start():
187 start = self.start()
188 else:
189 start = other.start()
190 if self.end() <= other.end():
191 end = self.end()
192 else:
193 end = other.end()
194 return Location([Location([start]), Location([end])])
195
202
209
216
218 """
219 >>> fwd_location = LocationFromString('X:5830132..5831528')
220 >>> print fwd_location.sublocation(LocationFromString('1..101'))
221 X:5830132..5830232
222 >>> print fwd_location.sublocation(LocationFromString('1267..1286'))
223 X:5831398..5831417
224 >>> rev_location = LocationFromString('I:complement(8415686..8416216)')
225 >>> print rev_location.sublocation(LocationFromString('1..101'))
226 I:complement(8416116..8416216)
227 >>> print rev_location.sublocation(LocationFromString('100..200'))
228 I:complement(8416017..8416117)
229 """
230
231 absolute_location = copy.deepcopy(self)
232 for i in xrange(2):
233 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement)
234 if absolute_location.complement:
235 list.reverse(absolute_location)
236 return absolute_location
237
239 return self.add(addend)
240
241 - def add(self, addend, complement=0):
242 self_copy = copy.deepcopy(self)
243 if isinstance(addend, Location):
244 addend = addend[0]
245 if complement:
246 addend *= -1
247 self_copy[0] += addend
248 return self_copy
249
251 return self + -subtrahend
252
255
257 """
258 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))")
259 >>> loc1.reorient()
260 >>> print loc1
261 complement(join(I:1..9000,I:9001..10000))
262 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)")
263 >>> loc2.reorient()
264 >>> print loc2
265 join(I:complement(1..9000),I:9001..10000)
266 """
267 if self.join:
268 if len([x for x in self if x.complement]) == len(self):
269 self.reverse()
270 for segment in self:
271 segment.reverse()
272
274 """
275 works for single level non-complex joins
276
277 >>> LOC = LocationFromString
278 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)")
279 >>> print l1.bounding()
280 join(alpha:1..70)
281 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))")
282 >>> print l2.bounding()
283 join(alpha:1..30,alpha:complement(50..70))
284 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)")
285 >>> print l3.bounding()
286 join(alpha:1..45,alpha:complement(50..70),beta:6..20)
287
288 """
289 if not self.join:
290 return
291
292 seqdict = {}
293 seqkeys = []
294 for subloc in self:
295 assert subloc.seqname
296 assert not subloc.join
297 try:
298 seqdict[_hashname(subloc)].append(subloc)
299 except KeyError:
300 key = _hashname(subloc)
301 seqdict[key] = [subloc]
302 seqkeys.append(key)
303
304 res = LocationJoin()
305 for key in seqkeys:
306 locations = seqdict[key]
307 coords = []
308 for subloc in locations:
309 coords.append(subloc.start())
310 coords.append(subloc.end())
311 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname))
312 return res
313
316
318 """
319 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")])
320 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))")
321 >>> join.append(appendloc)
322 >>> print join
323 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55)))
324 >>> join2 = LocationJoin()
325 >>> join2.append(LocationFromString("complement(66..99)"))
326 >>> join2.append(LocationFromString("complement(5..55)"))
327 >>> print join2
328 join(complement(66..99),complement(5..55))
329 """
330 - def __init__(self, the_list = [], complement=0, seqname=None):
336
338 """
339 >>> print LocationFromCoords(339, 564)
340 340..565
341 >>> print LocationFromCoords(339, 564, seqname="I")
342 I:340..565
343 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434")
344 NC_343434:complement(1000..3235)
345 """
346 - def __init__(self, start, end, strand=0, seqname=None):
352
353
354
355 re_complement = re.compile(r"^complement\((.*)\)$")
356 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$")
357 re_join = re.compile(r"^(join|order)\((.*)\)$")
358 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$")
359 re_fuzzy = re.compile(r"^([><])(\d+)")
361 """
362 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location
363 >>> print LocationFromString("467")
364 467
365 >>> print LocationFromString("340..565")
366 340..565
367 >>> print LocationFromString("<345..500")
368 <345..500
369 >>> print LocationFromString("<1..888")
370 <1..888
371 >>> # (102.110) and 123^124 syntax unimplemented
372 >>> print LocationFromString("join(12..78,134..202)")
373 join(12..78,134..202)
374 >>> print LocationFromString("complement(join(2691..4571,4918..5163))")
375 complement(join(2691..4571,4918..5163))
376 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))")
377 join(complement(4918..5163),complement(2691..4571))
378 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))")
379 order(complement(4918..5163),complement(2691..4571))
380 >>> print LocationFromString("NC_001802x.fna:73..78")
381 NC_001802x.fna:73..78
382 >>> print LocationFromString("J00194:100..202")
383 J00194:100..202
384
385 >>> print LocationFromString("join(117505..118584,1..609)")
386 join(117505..118584,1..609)
387 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))")
388 join(test3:complement(4..6),test3:complement(1..3))
389 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)")
390 test3:join(test1:complement(1..3),4..6)
391 """
423
425 if filename:
426 return open(filename)
427 else:
428 return sys.stdin
429
431 """
432 >>> record = fasta_single(string=\"""
433 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]
434 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT
435 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG
436 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA
437 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT
438 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC
439 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG
440 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR
441 ... SLFGNDPSSQ
442 ... \""")
443 >>> record.id
444 'gi|9629360|ref|NP_057850.1|'
445 >>> record.description
446 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]'
447 >>> record.seq[0:5]
448 Seq('MGARA', SingleLetterAlphabet())
449 """
450
451
452 if string:
453 import cStringIO
454 handle = cStringIO.StringIO(string)
455 else :
456 handle = open_file(filename)
457 try :
458 record = SeqIO.parse(handle, format="fasta").next()
459 except StopIteration :
460 record = None
461 return record
462
475
477 """
478 >>> records = fasta_readrecords('GFF/multi.fna')
479 >>> records[0].id
480 'test1'
481 >>> records[2].seq
482 Seq('AAACACAC', SingleLetterAlphabet())
483 """
484 return list(SeqIO.parse(open_file(filename), format="fasta"))
485
490
492 """
493 >>> record = genbank_single("GFF/NC_001422.gbk")
494 >>> record.taxonomy
495 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
496 >>> cds = record.features[-4]
497 >>> cds.key
498 'CDS'
499 >>> location = LocationFromString(cds.location)
500 >>> print location
501 2931..3917
502 >>> subseq = record_subseq(record, location)
503 >>> subseq[0:20]
504 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
505 """
506 return GenBank.RecordParser().parse(open(filename))
507
509 """
510 >>> from Bio.SeqRecord import SeqRecord
511 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
512 ... "ref|NC_001422",
513 ... "Coliphage phiX174, complete genome",
514 ... "bases 1-11")
515 >>> record_subseq(record, LocationFromString("1..4")) # one-based
516 Seq('GAGT', Alphabet())
517 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based
518 Seq('ACTC', Alphabet())
519 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea!
520 Seq('ACTCGAGT', Alphabet())
521 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))")
522 >>> print loc
523 complement(join(complement(5..7),1..4))
524 >>> record_subseq(record, loc)
525 Seq('ACTCTTT', Alphabet())
526 >>> print loc
527 complement(join(complement(5..7),1..4))
528 >>> loc.reverse()
529 >>> record_subseq(record, loc)
530 Seq('AAAGAGT', Alphabet())
531 >>> record_subseq(record, loc, upper=1)
532 Seq('AAAGAGT', Alphabet())
533 """
534 if location.join:
535 subsequence_list = []
536 if location.complement:
537 location_copy = copy.copy(location)
538 list.reverse(location_copy)
539 else:
540 location_copy = location
541 for sublocation in location_copy:
542 if location.complement:
543 sublocation_copy = copy.copy(sublocation)
544 sublocation_copy.reverse()
545 else:
546 sublocation_copy = sublocation
547 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring())
548 return Seq(''.join(subsequence_list), record_sequence(record).alphabet)
549 else:
550 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
551
564
566 """
567 >>> from Bio.SeqRecord import SeqRecord
568 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
569 ... "ref|NC_001422",
570 ... "Coliphage phiX174, complete genome",
571 ... "bases 1-11")
572 >>> record_coords(record, 0, 4) # zero-based
573 Seq('GAGT', Alphabet())
574 >>> record_coords(record, 0, 4, "-") # zero-based
575 Seq('ACTC', Alphabet())
576 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based
577 Seq('ACTC', Alphabet())
578 """
579
580 subseq = record_sequence(record)[start:end]
581 subseq_str = subseq.tostring()
582 subseq_str = subseq_str.upper()
583 subseq = Seq(subseq_str, subseq.alphabet)
584 if strand == '-' or strand == 1:
585 return subseq.reverse_complement()
586 else:
587 return subseq
588
589 -def _test(*args, **keywds):
590 import doctest, sys
591 doctest.testmod(sys.modules[__name__], *args, **keywds)
592
593 if __name__ == "__main__":
594 if __debug__:
595 _test()
596