1
2
3
4
5
6
7
8 """
9 Bio.GFF.easy: some functions to ease the use of Biopython
10 """
11
12 from __future__ import generators
13
14 __version__ = "$Revision: 1.8 $"
15
16
17 import copy
18 import re
19 import string
20 import sys
21
22 import Bio
23 from Bio import GenBank
24 from Bio.Data import IUPACData
25 from Bio.Seq import Seq
26
27
28
29
30 from Bio import SeqIO
31
32 from Bio import SeqUtils
33
34 import GenericTools
35
37 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
38 - def __init__(self, feature_list, default=None):
52
53 -class Location(GenericTools.VerboseList):
54 """
55 this is really best interfaced through LocationFromString
56 fuzzy: < or >
57 join: {0 = no join, 1 = join, 2 = order}
58
59 >>> location = Location([Location([339]), Location([564])]) # zero-based
60 >>> location
61 Location(Location(339), Location(564))
62 >>> print location # one-based
63 340..565
64 >>> print location.five_prime()
65 340
66 >>> location_rev = Location([Location([339]), Location([564])], 1)
67 >>> print location_rev
68 complement(340..565)
69 >>> print location_rev.five_prime()
70 565
71 """
72 - def __init__(self, the_list, complement=0, seqname=None):
78
80 if self.join == 1:
81 label = 'join'
82 elif self.join == 2:
83 label = 'order'
84 return "%s(%s)" % (label, ",".join(map(str, self)))
85
87 if self.seqname:
88 format = "%s:%%s" % self.seqname
89 else:
90 format = "%s"
91
92 if self.complement:
93 format = format % "complement(%s)"
94
95 if self.join:
96 return format % self._joinstr()
97
98 elif isinstance(self[0], list):
99 return format % "%s..%s" % (str(self[0]), str(self[1]))
100 else:
101 if self.fuzzy:
102 format = format % self.fuzzy + "%s"
103 return format % str(self[0] + 1)
104
106 return "Location(%s)" % ", ".join(map(repr, self))
107
108 direction2index = {1: 0, -1: -1}
110 """
111 1: 5'
112 -1: 3'
113
114 >>> loc1 = LocationFromString("join(1..3,complement(5..6))")
115 >>> loc1.direction_and_index(1)
116 (1, 0)
117 >>> loc1.direction_and_index(-1)
118 (-1, -1)
119 >>> loc1.reverse()
120 >>> print loc1
121 complement(join(1..3,complement(5..6)))
122 >>> loc1.direction_and_index(1)
123 (-1, -1)
124 """
125 if self.complement:
126 direction = direction * -1
127 index = self.direction2index[direction]
128 return direction, index
129
131 """
132 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))")
133 >>> loc.findside(1)
134 Location(5)
135 >>> loc.findside(-1)
136 Location(0)
137 """
138 direction, index = self.direction_and_index(direction)
139 if self.join or isinstance(self[0], list):
140 return self[index].findside(direction)
141 else:
142 return self
143
145 """
146 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
147 >>> loc.findseqname_3prime()
148 'MOOCOW'
149 """
150 return self.findseqname(-1)
151
153 """
154 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
155 >>> loc.findseqname()
156 'SEQ'
157 >>> loc.findseqname(-1)
158 'MOOCOW'
159 """
160 direction, index = self.direction_and_index(direction)
161 if self.seqname:
162 return self.seqname
163 elif self.join:
164 return self[index].findseqname(direction)
165 else:
166 raise AttributeError, 'no sequence name'
167
172
174 """
175 WARNING: doesn't deal with joins!!!!
176 """
177 return self.end()-self.start()
178
180 """
181 WARNING: doesn't deal with joins!!!!
182
183 >>> location1 = LocationFromString("1..50")
184 >>> location2 = LocationFromString("25..200")
185 >>> print location1.intersection(location2)
186 25..50
187 >>> print location1.intersection(location2)
188 25..50
189 """
190 if self.start() >= other.start():
191 start = self.start()
192 else:
193 start = other.start()
194 if self.end() <= other.end():
195 end = self.end()
196 else:
197 end = other.end()
198 return Location([Location([start]), Location([end])])
199
206
213
220
222 """
223 >>> fwd_location = LocationFromString('X:5830132..5831528')
224 >>> print fwd_location.sublocation(LocationFromString('1..101'))
225 X:5830132..5830232
226 >>> print fwd_location.sublocation(LocationFromString('1267..1286'))
227 X:5831398..5831417
228 >>> rev_location = LocationFromString('I:complement(8415686..8416216)')
229 >>> print rev_location.sublocation(LocationFromString('1..101'))
230 I:complement(8416116..8416216)
231 >>> print rev_location.sublocation(LocationFromString('100..200'))
232 I:complement(8416017..8416117)
233 """
234
235 absolute_location = copy.deepcopy(self)
236 for i in xrange(2):
237 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement)
238 if absolute_location.complement:
239 list.reverse(absolute_location)
240 return absolute_location
241
243 return self.add(addend)
244
245 - def add(self, addend, complement=0):
246 self_copy = copy.deepcopy(self)
247 if isinstance(addend, Location):
248 addend = addend[0]
249 if complement:
250 addend *= -1
251 self_copy[0] += addend
252 return self_copy
253
255 return self + -subtrahend
256
259
261 """
262 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))")
263 >>> loc1.reorient()
264 >>> print loc1
265 complement(join(I:1..9000,I:9001..10000))
266 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)")
267 >>> loc2.reorient()
268 >>> print loc2
269 join(I:complement(1..9000),I:9001..10000)
270 """
271 if self.join:
272 if len([x for x in self if x.complement]) == len(self):
273 self.reverse()
274 for segment in self:
275 segment.reverse()
276
278 """
279 works for single level non-complex joins
280
281 >>> LOC = LocationFromString
282 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)")
283 >>> print l1.bounding()
284 join(alpha:1..70)
285 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))")
286 >>> print l2.bounding()
287 join(alpha:1..30,alpha:complement(50..70))
288 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)")
289 >>> print l3.bounding()
290 join(alpha:1..45,alpha:complement(50..70),beta:6..20)
291
292 """
293 if not self.join:
294 return
295
296 seqdict = {}
297 seqkeys = []
298 for subloc in self:
299 assert subloc.seqname
300 assert not subloc.join
301 try:
302 seqdict[_hashname(subloc)].append(subloc)
303 except KeyError:
304 key = _hashname(subloc)
305 seqdict[key] = [subloc]
306 seqkeys.append(key)
307
308 res = LocationJoin()
309 for key in seqkeys:
310 locations = seqdict[key]
311 coords = []
312 for subloc in locations:
313 coords.append(subloc.start())
314 coords.append(subloc.end())
315 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname))
316 return res
317
320
322 """
323 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")])
324 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))")
325 >>> join.append(appendloc)
326 >>> print join
327 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55)))
328 >>> join2 = LocationJoin()
329 >>> join2.append(LocationFromString("complement(66..99)"))
330 >>> join2.append(LocationFromString("complement(5..55)"))
331 >>> print join2
332 join(complement(66..99),complement(5..55))
333 """
334 - def __init__(self, the_list = [], complement=0, seqname=None):
340
342 """
343 >>> print LocationFromCoords(339, 564)
344 340..565
345 >>> print LocationFromCoords(339, 564, seqname="I")
346 I:340..565
347 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434")
348 NC_343434:complement(1000..3235)
349 """
350 - def __init__(self, start, end, strand=0, seqname=None):
356
357
358
359 re_complement = re.compile(r"^complement\((.*)\)$")
360 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$")
361 re_join = re.compile(r"^(join|order)\((.*)\)$")
362 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$")
363 re_fuzzy = re.compile(r"^([><])(\d+)")
365 """
366 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location
367 >>> print LocationFromString("467")
368 467
369 >>> print LocationFromString("340..565")
370 340..565
371 >>> print LocationFromString("<345..500")
372 <345..500
373 >>> print LocationFromString("<1..888")
374 <1..888
375 >>> # (102.110) and 123^124 syntax unimplemented
376 >>> print LocationFromString("join(12..78,134..202)")
377 join(12..78,134..202)
378 >>> print LocationFromString("complement(join(2691..4571,4918..5163))")
379 complement(join(2691..4571,4918..5163))
380 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))")
381 join(complement(4918..5163),complement(2691..4571))
382 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))")
383 order(complement(4918..5163),complement(2691..4571))
384 >>> print LocationFromString("NC_001802x.fna:73..78")
385 NC_001802x.fna:73..78
386 >>> print LocationFromString("J00194:100..202")
387 J00194:100..202
388
389 >>> print LocationFromString("join(117505..118584,1..609)")
390 join(117505..118584,1..609)
391 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))")
392 join(test3:complement(4..6),test3:complement(1..3))
393 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)")
394 test3:join(test1:complement(1..3),4..6)
395 """
427
429 if filename:
430 return open(filename)
431 else:
432 return sys.stdin
433
435 """
436 >>> record = fasta_single(string=\"""
437 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]
438 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT
439 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG
440 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA
441 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT
442 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC
443 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG
444 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR
445 ... SLFGNDPSSQ
446 ... \""")
447 >>> record.id
448 'gi|9629360|ref|NP_057850.1|'
449 >>> record.description
450 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]'
451 >>> record.seq[0:5]
452 Seq('MGARA', SingleLetterAlphabet())
453 """
454
455
456 if string:
457 import cStringIO
458 handle = cStringIO.StringIO(string)
459 else :
460 handle = open_file(filename)
461 try :
462 record = SeqIO.parse(handle, format="fasta").next()
463 except StopIteration :
464 record = None
465 return record
466
479
481 """
482 >>> records = fasta_readrecords('GFF/multi.fna')
483 >>> records[0].id
484 'test1'
485 >>> records[2].seq
486 Seq('AAACACAC', SingleLetterAlphabet())
487 """
488 return list(SeqIO.parse(open_file(filename), format="fasta"))
489
494
496 """
497 >>> record = genbank_single("GFF/NC_001422.gbk")
498 >>> record.taxonomy
499 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
500 >>> cds = record.features[-4]
501 >>> cds.key
502 'CDS'
503 >>> location = LocationFromString(cds.location)
504 >>> print location
505 2931..3917
506 >>> subseq = record_subseq(record, location)
507 >>> subseq[0:20]
508 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
509 """
510 return GenBank.RecordParser().parse(open(filename))
511
513 """
514 >>> from Bio.SeqRecord import SeqRecord
515 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
516 ... "ref|NC_001422",
517 ... "Coliphage phiX174, complete genome",
518 ... "bases 1-11")
519 >>> record_subseq(record, LocationFromString("1..4")) # one-based
520 Seq('GAGT', Alphabet())
521 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based
522 Seq('ACTC', Alphabet())
523 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea!
524 Seq('ACTCGAGT', Alphabet())
525 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))")
526 >>> print loc
527 complement(join(complement(5..7),1..4))
528 >>> record_subseq(record, loc)
529 Seq('ACTCTTT', Alphabet())
530 >>> print loc
531 complement(join(complement(5..7),1..4))
532 >>> loc.reverse()
533 >>> record_subseq(record, loc)
534 Seq('AAAGAGT', Alphabet())
535 >>> record_subseq(record, loc, upper=1)
536 Seq('AAAGAGT', Alphabet())
537 """
538 if location.join:
539 subsequence_list = []
540 if location.complement:
541 location_copy = copy.copy(location)
542 list.reverse(location_copy)
543 else:
544 location_copy = location
545 for sublocation in location_copy:
546 if location.complement:
547 sublocation_copy = copy.copy(sublocation)
548 sublocation_copy.reverse()
549 else:
550 sublocation_copy = sublocation
551 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring())
552 return Seq(''.join(subsequence_list), record_sequence(record).alphabet)
553 else:
554 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
555
568
570 """
571 >>> from Bio.SeqRecord import SeqRecord
572 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
573 ... "ref|NC_001422",
574 ... "Coliphage phiX174, complete genome",
575 ... "bases 1-11")
576 >>> record_coords(record, 0, 4) # zero-based
577 Seq('GAGT', Alphabet())
578 >>> record_coords(record, 0, 4, "-") # zero-based
579 Seq('ACTC', Alphabet())
580 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based
581 Seq('ACTC', Alphabet())
582 """
583
584 subseq = record_sequence(record)[start:end]
585 subseq_str = subseq.tostring()
586 subseq_str = subseq_str.upper()
587 subseq = Seq(subseq_str, subseq.alphabet)
588 if strand == '-' or strand == 1:
589 return subseq.reverse_complement()
590 else:
591 return subseq
592
593 -def _test(*args, **keywds):
594 import doctest, sys
595 doctest.testmod(sys.modules[__name__], *args, **keywds)
596
597 if __name__ == "__main__":
598 if __debug__:
599 _test()
600