Package Bio :: Package GFF :: Module easy
[hide private]
[frames] | no frames]

Source Code for Module Bio.GFF.easy

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """ 
  9  Bio.GFF.easy: some functions to ease the use of Biopython 
 10  """ 
 11   
 12  from __future__ import generators # requires Python 2.2 
 13   
 14  __version__ = "$Revision: 1.8 $" 
 15  # $Source: /home/repository/biopython/biopython/Bio/GFF/easy.py,v $ 
 16   
 17  import copy 
 18  import re 
 19  import string 
 20  import sys 
 21   
 22  import Bio 
 23  from Bio import GenBank 
 24  from Bio.Data import IUPACData 
 25  from Bio.Seq import Seq 
 26   
 27  #from Bio.SeqIO.FASTA import FastaReader, FastaWriter 
 28  #The whole of Bio.SeqIO.FASTA has been deprecated, so 
 29  #we'll use the new Bio.SeqIO functions instead: 
 30  from Bio import SeqIO 
 31   
 32  from Bio import SeqUtils 
 33   
 34  import GenericTools 
 35   
36 -class FeatureDict(dict):
37 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
38 - def __init__(self, feature_list, default=None):
39 dict.__init__(self) 40 self.default = default 41 key_re = re.compile(r'/(\S+)=') 42 43 for i in feature_list: 44 key = key_re.match(i.key).group(1) 45 val = string.replace(i.value,'"','') 46 self[key] = val
47 - def __getitem__(self, key):
48 try: 49 return dict.__getitem__(self, key) 50 except KeyError: 51 return self.default
52
53 -class Location(GenericTools.VerboseList):
54 """ 55 this is really best interfaced through LocationFromString 56 fuzzy: < or > 57 join: {0 = no join, 1 = join, 2 = order} 58 59 >>> location = Location([Location([339]), Location([564])]) # zero-based 60 >>> location 61 Location(Location(339), Location(564)) 62 >>> print location # one-based 63 340..565 64 >>> print location.five_prime() 65 340 66 >>> location_rev = Location([Location([339]), Location([564])], 1) 67 >>> print location_rev 68 complement(340..565) 69 >>> print location_rev.five_prime() 70 565 71 """
72 - def __init__(self, the_list, complement=0, seqname=None):
73 self.complement = complement 74 self.join = 0 75 self.fuzzy = None 76 self.seqname = seqname 77 list.__init__(self, the_list)
78
79 - def _joinstr(self):
80 if self.join == 1: 81 label = 'join' 82 elif self.join == 2: 83 label = 'order' 84 return "%s(%s)" % (label, ",".join(map(str, self)))
85
86 - def __str__(self):
87 if self.seqname: 88 format = "%s:%%s" % self.seqname 89 else: 90 format = "%s" 91 92 if self.complement: 93 format = format % "complement(%s)" 94 95 if self.join: 96 return format % self._joinstr() 97 98 elif isinstance(self[0], list): 99 return format % "%s..%s" % (str(self[0]), str(self[1])) 100 else: 101 if self.fuzzy: 102 format = format % self.fuzzy + "%s" 103 return format % str(self[0] + 1)
104
105 - def __repr__(self):
106 return "Location(%s)" % ", ".join(map(repr, self))
107 108 direction2index = {1: 0, -1: -1}
109 - def direction_and_index(self, direction):
110 """ 111 1: 5' 112 -1: 3' 113 114 >>> loc1 = LocationFromString("join(1..3,complement(5..6))") 115 >>> loc1.direction_and_index(1) 116 (1, 0) 117 >>> loc1.direction_and_index(-1) 118 (-1, -1) 119 >>> loc1.reverse() 120 >>> print loc1 121 complement(join(1..3,complement(5..6))) 122 >>> loc1.direction_and_index(1) 123 (-1, -1) 124 """ 125 if self.complement: 126 direction = direction * -1 127 index = self.direction2index[direction] 128 return direction, index
129
130 - def findside(self, direction):
131 """ 132 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))") 133 >>> loc.findside(1) 134 Location(5) 135 >>> loc.findside(-1) 136 Location(0) 137 """ 138 direction, index = self.direction_and_index(direction) 139 if self.join or isinstance(self[0], list): 140 return self[index].findside(direction) 141 else: 142 return self
143
144 - def findseqname_3prime(self):
145 """ 146 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 147 >>> loc.findseqname_3prime() 148 'MOOCOW' 149 """ 150 return self.findseqname(-1)
151
152 - def findseqname(self, direction=1): # find 5' seqname
153 """ 154 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 155 >>> loc.findseqname() 156 'SEQ' 157 >>> loc.findseqname(-1) 158 'MOOCOW' 159 """ 160 direction, index = self.direction_and_index(direction) 161 if self.seqname: 162 return self.seqname 163 elif self.join: 164 return self[index].findseqname(direction) 165 else: 166 raise AttributeError, 'no sequence name'
167
168 - def five_prime(self):
169 return self.findside(1)
170 - def three_prime(self):
171 return self.findside(-1)
172
173 - def length(self):
174 """ 175 WARNING: doesn't deal with joins!!!! 176 """ 177 return self.end()-self.start()
178
179 - def intersection(self, other):
180 """ 181 WARNING: doesn't deal with joins!!!! 182 183 >>> location1 = LocationFromString("1..50") 184 >>> location2 = LocationFromString("25..200") 185 >>> print location1.intersection(location2) 186 25..50 187 >>> print location1.intersection(location2) 188 25..50 189 """ 190 if self.start() >= other.start(): 191 start = self.start() 192 else: 193 start = other.start() 194 if self.end() <= other.end(): 195 end = self.end() 196 else: 197 end = other.end() 198 return Location([Location([start]), Location([end])])
199
200 - def start(self):
201 # zero-based 202 if self.complement: 203 return self.three_prime()[0] 204 else: 205 return self.five_prime()[0]
206
207 - def end(self):
208 # zero-based 209 if self.complement: 210 return self.five_prime()[0] 211 else: 212 return self.three_prime()[0]
213
214 - def three_prime_range(self, window):
215 three_prime_loc = self.three_prime() 216 if self.complement: 217 return Location([three_prime_loc-window, three_prime_loc], complement=1) 218 else: 219 return Location([three_prime_loc, three_prime_loc+window])
220
221 - def sublocation(self, sub_location):
222 """ 223 >>> fwd_location = LocationFromString('X:5830132..5831528') 224 >>> print fwd_location.sublocation(LocationFromString('1..101')) 225 X:5830132..5830232 226 >>> print fwd_location.sublocation(LocationFromString('1267..1286')) 227 X:5831398..5831417 228 >>> rev_location = LocationFromString('I:complement(8415686..8416216)') 229 >>> print rev_location.sublocation(LocationFromString('1..101')) 230 I:complement(8416116..8416216) 231 >>> print rev_location.sublocation(LocationFromString('100..200')) 232 I:complement(8416017..8416117) 233 """ 234 235 absolute_location = copy.deepcopy(self) 236 for i in xrange(2): 237 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement) 238 if absolute_location.complement: 239 list.reverse(absolute_location) 240 return absolute_location
241
242 - def __add__(self, addend):
243 return self.add(addend)
244
245 - def add(self, addend, complement=0):
246 self_copy = copy.deepcopy(self) 247 if isinstance(addend, Location): 248 addend = addend[0] 249 if complement: 250 addend *= -1 251 self_copy[0] += addend 252 return self_copy
253
254 - def __sub__(self, subtrahend):
255 return self + -subtrahend
256
257 - def reverse(self):
258 self.complement = [1, 0][self.complement]
259
260 - def reorient(self):
261 """ 262 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))") 263 >>> loc1.reorient() 264 >>> print loc1 265 complement(join(I:1..9000,I:9001..10000)) 266 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)") 267 >>> loc2.reorient() 268 >>> print loc2 269 join(I:complement(1..9000),I:9001..10000) 270 """ 271 if self.join: 272 if len([x for x in self if x.complement]) == len(self): 273 self.reverse() 274 for segment in self: 275 segment.reverse()
276
277 - def bounding(self):
278 """ 279 works for single level non-complex joins 280 281 >>> LOC = LocationFromString 282 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)") 283 >>> print l1.bounding() 284 join(alpha:1..70) 285 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))") 286 >>> print l2.bounding() 287 join(alpha:1..30,alpha:complement(50..70)) 288 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)") 289 >>> print l3.bounding() 290 join(alpha:1..45,alpha:complement(50..70),beta:6..20) 291 292 """ 293 if not self.join: 294 return 295 296 seqdict = {} 297 seqkeys = [] 298 for subloc in self: 299 assert subloc.seqname 300 assert not subloc.join 301 try: 302 seqdict[_hashname(subloc)].append(subloc) 303 except KeyError: 304 key = _hashname(subloc) 305 seqdict[key] = [subloc] 306 seqkeys.append(key) 307 308 res = LocationJoin() 309 for key in seqkeys: 310 locations = seqdict[key] 311 coords = [] 312 for subloc in locations: 313 coords.append(subloc.start()) 314 coords.append(subloc.end()) 315 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname)) 316 return res
317
318 -def _hashname(location):
319 return str(location.complement) + location.seqname
320
321 -class LocationJoin(Location):
322 """ 323 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")]) 324 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))") 325 >>> join.append(appendloc) 326 >>> print join 327 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55))) 328 >>> join2 = LocationJoin() 329 >>> join2.append(LocationFromString("complement(66..99)")) 330 >>> join2.append(LocationFromString("complement(5..55)")) 331 >>> print join2 332 join(complement(66..99),complement(5..55)) 333 """
334 - def __init__(self, the_list = [], complement=0, seqname=None):
335 self.complement = complement 336 self.join = 1 337 self.fuzzy = None 338 self.seqname = seqname 339 list.__init__(self, the_list)
340
341 -class LocationFromCoords(Location):
342 """ 343 >>> print LocationFromCoords(339, 564) 344 340..565 345 >>> print LocationFromCoords(339, 564, seqname="I") 346 I:340..565 347 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434") 348 NC_343434:complement(1000..3235) 349 """
350 - def __init__(self, start, end, strand=0, seqname=None):
351 if strand == "+": 352 strand = 0 353 elif strand == "-": 354 strand = 1 355 Location.__init__(self, [Location([start]), Location([end])], strand, seqname)
356 357 # see http://www.ncbi.nlm.nih.gov/collab/FT/index.html#backus-naur 358 # for how this should actually be implemented 359 re_complement = re.compile(r"^complement\((.*)\)$") 360 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$") # not every character is allowed by spec 361 re_join = re.compile(r"^(join|order)\((.*)\)$") 362 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$") 363 re_fuzzy = re.compile(r"^([><])(\d+)")
364 -class LocationFromString(Location):
365 """ 366 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location 367 >>> print LocationFromString("467") 368 467 369 >>> print LocationFromString("340..565") 370 340..565 371 >>> print LocationFromString("<345..500") 372 <345..500 373 >>> print LocationFromString("<1..888") 374 <1..888 375 >>> # (102.110) and 123^124 syntax unimplemented 376 >>> print LocationFromString("join(12..78,134..202)") 377 join(12..78,134..202) 378 >>> print LocationFromString("complement(join(2691..4571,4918..5163))") 379 complement(join(2691..4571,4918..5163)) 380 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))") 381 join(complement(4918..5163),complement(2691..4571)) 382 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))") 383 order(complement(4918..5163),complement(2691..4571)) 384 >>> print LocationFromString("NC_001802x.fna:73..78") 385 NC_001802x.fna:73..78 386 >>> print LocationFromString("J00194:100..202") 387 J00194:100..202 388 389 >>> print LocationFromString("join(117505..118584,1..609)") 390 join(117505..118584,1..609) 391 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))") 392 join(test3:complement(4..6),test3:complement(1..3)) 393 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)") 394 test3:join(test1:complement(1..3),4..6) 395 """
396 - def __init__(self, location_str):
397 match_seqname = re_seqname.match(location_str) 398 if match_seqname: 399 self.seqname = match_seqname.group(1) 400 location_str = match_seqname.group(2) 401 else: 402 self.seqname = None 403 match_complement = re_complement.match(location_str) 404 if match_complement: 405 self.complement = 1 406 location_str = match_complement.group(1) 407 else: 408 self.complement = 0 409 match_join = re_join.match(location_str) 410 if match_join: 411 self.join = {'join':1, 'order':2}[match_join.group(1)] 412 list.__init__(self, map(lambda x: LocationFromString(x), match_join.group(2).split(","))) 413 else: 414 self.join = 0 415 match_dotdot = re_dotdot.match(location_str) 416 if match_dotdot: 417 list.__init__(self, map(lambda x: LocationFromString(match_dotdot.group(x)), (1, 2))) 418 else: 419 match_fuzzy = re_fuzzy.match(location_str) 420 if match_fuzzy: 421 self.fuzzy = match_fuzzy.group(1) 422 location_str = match_fuzzy.group(2) 423 else: 424 self.fuzzy = None 425 426 list.__init__(self, [int(location_str)-1]) # zero based, nip it in the bud
427
428 -def open_file(filename):
429 if filename: 430 return open(filename) 431 else: 432 return sys.stdin
433
434 -def fasta_single(filename=None, string=None):
435 """ 436 >>> record = fasta_single(string=\""" 437 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1] 438 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT 439 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG 440 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA 441 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT 442 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC 443 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG 444 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR 445 ... SLFGNDPSSQ 446 ... \""") 447 >>> record.id 448 'gi|9629360|ref|NP_057850.1|' 449 >>> record.description 450 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]' 451 >>> record.seq[0:5] 452 Seq('MGARA', SingleLetterAlphabet()) 453 """ 454 #Returns the first record in a fasta file as a SeqRecord, 455 #or None if there are no records in the file. 456 if string: 457 import cStringIO 458 handle = cStringIO.StringIO(string) 459 else : 460 handle = open_file(filename) 461 try : 462 record = SeqIO.parse(handle, format="fasta").next() 463 except StopIteration : 464 record = None 465 return record
466
467 -def fasta_multi(filename=None):
468 #Simple way is just: 469 #return SeqIO.parse(open_file(filename), format="fasta") 470 #However, for backwards compatibility make sure we raise 471 #the StopIteration exception rather than returning None. 472 reader = SeqIO.parse(open_file(filename), format="fasta") 473 while True: 474 record = reader.next() 475 if record is None: 476 raise StopIteration 477 else: 478 yield record
479
480 -def fasta_readrecords(filename=None):
481 """ 482 >>> records = fasta_readrecords('GFF/multi.fna') 483 >>> records[0].id 484 'test1' 485 >>> records[2].seq 486 Seq('AAACACAC', SingleLetterAlphabet()) 487 """ 488 return list(SeqIO.parse(open_file(filename), format="fasta"))
489
490 -def fasta_write(filename, records):
491 handle = open(filename, "w") 492 SeqIO.write(records, handle, format="fasta") 493 handle.close()
494
495 -def genbank_single(filename):
496 """ 497 >>> record = genbank_single("GFF/NC_001422.gbk") 498 >>> record.taxonomy 499 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus'] 500 >>> cds = record.features[-4] 501 >>> cds.key 502 'CDS' 503 >>> location = LocationFromString(cds.location) 504 >>> print location 505 2931..3917 506 >>> subseq = record_subseq(record, location) 507 >>> subseq[0:20] 508 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet()) 509 """ 510 return GenBank.RecordParser().parse(open(filename))
511
512 -def record_subseq(record, location, *args, **keywds):
513 """ 514 >>> from Bio.SeqRecord import SeqRecord 515 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 516 ... "ref|NC_001422", 517 ... "Coliphage phiX174, complete genome", 518 ... "bases 1-11") 519 >>> record_subseq(record, LocationFromString("1..4")) # one-based 520 Seq('GAGT', Alphabet()) 521 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based 522 Seq('ACTC', Alphabet()) 523 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea! 524 Seq('ACTCGAGT', Alphabet()) 525 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))") 526 >>> print loc 527 complement(join(complement(5..7),1..4)) 528 >>> record_subseq(record, loc) 529 Seq('ACTCTTT', Alphabet()) 530 >>> print loc 531 complement(join(complement(5..7),1..4)) 532 >>> loc.reverse() 533 >>> record_subseq(record, loc) 534 Seq('AAAGAGT', Alphabet()) 535 >>> record_subseq(record, loc, upper=1) 536 Seq('AAAGAGT', Alphabet()) 537 """ 538 if location.join: 539 subsequence_list = [] 540 if location.complement: 541 location_copy = copy.copy(location) 542 list.reverse(location_copy) 543 else: 544 location_copy = location 545 for sublocation in location_copy: 546 if location.complement: 547 sublocation_copy = copy.copy(sublocation) 548 sublocation_copy.reverse() 549 else: 550 sublocation_copy = sublocation 551 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring()) 552 return Seq(''.join(subsequence_list), record_sequence(record).alphabet) 553 else: 554 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
555
556 -def record_sequence(record):
557 """ 558 returns the sequence of a record 559 560 can be Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record 561 """ 562 if isinstance(record, Bio.SeqRecord.SeqRecord): 563 return record.seq 564 elif isinstance(record, Bio.GenBank.Record.Record): 565 return Seq(record.sequence) 566 else: 567 raise TypeError, 'not Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record'
568
569 -def record_coords(record, start, end, strand=0, upper=0):
570 """ 571 >>> from Bio.SeqRecord import SeqRecord 572 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 573 ... "ref|NC_001422", 574 ... "Coliphage phiX174, complete genome", 575 ... "bases 1-11") 576 >>> record_coords(record, 0, 4) # zero-based 577 Seq('GAGT', Alphabet()) 578 >>> record_coords(record, 0, 4, "-") # zero-based 579 Seq('ACTC', Alphabet()) 580 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based 581 Seq('ACTC', Alphabet()) 582 """ 583 584 subseq = record_sequence(record)[start:end] 585 subseq_str = subseq.tostring() 586 subseq_str = subseq_str.upper() 587 subseq = Seq(subseq_str, subseq.alphabet) 588 if strand == '-' or strand == 1: 589 return subseq.reverse_complement() 590 else: 591 return subseq
592
593 -def _test(*args, **keywds):
594 import doctest, sys 595 doctest.testmod(sys.modules[__name__], *args, **keywds)
596 597 if __name__ == "__main__": 598 if __debug__: 599 _test() 600