Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Classes: 
  10  Iterator              Iterate through a file of GenBank entries 
  11  Dictionary            Access a GenBank file using a dictionary interface. 
  12  ErrorFeatureParser    Catch errors caused during parsing. 
  13  FeatureParser         Parse GenBank data into Seq and SeqFeature objects. 
  14  RecordParser          Parse GenBank data into a Record object. 
  15  NCBIDictionary        Access GenBank using a dictionary interface. 
  16   
  17  _BaseGenBankConsumer  A base class for GenBank consumer that implements 
  18                        some helpful functions that are in common between 
  19                        consumers. 
  20  _FeatureConsumer      Create SeqFeature objects from info generated by 
  21                        the Scanner 
  22  _RecordConsumer       Create a GenBank record object from Scanner info. 
  23  _PrintingConsumer     A debugging consumer. 
  24   
  25  ParserFailureError    Exception indicating a failure in the parser (ie. 
  26                        scanner or consumer) 
  27  LocationParserError   Exception indicating a problem with the spark based 
  28                        location parser. 
  29   
  30  Functions: 
  31  index_file            Get a GenBank file ready to be used as a Dictionary. 
  32  search_for            Do a query against GenBank. 
  33  download_many         Download many GenBank records. 
  34   
  35  """ 
  36  import cStringIO 
  37   
  38  # other Biopython stuff 
  39  from Bio.ParserSupport import AbstractConsumer 
  40  from utils import FeatureValueCleaner 
  41   
  42  #There used to be a (GenBank only) class _Scanner in 
  43  #this file.  Now use a more generic system which we import: 
  44  from Scanner import GenBankScanner 
  45   
  46  #These are used by the index_file function and Dictionary class: 
  47  #Moved the import inside the deprecated functions 
  48  #from Bio import Mindy 
  49  #from Bio.Mindy import SimpleSeqRecord 
  50   
  51  #These are used for downloading files from GenBank 
  52  from Bio import EUtils 
  53  from Bio.EUtils import DBIds, DBIdsClient 
  54   
  55  #Constants used to parse GenBank header lines 
  56  GENBANK_INDENT = 12 
  57  GENBANK_SPACER = " " * GENBANK_INDENT 
  58   
  59  #Constants for parsing GenBank feature lines 
  60  FEATURE_KEY_INDENT = 5 
  61  FEATURE_QUALIFIER_INDENT = 21 
  62  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  63  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  64           
class Dictionary:
    """Access a GenBank file using a dictionary-like interface (DEPRECATED).
    """
    def __init__(self, indexname, parser = None):
        """Initialize and open up a GenBank dictionary. DEPRECATED

        Each entry is a full GenBank record (i.e. from the LOCUS line
        to the // at the end of the sequence).

        Most GenBank files have only one such "entry", in which case
        using this dictionary class is rather unnecessary.

        Arguments:
        o indexname - The name of the index for the file. This should have been
        created using the index_file function.
        o parser - An optional argument specifying a parser object that
        the records should be run through before returning the output. If
        parser is None then the unprocessed contents of the file will be
        returned.
        """
        import warnings
        # Fixed wording: the old message read "index_file Bio.GenBank.Dictionary"
        # with the "and" missing.
        warnings.warn("Bio.GenBank.index_file and Bio.GenBank.Dictionary are deprecated." \
                      + " We hope an in memory dictionary, for example using the" \
                      + " Bio.SeqIO.to_dict() function, will be suitable for" \
                      + " most users. Please get in touch on the mailing lists if" \
                      + " this (or its removal) causes any problems for you.",
                      DeprecationWarning)

        from Bio import Mindy
        self._index = Mindy.open(indexname)
        self._parser = parser

    def __len__(self):
        """Return the number of records indexed."""
        return len(self.keys())

    def __getitem__(self, key):
        """Return the record for key, parsed if a parser was supplied.

        Raises KeyError if the key matches no record, or more than one.
        """
        # first try to retrieve by the base id
        try:
            seqs = self._index.lookup(id = key)
        # if we can't do that, we have to try and fetch by alias
        except KeyError:
            seqs = self._index.lookup(aliases = key)

        if len(seqs) == 1:
            seq = seqs[0]
        elif seqs:
            raise KeyError("Multiple sequences found for %s" % key)
        else:
            # Bug fix: an empty result used to be misreported as
            # "Multiple sequences found".  Still a KeyError for callers.
            raise KeyError("No sequences found for %s" % key)

        if self._parser:
            handle = cStringIO.StringIO(seq.text)
            return self._parser.parse(handle)
        else:
            return seq.text

    def keys(self):
        """Return the primary identifiers of all indexed records."""
        primary_key_retriever = self._index['id']
        return primary_key_retriever.keys()
class Iterator:
    """Step through a file of GenBank entries one record at a time.
    """
    def __init__(self, handle, parser = None):
        """Create the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            # Let the parser pull a whole record off the handle itself.
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser: hand back the raw record text, accumulated up to
        # (and including) the terminating "//" line.
        record_lines = []
        while True:
            current = self.handle.readline()
            if not current:
                # Premature end of file?
                return None
            record_lines.append(current)
            if current.rstrip() == "//":
                return "".join(record_lines)

    def __iter__(self):
        # Keep calling next() until it signals exhaustion with None.
        return iter(self.next, None)
class ParserFailureError(Exception):
    """Exception raised when the parser (i.e. scanner or consumer) fails.
    """
    pass
class LocationParserError(Exception):
    """Exception raised when a location from a GenBank file cannot be parsed.
    """
    pass
class FeatureParser:
    """Parse GenBank files into Seq + Feature objects.
    """
    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function
        clean_value. GenBank.utils has a "standard" cleaner class, which
        is used by default.

        NOTE(review): the default cleaner instance is created once at class
        definition time and shared by every parser; FeatureValueCleaner
        appears stateless, so this looks harmless -- confirm before changing.
        """
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle, returning a SeqRecord."""
        feature_consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
        self._consumer = feature_consumer
        self._scanner.feed(handle, feature_consumer)
        return feature_consumer.data
class RecordParser:
    """Parse GenBank files into Record objects.
    """
    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        record_consumer = _RecordConsumer()
        self._consumer = record_consumer
        self._scanner.feed(handle, record_consumer)
        return record_consumer.data
class _BaseGenBankConsumer(AbstractConsumer):
    """Abstract GenBank consumer providing useful general functions.

    This just helps to eliminate some duplication in things that most
    GenBank consumers want to do.
    """
    # Special keys in GenBank records that we should remove spaces from.
    # For instance, \translation keys have values which are proteins and
    # should have spaces and newlines removed from them. This class
    # attribute gives us more control over specific formatting problems.
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    def _split_keywords(self, keyword_string):
        """Split a semi-colon separated keyword string into a clean list."""
        # NOTE: an empty or "." keyword field still yields [''] (splitting
        # the empty string), matching the historical behaviour.
        if keyword_string == "" or keyword_string == ".":
            without_dot = ""
        elif keyword_string.endswith("."):
            # drop the trailing full stop
            without_dot = keyword_string[:-1]
        else:
            without_dot = keyword_string
        return [part.strip() for part in without_dot.split(";")]

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list."""
        # Newlines become spaces; EMBL-style ';' separators become spaces too.
        normalized = accession_string.replace("\n", " ").replace(";", " ")
        return [acc.strip() for acc in normalized.split(" ")]

    def _split_taxonomy(self, taxonomy_string):
        """Split a semi-colon separated taxonomy string into a list."""
        if not taxonomy_string or taxonomy_string == ".":
            # Missing data, no taxonomy
            return []

        if taxonomy_string.endswith("."):
            tax_text = taxonomy_string[:-1]
        else:
            tax_text = taxonomy_string

        # Break on ';', then further split any entries spanning newlines.
        raw_items = []
        for piece in tax_text.split(";"):
            raw_items.extend(piece.split("\n"))
        # Drop empty entries left over from the newline splitting,
        # then strip surrounding whitespace from what remains.
        return [item.strip() for item in raw_items if item != ""]

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.
        """
        # str.split() with no argument splits on (and discards) every run
        # of whitespace, so re-joining strips all whitespace characters.
        return ''.join(location_string.split())

    def _remove_newlines(self, text):
        """Remove any newlines in the passed text, returning the new string."""
        return text.replace("\n", "").replace("\r", "")

    def _normalize_spaces(self, text):
        """Replace runs of multiple spaces in the passed text with single spaces."""
        # Note: only the space character is collapsed, not tabs or newlines.
        return ' '.join([part for part in text.split(' ') if part])

    def _remove_spaces(self, text):
        """Remove all spaces from the passed text."""
        return text.replace(" ", "")

    def _convert_to_python_numbers(self, start, end):
        """Convert a one-based inclusive (start, end) range to python notation.

        In GenBank, starts and ends are defined in "biological" coordinates,
        where 1 is the first base and [i, j] means to include both i and j.

        In python, 0 is the first base and [i, j] means to include i, but
        not j.

        So, to convert "biological" to python coordinates, we just subtract
        one from the start and leave the end alone.
        """
        return start - 1, end
335 -class _FeatureConsumer(_BaseGenBankConsumer):
336 """Create a SeqRecord object with Features to return. 337 338 Attributes: 339 o use_fuzziness - specify whether or not to parse with fuzziness in 340 feature locations. 341 o feature_cleaner - a class that will be used to provide specialized 342 cleaning-up of feature values. 343 """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        """Set up an empty SeqRecord and the parsing state.

        Arguments:
        o use_fuzziness - specify whether or not to parse with fuzziness in
        feature locations.
        o feature_cleaner - an optional object with a clean_value(key, value)
        method used to tidy up feature qualifier values (None disables
        cleaning).
        """
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id = None)
        # NOTE(review): SeqRecord(None, id=None) presumably already leaves id
        # as None, making this assignment redundant -- confirm before removing.
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        # Parsing state, filled in as the scanner feeds us events:
        self._seq_type = ''             # residue type from the LOCUS line
        self._seq_data = []             # chunks of raw sequence text
        self._current_ref = None        # Reference currently being built
        self._cur_feature = None        # SeqFeature currently being built
        self._cur_qualifier_key = None
        self._cur_qualifier_value = None
    def locus(self, locus_name):
        """Use the locus name as the name of the SeqRecord."""
        self.data.name = locus_name

    def size(self, content):
        # The sequence length from the LOCUS line is not recorded.
        pass

    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet."""
        self._seq_type = type

    def data_file_division(self, division):
        # GenBank division code (e.g. PLN, PRI) from the LOCUS line.
        self.data.annotations['data_file_division'] = division

    def date(self, submit_date):
        # Modification date from the LOCUS line, kept as a string.
        self.data.annotations['date'] = submit_date
380 - def definition(self, definition):
381 """Set the definition as the description of the sequence. 382 """ 383 if self.data.description : 384 #Append to any existing description 385 #e.g. EMBL files with two DE lines. 386 self.data.description += " " + definition 387 else : 388 self.data.description = definition
389
    def accession(self, acc_num):
        """Record the accession number(s) from an ACCESSION line.

        If we have multiple accession numbers, the first one passed is
        used as the record id.
        """
        new_acc_nums = self._split_accessions(acc_num)

        # Also record them ALL in the annotations
        try :
            # On the off chance there was more than one accession line:
            self.data.annotations['accessions'].extend(new_acc_nums)
        except KeyError :
            self.data.annotations['accessions'] = new_acc_nums

        # if we haven't set the id information yet, add the first acc num
        if self.data.id is None:
            if len(new_acc_nums) > 0:
                # Use the FIRST accession as the ID, not the first on this line!
                self.data.id = self.data.annotations['accessions'][0]
    def nid(self, content):
        # Obsolete nucleotide identifier (NID line).
        self.data.annotations['nid'] = content

    def pid(self, content):
        # Obsolete protein identifier (PID line).
        self.data.annotations['pid'] = content
419 - def version(self, version_id):
420 #Want to use the versioned accension as the record.id 421 #This comes from the VERSION line in GenBank files, or the 422 #obsolete SV line in EMBL. For the new EMBL files we need 423 #both the version suffix from the ID line and the accession 424 #from the AC line. 425 if version_id.count(".")==1 and version_id.split(".")[1].isdigit() : 426 self.accession(version_id.split(".")[0]) 427 self.version_suffix(version_id.split(".")[1]) 428 else : 429 #For backwards compatibility... 430 self.data.id = version_id
431
    def version_suffix(self, version):
        """Record the numeric sequence version (e.g. the "1" of "U49845.1").

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.

        e.g. GenBank line:
        VERSION     U49845.1  GI:1293613
        or the obsolete EMBL line:
        SV   U49845.1
        Scanner calls consumer.version("U49845.1")
        which then calls consumer.version_suffix(1)

        e.g. EMBL new line:
        ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        Scanner calls consumer.version_suffix(1)
        """
        # NOTE(review): assert is stripped under python -O; raising
        # ValueError might be more robust -- confirm no caller relies
        # on AssertionError before changing.
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
    def db_source(self, content):
        # DBSOURCE line, with trailing whitespace removed.
        self.data.annotations['db_source'] = content.rstrip()

    def gi(self, content):
        # GI number from the VERSION line.
        self.data.annotations['gi'] = content

    def keywords(self, content):
        # KEYWORDS line, split into a list.
        self.data.annotations['keywords'] = self._split_keywords(content)

    def segment(self, content):
        # SEGMENT line (e.g. "1 of 6"), kept as a string.
        self.data.annotations['segment'] = content
464 - def source(self, content):
465 if content[-1] == '.': 466 source_info = content[:-1] 467 else: 468 source_info = content 469 self.data.annotations['source'] = source_info
470
    def organism(self, content):
        # ORGANISM line, kept as a string.
        self.data.annotations['organism'] = content

    def taxonomy(self, content):
        """Record the taxonomic lineage, split into a list."""
        self.data.annotations['taxonomy'] = self._split_taxonomy(content)
    def reference_num(self, content):
        """Signal the beginning of a new reference object."""
        from Bio.SeqFeature import Reference
        # if we have a current reference that hasn't been added to
        # the list of references, add it.
        if self._current_ref is not None:
            self.data.annotations['references'].append(self._current_ref)
        else:
            # First reference seen for this record: start the list.
            self.data.annotations['references'] = []

        self._current_ref = Reference()
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if ref_base_info.find('bases') != -1 and \
            ref_base_info.find('to') != -1:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif (ref_base_info.find("residues") >= 0 and
              ref_base_info.find("to") >= 0):
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._current_ref.location = all_locations
531 - def _split_reference_locations(self, location_string):
532 """Get reference locations out of a string of reference information 533 534 The passed string should be of the form: 535 536 1 to 20; 20 to 100 537 538 This splits the information out and returns a list of location objects 539 based on the reference locations. 540 """ 541 from Bio import SeqFeature 542 # split possibly multiple locations using the ';' 543 all_base_info = location_string.split(';') 544 545 new_locations = [] 546 for base_info in all_base_info: 547 start, end = base_info.split('to') 548 new_start, new_end = \ 549 self._convert_to_python_numbers(int(start.strip()), 550 int(end.strip())) 551 this_location = SeqFeature.FeatureLocation(new_start, new_end) 552 new_locations.append(this_location) 553 return new_locations
554
    def authors(self, content):
        # AUTHORS line for the reference being built.
        self._current_ref.authors = content

    def consrtm(self, content):
        # CONSRTM (consortium) line for the reference being built.
        self._current_ref.consrtm = content

    def title(self, content):
        # TITLE line for the reference being built.
        self._current_ref.title = content

    def journal(self, content):
        # JOURNAL line for the reference being built.
        self._current_ref.journal = content

    def medline_id(self, content):
        # MEDLINE identifier for the reference being built.
        self._current_ref.medline_id = content

    def pubmed_id(self, content):
        # PUBMED identifier for the reference being built.
        self._current_ref.pubmed_id = content

    def remark(self, content):
        # REMARK text is stored as the reference's comment.
        self._current_ref.comment = content
    def comment(self, content):
        # content is a list of lines; join them with newlines, appending
        # to any existing comment annotation (e.g. multiple COMMENT blocks).
        try :
            self.data.annotations['comment'] += "\n" + "\n".join(content)
        except KeyError :
            self.data.annotations['comment'] = "\n".join(content)
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line."""
        self.start_feature_table()

    def start_feature_table(self):
        """Indicate we've got to the start of the feature table."""
        # make sure we've added on our last reference object
        if self._current_ref is not None:
            self.data.annotations['references'].append(self._current_ref)
            self._current_ref = None
    def _add_feature(self):
        """Utility function to add a feature to the SeqRecord.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            self._add_qualifier()

            self._cur_qualifier_key = ''
            self._cur_qualifier_value = ''
            self.data.features.append(self._cur_feature)
    def feature_key(self, content):
        """Start a new SeqFeature of the given type (e.g. "CDS", "gene")."""
        from Bio import SeqFeature
        # if we already have a feature, add it on
        self._add_feature()

        # start a new feature
        self._cur_feature = SeqFeature.SeqFeature()
        self._cur_feature.type = content

        # assume positive strand to start with if we have DNA or cDNA
        # (labelled as mRNA). The complement in the location will
        # change this later if something is on the reverse strand
        if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0:
            self._cur_feature.strand = 1
    def location(self, content):
        """Parse out location information from the location string.

        This uses Andrew's nice spark based parser to do the parsing
        for us, and translates the results of the parse into appropriate
        Location objects.
        """
        from Bio.GenBank import LocationParser
        # --- first preprocess the location for the spark parser

        # we need to clean up newlines and other whitespace inside
        # the location before feeding it to the parser.
        # locations should have no whitespace whatsoever based on the
        # grammar
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if location_line.find('replace') != -1:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        # feed everything into the scanner and parser
        try:
            parse_info = \
                LocationParser.parse(LocationParser.scan(location_line))
        # spark raises SystemExit errors when parsing fails
        except SystemExit:
            raise LocationParserError(location_line)

        # add the parser information to the current feature
        self._set_location_info(parse_info, self._cur_feature)
    def _set_function(self, function, cur_feature):
        """Set the location information based on a function.

        This handles all of the location functions like 'join', 'complement'
        and 'order'.

        Arguments:
        o function - A LocationParser.Function object specifying the
        function we are acting on.
        o cur_feature - The feature to add information to.
        """
        from Bio import SeqFeature
        from Bio.GenBank import LocationParser
        assert isinstance(function, LocationParser.Function), \
            "Expected a Function object, got %s" % function

        if function.name == "complement":
            # mark the current feature as being on the opposite strand
            cur_feature.strand = -1
            # recursively deal with whatever is left inside the complement
            for inner_info in function.args:
                self._set_location_info(inner_info, cur_feature)
        # deal with functions that have multiple internal segments that
        # are connected somehow.
        # join and order are currently documented functions.
        # one-of is something I ran across in old files. Treating it
        # as a sub sequence feature seems appropriate to me.
        # bond is some piece of junk I found in RefSeq files. I have
        # no idea how to interpret it, so I jam it in here
        elif (function.name == "join" or function.name == "order" or
              function.name == "one-of" or function.name == "bond"):
            self._set_ordering_info(function, cur_feature)
        elif (function.name == "gap"):
            assert len(function.args) == 1, \
                "Unexpected number of arguments in gap %s" % function.args
            # make the cur information location a gap object
            position = self._get_position(function.args[0].local_location)
            cur_feature.location = SeqFeature.PositionGap(position)
        else:
            raise ValueError("Unexpected function name: %s" % function.name)
    def _set_ordering_info(self, function, cur_feature):
        """Parse a join or order and all of the information in it.

        This deals with functions that order a bunch of locations,
        specifically 'join' and 'order'. The inner locations are
        added as subfeatures of the top level feature
        """
        from Bio import SeqFeature
        # for each inner element, create a sub SeqFeature within the
        # current feature, then get the information for this feature
        for inner_element in function.args:
            new_sub_feature = SeqFeature.SeqFeature()
            # inherit the type from the parent
            new_sub_feature.type = cur_feature.type
            # add the join or order info to the location_operator
            cur_feature.location_operator = function.name
            new_sub_feature.location_operator = function.name
            # inherit references and strand from the parent feature
            new_sub_feature.ref = cur_feature.ref
            new_sub_feature.ref_db = cur_feature.ref_db
            new_sub_feature.strand = cur_feature.strand

            # set the information for the inner element
            self._set_location_info(inner_element, new_sub_feature)

            # now add the feature to the sub_features
            cur_feature.sub_features.append(new_sub_feature)

        # set the location of the top -- this should be a combination of
        # the start position of the first sub_feature and the end position
        # of the last sub_feature

        # these positions are already converted to python coordinates
        # (when the sub_features were added) so they don't need to
        # be converted again
        feature_start = cur_feature.sub_features[0].location.start
        feature_end = cur_feature.sub_features[-1].location.end
        cur_feature.location = SeqFeature.FeatureLocation(feature_start,
                                                          feature_end)
745 - def _set_location_info(self, parse_info, cur_feature):
746 """Set the location information for a feature from the parse info. 747 748 Arguments: 749 o parse_info - The classes generated by the LocationParser. 750 o cur_feature - The feature to add the information to. 751 """ 752 from Bio.GenBank import LocationParser 753 # base case -- we are out of information 754 if parse_info is None: 755 return 756 # parse a location -- this is another base_case -- we assume 757 # we have no information after a single location 758 elif isinstance(parse_info, LocationParser.AbsoluteLocation): 759 self._set_location(parse_info, cur_feature) 760 return 761 # parse any of the functions (join, complement, etc) 762 elif isinstance(parse_info, LocationParser.Function): 763 self._set_function(parse_info, cur_feature) 764 # otherwise we are stuck and should raise an error 765 else: 766 raise ValueError("Could not parse location info: %s" 767 % parse_info)
768
    def _set_location(self, location, cur_feature):
        """Set the location information for a feature.

        Arguments:
        o location - An AbsoluteLocation object specifying the info
        about the location.
        o cur_feature - The feature to add the information to.
        """
        # check to see if we have a cross reference to another accession
        # ie. U05344.1:514..741
        if location.path is not None:
            cur_feature.ref = location.path.accession
            cur_feature.ref_db = location.path.database
        # now get the actual location information
        cur_feature.location = self._get_location(location.local_location)
    def _get_location(self, range_info):
        """Return a (possibly fuzzy) location from a Range object.

        Arguments:
        o range_info - A location range (ie. something like 67..100). This
        may also be a single position (ie 27).

        This returns a FeatureLocation object.
        If parser.use_fuzziness is set at one, the positions for the
        end points will possibly be fuzzy.
        """
        from Bio import SeqFeature
        from Bio.GenBank import LocationParser
        # check if we just have a single base
        if not(isinstance(range_info, LocationParser.Range)):
            pos = self._get_position(range_info)
            # move the single position back one to be consistent with how
            # python indexes numbers (starting at 0)
            pos.position = pos.position - 1
            return SeqFeature.FeatureLocation(pos, pos)
        # otherwise we need to get both sides of the range
        else:
            # get *Position objects for the start and end
            start_pos = self._get_position(range_info.low)
            end_pos = self._get_position(range_info.high)

            start_pos.position, end_pos.position = \
                self._convert_to_python_numbers(start_pos.position,
                                                end_pos.position)

            return SeqFeature.FeatureLocation(start_pos, end_pos)
817 - def _get_position(self, position):
818 """Return a (possibly fuzzy) position for a single coordinate. 819 820 Arguments: 821 o position - This is a LocationParser.* object that specifies 822 a single coordinate. We will examine the object to determine 823 the fuzziness of the position. 824 825 This is used with _get_location to parse out a location of any 826 end_point of arbitrary fuzziness. 827 """ 828 from Bio import SeqFeature 829 from Bio.GenBank import LocationParser 830 # case 1 -- just a normal number 831 if (isinstance(position, LocationParser.Integer)): 832 final_pos = SeqFeature.ExactPosition(position.val) 833 # case 2 -- we've got a > sign 834 elif isinstance(position, LocationParser.LowBound): 835 final_pos = SeqFeature.AfterPosition(position.base.val) 836 # case 3 -- we've got a < sign 837 elif isinstance(position, LocationParser.HighBound): 838 final_pos = SeqFeature.BeforePosition(position.base.val) 839 # case 4 -- we've got 100^101 840 elif isinstance(position, LocationParser.Between): 841 final_pos = SeqFeature.BetweenPosition(position.low.val, 842 position.high.val) 843 # case 5 -- we've got (100.101) 844 elif isinstance(position, LocationParser.TwoBound): 845 final_pos = SeqFeature.WithinPosition(position.low.val, 846 position.high.val) 847 # case 6 -- we've got a one-of(100, 110) location 848 elif isinstance(position, LocationParser.Function) and \ 849 position.name == "one-of": 850 # first convert all of the arguments to positions 851 position_choices = [] 852 for arg in position.args: 853 # we only handle AbsoluteLocations with no path 854 # right now. Not sure if other cases will pop up 855 assert isinstance(arg, LocationParser.AbsoluteLocation), \ 856 "Unhandled Location type %r" % arg 857 assert arg.path is None, "Unhandled path in location" 858 position = self._get_position(arg.local_location) 859 position_choices.append(position) 860 final_pos = SeqFeature.OneOfPosition(position_choices) 861 # if it is none of these cases we've got a problem! 
862 else: 863 raise ValueError("Unexpected LocationParser object %r" % 864 position) 865 866 # if we are using fuzziness return what we've got 867 if self._use_fuzziness: 868 return final_pos 869 # otherwise return an ExactPosition equivalent 870 else: 871 return SeqFeature.ExactPosition(final_pos.location)
872
    def _add_qualifier(self):
        """Add the qualifier being built up to the current feature.

        Multiple values for the same qualifier key are collected in a
        list under that key, so no information is lost when a key
        appears more than once.
        """
        # if we've got a key from before, add it to the dictionary of
        # qualifiers
        if self._cur_qualifier_key:
            key = self._cur_qualifier_key
            value = "".join(self._cur_qualifier_value)
            if self._feature_cleaner is not None:
                value = self._feature_cleaner.clean_value(key, value)
            # if the qualifier name exists, append the value
            if key in self._cur_feature.qualifiers:
                self._cur_feature.qualifiers[key].append(value)
            # otherwise start a new list of the key with its values
            else:
                self._cur_feature.qualifiers[key] = [value]
def feature_qualifier_name(self, content_list):
    """Start a new qualifier for every key in content_list.

    Several keys can arrive at once because valueless keys such as
    /pseudo are only separated from the following key in the file.
    Any qualifier collected so far is flushed before each new key.
    """
    for raw_key in content_list:
        # flush whatever qualifier was being accumulated before this one
        self._add_qualifier()

        # drop the leading / and trailing = plus surrounding whitespace
        cleaned_key = raw_key.replace('/', '').replace('=', '').strip()

        self._cur_qualifier_key = cleaned_key
        self._cur_qualifier_value = []
912
def feature_qualifier_description(self, content):
    """Collect one fragment of a qualifier value.

    Surrounding double quotes are removed before the fragment is
    stashed; the fragments are joined later by _add_qualifier.
    """
    self._cur_qualifier_value.append(content.replace('"', ''))
918
def contig_location(self, content):
    """Store CONTIG location information as a 'contig' annotation.

    A temporary SeqFeature is built so the standard location handler
    can parse the position; the feature is then moved onto the
    record's annotations instead of the feature table.
    """
    from Bio import SeqFeature
    # flush any feature still under construction so we don't clobber it
    self._add_feature()

    contig_feature = SeqFeature.SeqFeature()
    contig_feature.type = "contig"
    self._cur_feature = contig_feature

    # reuse the normal location handler to fill in the position
    self.location(content)

    # keep the contig info on the annotations and clear the current
    # feature so it never reaches the feature table
    self.data.annotations["contig"] = contig_feature
    self._cur_feature = None
938
def origin_name(self, content):
    """No-op: this consumer ignores origin_name events."""
941
def base_count(self, content):
    """No-op: this consumer ignores base_count events."""
944
def base_number(self, content):
    """No-op: this consumer ignores base_number events."""
947
def sequence(self, content):
    """Accumulate one chunk of sequence data.

    The chunk has its spaces removed and is uppercased, then appended
    to a list of strings; the list is joined once at record_end, which
    is far faster than repeated string concatenation.
    """
    self._seq_data.append(content.replace(' ', '').upper())
959
def record_end(self, content):
    """Finish off the record.

    Appends the version to the id when possible, flushes the last
    feature, chooses an alphabet from the declared sequence type and
    builds the final Seq object on self.data.
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq

    # Try to build the full versioned id when the id has no version yet
    # and a sequence_version annotation is available.
    if self.data.id.count('.') == 0:
        try:
            self.data.id += '.%i' % self.data.annotations['sequence_version']
        except KeyError:
            pass

    # the final feature of the table has not been flushed yet
    self._add_feature()

    # Choose an alphabet from the declared sequence type; default to the
    # generic alphabet when the type is absent or unhelpful.
    seq_alphabet = Alphabet.generic_alphabet
    seq_type = self._seq_type
    if seq_type:
        if 'DNA' in seq_type or 'mRNA' in seq_type:
            # mRNA is really also DNA, since it is actually cDNA
            seq_alphabet = IUPAC.ambiguous_dna
        elif 'RNA' in seq_type:
            # are there every really RNA sequences in GenBank?
            seq_alphabet = IUPAC.ambiguous_rna
        elif 'PROTEIN' in seq_type:
            seq_alphabet = IUPAC.protein  # or extended protein?
        elif seq_type in ["circular", "linear"]:
            # ugly GenBank records that say circular/linear but give
            # no indication of the sequence type
            pass
        else:
            # we have a bug if we get here
            raise ValueError("Could not determine alphabet for seq_type %s"
                             % seq_type)

    # assemble the accumulated chunks into the final sequence
    self.data.seq = Seq("".join(self._seq_data), seq_alphabet)
1005
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information.

    The scanner invokes one handler per GenBank line type.  Most
    handlers simply copy their content onto the Record being built in
    self.data; the _cur_* attributes hold whichever reference, feature
    or qualifier is currently under construction.
    """
    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        # NOTE(review): this import binds Record only locally inside
        # __init__; reference_num, feature_key and feature_qualifier_name
        # below use Record.* at call time and presumably rely on a
        # module-level Record binding not visible in this chunk -- confirm.
        import Record
        self.data = Record.Record()

        # chunks of sequence text, joined together in record_end
        self._seq_data = []
        # items currently being built up from scanner events
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None

    # --- simple one-line setters for the header part of the record ---

    def locus(self, content):
        self.data.locus = content

    def size(self, content):
        self.data.size = content

    def residue_type(self, content):
        self.data.residue_type = content

    def data_file_division(self, content):
        self.data.data_file_division = content

    def date(self, content):
        self.data.date = content

    def definition(self, content):
        self.data.definition = content

    def accession(self, content):
        # one ACCESSION line may carry several accession numbers
        new_accessions = self._split_accessions(content)
        self.data.accession.extend(new_accessions)

    def nid(self, content):
        self.data.nid = content

    def pid(self, content):
        self.data.pid = content

    def version(self, content):
        self.data.version = content

    def db_source(self, content):
        self.data.db_source = content.rstrip()

    def gi(self, content):
        self.data.gi = content

    def keywords(self, content):
        self.data.keywords = self._split_keywords(content)

    def segment(self, content):
        self.data.segment = content

    def source(self, content):
        self.data.source = content

    def organism(self, content):
        self.data.organism = content

    def taxonomy(self, content):
        self.data.taxonomy = self._split_taxonomy(content)

    def reference_num(self, content):
        """Grab the reference number and signal the start of a new reference.
        """
        # check if we have a reference to add
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

        self._cur_reference = Record.Reference()
        self._cur_reference.number = content

    # --- setters for the reference currently being built ---

    def reference_bases(self, content):
        self._cur_reference.bases = content

    def authors(self, content):
        self._cur_reference.authors = content

    def consrtm(self, content):
        self._cur_reference.consrtm = content

    def title(self, content):
        self._cur_reference.title = content

    def journal(self, content):
        self._cur_reference.journal = content

    def medline_id(self, content):
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        self._cur_reference.remark = content

    def comment(self, content):
        # content arrives as a list of comment lines; join them and
        # append to any comment text already collected
        self.data.comment += "\n".join(content)

    def primary_ref_line(self,content):
        """Data for the PRIMARY line"""
        self.data.primary.append(content)

    def primary(self,content):
        # the PRIMARY header itself carries no data for the record
        pass

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # we need to add on the last reference
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature.
        """
        # first add on feature information if we've got any
        self._add_feature()

        self._cur_feature = Record.Feature()
        self._cur_feature.key = content

    def _add_feature(self):
        """Utility function to add a feature to the Record.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature is not None:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = None
            self.data.features.append(self._cur_feature)

    def location(self, content):
        self._cur_feature.location = self._clean_location(content)

    def feature_qualifier_name(self, content_list):
        """Deal with qualifier names.

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file)
        """
        for content in content_list:
            # the record parser keeps the /s -- add them if we don't have 'em
            if content.find("/") != 0:
                content = "/%s" % content
            # add on a qualifier if we've got one
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = Record.Qualifier()
            self._cur_qualifier.key = content

    def feature_qualifier_description(self, content):
        # if we have info then the qualifier key should have a ='s
        if self._cur_qualifier.key.find("=") == -1:
            self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
        cur_content = self._remove_newlines(content)
        # remove all spaces from the value if it is a type where spaces
        # are not important
        # (remove_space_keys is presumably inherited from a base class;
        # it is not defined anywhere in this chunk -- confirm)
        for remove_space_key in self.__class__.remove_space_keys:
            if self._cur_qualifier.key.find(remove_space_key) >= 0:
                cur_content = self._remove_spaces(cur_content)
        self._cur_qualifier.value = self._normalize_spaces(cur_content)

    def base_count(self, content):
        self.data.base_counts = content

    def origin_name(self, content):
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        self.data.contig = self._clean_location(content)

    def sequence(self, content):
        """Add sequence information to a list of sequence strings.

        This removes spaces in the data and uppercases the sequence, and
        then adds it to a list of sequences. Later on we'll join this
        list together to make the final sequence. This is faster than
        adding on the new string every time.
        """
        new_seq = content.replace(' ', '')
        self._seq_data.append(new_seq.upper())

    def record_end(self, content):
        """Signal the end of the record and do any necessary clean-up.
        """
        # add together all of the sequence parts to create the
        # final sequence string
        self.data.sequence = "".join(self._seq_data)
        # add on the last feature
        self._add_feature()
1215
def _strip_and_combine(line_list):
    """Join multiple lines of content into one space-separated string.

    Used by the EventGenerator callback to merge wrapped lines: each
    line is first stripped of surrounding whitespace, then all of them
    are joined with single spaces.  Simple-minded, but good enough for
    most cases.
    """
    return ' '.join(part.strip() for part in line_list)
1230
def index_file(filename, indexname, rec2key=None, use_berkeley=0):
    """Index a GenBank file to prepare it for use as a dictionary. DEPRECATED

    Arguments:
    filename - The name of the GenBank file to be indexed.
    indexname - The name of the index to create
    rec2key - A reference to a function object which, when called with a
    SeqRecord object, will return a key to be used for the record. If no
    function is specified then the records will be indexed by the 'id'
    attribute of the SeqRecord (the versioned GenBank id).
    use_berkeley - specifies whether to use the BerkeleyDB indexer, which
    uses the bsddb3 wrappers around the embedded database Berkeley DB. By
    default, the standard flat file (non-Berkeley) indexes are used.
    """
    import warnings
    # Fixed wording: the original message ran the two deprecated names
    # together with no conjunction between them.
    warnings.warn("Bio.GenBank.index_file and Bio.GenBank.Dictionary are"
                  " deprecated. We hope an in memory dictionary, for example"
                  " using the Bio.SeqIO.to_dict() function, will be suitable"
                  " for most users. Please get in touch on the mailing lists"
                  " if this (or its removal) causes any problems for you.",
                  DeprecationWarning)

    from Bio.Mindy import SimpleSeqRecord
    # choose how records are mapped to index keys
    if rec2key:
        indexer = SimpleSeqRecord.FunctionIndexer(rec2key)
    else:
        indexer = SimpleSeqRecord.SimpleIndexer()

    if use_berkeley:
        # BerkeleyDB-backed index (requires the bsddb3 wrappers)
        SimpleSeqRecord.create_berkeleydb([filename], indexname, indexer)
    else:
        # default: standard flat-file index
        SimpleSeqRecord.create_flatdb([filename], indexname, indexer)
1264
class NCBIDictionary:
    """Access GenBank using a read-only dictionary interface.

    All mutating methods (clear, update, __setitem__, ...) raise
    NotImplementedError, as do the enumeration methods that would
    require downloading all of GenBank.
    """
    VALID_DATABASES = ['nucleotide', 'protein', 'genome']
    VALID_FORMATS = ['genbank', 'fasta']

    def __init__(self, database, format, parser=None):
        """Initialize an NCBI dictionary to retrieve sequences.

        Create a new Dictionary to access GenBank. Valid values for
        database are 'nucleotide', 'protein' and 'genome'.
        Valid values for format are 'genbank' (for nucleotide genbank and
        protein genpept) and 'fasta'.
        parser is an optional parser object to change the results into
        another form. If unspecified, then the raw contents of the file
        will be returned.

        Raises ValueError for an unknown database or format.
        """
        from Bio import db
        self.parser = parser
        if database not in self.__class__.VALID_DATABASES:
            raise ValueError("Invalid database %s, should be one of %s" %
                             (database, self.__class__.VALID_DATABASES))
        if format not in self.__class__.VALID_FORMATS:
            raise ValueError("Invalid format %s, should be one of %s" %
                             (format, self.__class__.VALID_FORMATS))

        # pick the EUtils-backed retrieval database for format/database
        if format == 'fasta':
            self.db = db["fasta-sequence-eutils"]
        elif format == 'genbank':
            if database == 'nucleotide':
                self.db = db["nucleotide-genbank-eutils"]
            elif database == 'protein':
                self.db = db["protein-genbank-eutils"]
            elif database == 'genome':
                self.db = db["genome-genbank-eutils"]

    # The raise statements below use the call form raise E(msg), which is
    # valid on both Python 2 and Python 3, instead of the old Python-2-only
    # two-argument statement form raise E, msg.
    def __len__(self):
        raise NotImplementedError("GenBank contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        # Fetching the entry is the only way to know whether it exists.
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """Return the entry for id, or failobj when it is missing."""
        try:
            return self[id]
        except KeyError:
            return failobj
        # The original ended with an unreachable string-exception raise
        # ("How did I get here?"); string exceptions are invalid in
        # modern Python and the statement could never execute, so it
        # has been removed.

    def __getitem__(self, id):
        """Return the GenBank entry specified by the GenBank ID.

        Raises a KeyError if there's an error.
        """
        handle = self.db[id]
        # Parse the record if a parser was passed in.
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
1344
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id=0, max_ids=50000000):
    """Search GenBank and return the matching GenBank identifiers (gi's).

    search is the query string used against the database.  Valid values
    for database are 'nucleotide', 'protein', 'popset' and 'genome'.
    reldate limits hits to that many days before today, while
    mindate/maxdate limit hits to a date range (e.g. 2002/01/01).
    start_id is the number to begin retrieval on and max_ids caps how
    many id's are retrieved.

    batchsize, delay and callback_fn are old parameters for
    compatibility -- do not set them.
    """
    # translate the date arguments into an EUtils date restriction
    if reldate:
        date_restrict = EUtils.WithinNDays(reldate)
    elif mindate:
        date_restrict = EUtils.DateRange(mindate, maxdate)
    else:
        date_restrict = None

    eutils_client = DBIdsClient.DBIdsClient()
    matches = eutils_client.search(search, database,
                                   daterange=date_restrict,
                                   retstart=start_id, retmax=max_ids)
    # each match carries its gi as the first id of its DBIds record
    return [match.dbids.ids[0] for match in matches]
1377
def download_many(ids, database='nucleotide'):
    """Download many GenBank records, returning a handle of the results.

    ids is a list of gis or accessions; database is 'nucleotide' or
    'protein' (anything else raises ValueError).

    callback_fn, broken_fn, delay, faildelay, batchsize, parser are old
    parameter for compatibility. They should not be used.
    """
    db_ids = DBIds(database, ids)
    # map the database onto the matching efetch return type
    if database in ['nucleotide']:
        format = 'gb'
    elif database in ['protein']:
        format = 'gp'
    else:
        raise ValueError("Unexpected database: %s" % database)

    eutils_client = DBIdsClient.from_dbids(db_ids)
    result_handle = eutils_client.efetch(retmode="text", rettype=format)
    # hand back an in-memory copy of the downloaded text
    return cStringIO.StringIO(result_handle.read())
1398