"""Code to work with GenBank formatted files.

Classes:
Iterator              Iterate through a file of GenBank entries.
Dictionary            Access a GenBank file using a dictionary interface.
ErrorFeatureParser    Catch errors caused during parsing.
FeatureParser         Parse GenBank data into Seq and SeqFeature objects.
RecordParser          Parse GenBank data into a Record object.
NCBIDictionary        Access GenBank using a dictionary interface.

_BaseGenBankConsumer  A base class for GenBank consumers that implements
                      some helpful functions that are in common between
                      consumers.
_FeatureConsumer      Create SeqFeature objects from info generated by
                      the Scanner.
_RecordConsumer       Create a GenBank Record object from Scanner info.
_PrintingConsumer     A debugging consumer.

ParserFailureError    Exception indicating a failure in the parser (i.e.
                      scanner or consumer).
LocationParserError   Exception indicating a problem with the spark based
                      location parser.

Functions:
index_file            Get a GenBank file ready to be used as a Dictionary.
search_for            Do a query against GenBank.
download_many         Download many GenBank records.

"""
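
# A minimal usage sketch of the parsers described above.  The filename
# "example.gbk" is illustrative only; any GenBank flat file will do.
#
#   >>> from Bio import GenBank
#   >>> parser = GenBank.FeatureParser()
#   >>> iterator = GenBank.Iterator(open("example.gbk"), parser)
#   >>> record = iterator.next()          # a SeqRecord, or None at EOF
#   >>> print record.id, len(record.features)
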
import cStringIO

# other Biopython parser and utility code
from Bio.ParserSupport import AbstractConsumer
from utils import FeatureValueCleaner

# other Bio.GenBank stuff
import Record                  # used by the _RecordConsumer below
from Scanner import GenBankScanner

# NCBI access via EUtils
from Bio import EUtils
from Bio.EUtils import DBIds, DBIdsClient

# constants for the column layout of GenBank header lines
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

# constants for the column layout of the feature table
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
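
# The feature table constants above mirror GenBank's fixed-column layout:
# feature keys are indented by FEATURE_KEY_INDENT (5) spaces and qualifier
# lines by FEATURE_QUALIFIER_INDENT (21) spaces, roughly like this
# (spacing shown for illustration only):
#
#      CDS             join(102..200,301..400)
#                      /gene="example"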

class Dictionary:
    """Access a GenBank file using a dictionary-like interface.
    """
    def __init__(self, indexname, parser = None):
        """Initialize and open up a GenBank dictionary. DEPRECATED

        Each entry is a full GenBank record (i.e. from the LOCUS line
        to the // at the end of the sequence).

        Most GenBank files have only one such "entry", in which case
        using this dictionary class is rather unnecessary.

        Arguments:
        o indexname - The name of the index for the file. This should have
        been created using the index_file function.
        o parser - An optional argument specifying a parser object that
        the records should be run through before returning the output. If
        parser is None then the unprocessed contents of the file will be
        returned.
        """
        import warnings
        warnings.warn("Bio.GenBank.index_file and Bio.GenBank.Dictionary are"
                      " deprecated. We hope an in memory dictionary, for"
                      " example using the Bio.SeqIO.to_dict() function, will"
                      " be suitable for most users. Please get in touch on"
                      " the mailing lists if this (or its removal) causes any"
                      " problems for you.",
                      DeprecationWarning)

        from Bio import Mindy
        self._index = Mindy.open(indexname)
        self._parser = parser

    def __len__(self):
        return len(self.keys())

    def __getitem__(self, key):
        # first try to retrieve by the primary id, then fall back to
        # any aliases (e.g. unversioned accessions)
        try:
            seqs = self._index.lookup(id = key)
        except KeyError:
            seqs = self._index.lookup(aliases = key)

        if len(seqs) == 1:
            seq = seqs[0]
        else:
            raise KeyError("Multiple sequences found for %s" % key)

        if self._parser:
            handle = cStringIO.StringIO(seq.text)
            return self._parser.parse(handle)
        else:
            return seq.text

    def keys(self):
        primary_key_retriever = self._index['id']
        return primary_key_retriever.keys()

class Iterator:
    """Iterator interface to move over a file of GenBank entries one at a time.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def next(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is None:
            # with no parser, return the raw record text up to the // line
            lines = []
            while True:
                line = self.handle.readline()
                if not line:
                    return None
                lines.append(line)
                if line.rstrip() == "//":
                    break
            return "".join(lines)
        try:
            return self._parser.parse(self.handle)
        except StopIteration:
            return None

    def __iter__(self):
        return iter(self.next, None)
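
# Sketch of the two ways the Iterator above is typically used; "handle" is
# assumed to be an open handle to a (possibly multi-record) GenBank file.
#
#   >>> from Bio import GenBank
#   >>> raw_entries = GenBank.Iterator(handle)           # yields raw strings
#   >>> records = GenBank.Iterator(handle, GenBank.RecordParser())
#   >>> for record in records:
#   ...     print record.locus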

class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser.
    """
    pass

class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file.
    """
    pass

class FeatureParser:
    """Parse GenBank files into Seq + Feature objects.
    """
    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function
        clean_value. GenBank.utils has a "standard" cleaner class, which
        is used by default.
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle.
        """
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data

class RecordParser:
    """Parse GenBank files into Record objects.
    """
    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record.
        """
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
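
# Sketch contrasting the two parsers above: FeatureParser builds SeqRecord
# objects (with SeqFeatures), while RecordParser builds Bio.GenBank.Record
# objects.  The handles are assumed to be open GenBank files.
#
#   >>> from Bio import GenBank
#   >>> record = GenBank.RecordParser().parse(handle)
#   >>> print record.locus, record.size
#   >>> seq_record = GenBank.FeatureParser().parse(another_handle)
#   >>> print seq_record.annotations['source']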

class _BaseGenBankConsumer(AbstractConsumer):
    """Abstract GenBank consumer providing useful general functions.

    This just helps to eliminate some duplication in things that most
    GenBank consumers want to do.
    """
    # qualifiers whose values should have all spaces removed
    remove_space_keys = ["translation"]

    def __init__(self):
        pass

    def _split_keywords(self, keyword_string):
        """Split a string of keywords into a nice clean list.
        """
        # process the keywords into a python list
        if keyword_string == "" or keyword_string == ".":
            keywords = ""
        elif keyword_string[-1] == '.':
            keywords = keyword_string[:-1]
        else:
            keywords = keyword_string
        keyword_list = keywords.split(';')
        clean_keyword_list = [x.strip() for x in keyword_list]
        return clean_keyword_list

    def _split_accessions(self, accession_string):
        """Split a string of accession numbers into a list.
        """
        # first replace all line feeds with spaces
        # (semi-colons are also used as separators in some records)
        accession = accession_string.replace("\n", " ").replace(";", " ")

        return [x.strip() for x in accession.split(' ')]

    def _split_taxonomy(self, taxonomy_string):
        """Split a string with taxonomy info into a list.
        """
        if not taxonomy_string or taxonomy_string == ".":
            # missing data, no taxonomy
            return []

        if taxonomy_string[-1] == '.':
            tax_info = taxonomy_string[:-1]
        else:
            tax_info = taxonomy_string
        tax_list = tax_info.split(';')
        new_tax_list = []
        for tax_item in tax_list:
            new_items = tax_item.split("\n")
            new_tax_list.extend(new_items)
        while '' in new_tax_list:
            new_tax_list.remove('')
        clean_tax_list = [x.strip() for x in new_tax_list]

        return clean_tax_list

    def _clean_location(self, location_string):
        """Clean whitespace out of a location string.

        The location parser isn't a fan of whitespace, so we clean it out
        before feeding it into the parser.
        """
        import string
        location_line = location_string
        for ws in string.whitespace:
            location_line = location_line.replace(ws, '')

        return location_line

    def _remove_newlines(self, text):
        """Remove any newlines in the passed text, returning the new string.
        """
        # get rid of both unix and windows style newlines
        newlines = ["\n", "\r"]
        for ws in newlines:
            text = text.replace(ws, "")

        return text

    def _normalize_spaces(self, text):
        """Replace multiple spaces in the passed text with single spaces.
        """
        # get rid of the empty strings produced by runs of spaces
        text_parts = text.split(" ")
        text_parts = filter(None, text_parts)
        return ' '.join(text_parts)

    def _remove_spaces(self, text):
        """Remove all spaces from the passed text.
        """
        return text.replace(" ", "")

    def _convert_to_python_numbers(self, start, end):
        """Convert a start and end range to python notation.

        In GenBank, starts and ends are defined in "biological" coordinates,
        where 1 is the first base and [i, j] means to include both i and j.

        In python, 0 is the first base and [i, j] means to include i, but
        not j.

        So, to convert "biological" to python coordinates, we need to
        subtract 1 from the start, and leave the end and things should
        be converted happily.
        """
        new_start = start - 1
        new_end = end

        return new_start, new_end
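
    # Worked example of the conversion above (illustrative values): the
    # GenBank range 1..10 covers the first ten bases and maps to the
    # python slice coordinates (0, 10):
    #
    #   >>> consumer._convert_to_python_numbers(1, 10)
    #   (0, 10)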

class _FeatureConsumer(_BaseGenBankConsumer):
    """Create a SeqRecord object with Features to return.

    Attributes:
    o use_fuzziness - specify whether or not to parse with fuzziness in
    feature locations.
    o feature_cleaner - a class that will be used to provide specialized
    cleaning-up of feature values.
    """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        self._seq_type = ''
        self._seq_data = []
        self._current_ref = None
        self._cur_feature = None
        self._cur_qualifier_key = None
        self._cur_qualifier_value = None

    def locus(self, locus_name):
        """Set the locus name as the name of the sequence.
        """
        self.data.name = locus_name

    def size(self, content):
        pass

    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet.
        """
        self._seq_type = type

    def date(self, submit_date):
        self.data.annotations['date'] = submit_date

    def accession(self, acc_num):
        """Set the accession number as the id of the sequence.

        If we have multiple accession numbers, the first one passed is
        used.
        """
        new_acc_nums = self._split_accessions(acc_num)

        # record all of the accession numbers in the annotations
        try:
            # on the off chance there was more than one accession line
            self.data.annotations['accessions'].extend(new_acc_nums)
        except KeyError:
            self.data.annotations['accessions'] = new_acc_nums

        # if we haven't set the id information yet, add the first accession
        if self.data.id is None:
            if len(new_acc_nums) > 0:
                # use the FIRST accession recorded as the id, not just
                # the first one on this particular line
                self.data.id = self.data.annotations['accessions'][0]

    def nid(self, content):
        self.data.annotations['nid'] = content

    def pid(self, content):
        self.data.annotations['pid'] = content

    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        # the suffix should just be the integer part of a versioned
        # accession (e.g. the 1 from AB000001.1)
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)

    def gi(self, content):
        self.data.annotations['gi'] = content

    def keywords(self, content):
        self.data.annotations['keywords'] = self._split_keywords(content)

    def source(self, content):
        if content[-1] == '.':
            source_info = content[:-1]
        else:
            source_info = content
        self.data.annotations['source'] = source_info

    def organism(self, content):
        self.data.annotations['organism'] = content

    def taxonomy(self, content):
        self.data.annotations['taxonomy'] = self._split_taxonomy(content)

    def reference_num(self, content):
        """Signal the beginning of a new reference object.
        """
        from Bio import SeqFeature
        # if we have a current reference that hasn't been added to
        # the list of references, add it
        if self._current_ref is not None:
            self.data.annotations['references'].append(self._current_ref)
        else:
            self.data.annotations['references'] = []

        self._current_ref = SeqFeature.Reference()
        self._current_ref.number = content

    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # first remove the parentheses
        ref_base_info = content[1:-1]

        all_locations = []
        # parse out the information for a 'bases x to y' style reference
        if ref_base_info.find('bases') != -1 and \
           ref_base_info.find('to') != -1:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif (ref_base_info.find("residues") >= 0 and
              ref_base_info.find("to") >= 0):
            residues_start = ref_base_info.find("residues")
            # get only the information after 'residues'
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._current_ref.location = all_locations

    def _split_reference_locations(self, location_string):
        """Get reference locations out of a string of reference information.

        The passed string should be of the form:

        1 to 20; 20 to 100

        This splits the information out and returns a list of location objects
        based on the reference locations.
        """
        from Bio import SeqFeature
        # split possibly multiple locations using the ';'
        all_base_info = location_string.split(';')

        new_locations = []
        for base_info in all_base_info:
            start, end = base_info.split('to')
            new_start, new_end = \
                self._convert_to_python_numbers(int(start.strip()),
                                                int(end.strip()))
            this_location = SeqFeature.FeatureLocation(new_start, new_end)
            new_locations.append(this_location)
        return new_locations
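
    # Illustrative example of the split above (values made up): the string
    # "1 to 20; 20 to 100" yields two FeatureLocation objects covering the
    # python ranges [0:20] and [19:100] after coordinate conversion.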

    def authors(self, content):
        self._current_ref.authors = content

    def consrtm(self, content):
        self._current_ref.consrtm = content

    def title(self, content):
        self._current_ref.title = content

    def journal(self, content):
        self._current_ref.journal = content

    def remark(self, content):
        self._current_ref.comment = content

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Indicate we've got to the start of the feature table.
        """
        # make sure we've added on our last reference object
        if self._current_ref is not None:
            self.data.annotations['references'].append(self._current_ref)
            self._current_ref = None

    def _add_feature(self):
        """Utility function to add a feature to the SeqRecord.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature:
            # if we have a left over qualifier, add it to the feature
            self._add_qualifier()

            self._cur_qualifier_key = ''
            self._cur_qualifier_value = ''
            self.data.features.append(self._cur_feature)

    def feature_key(self, content):
        # if we already have a feature, add it on before starting a new one
        self._add_feature()

        # start a new feature with the passed key
        from Bio import SeqFeature
        self._cur_feature = SeqFeature.SeqFeature()
        self._cur_feature.type = content

        # assume the positive strand to start with if we have DNA or cDNA
        # (labelled as mRNA); a complement in the location will switch
        # this to the reverse strand later
        if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0:
            self._cur_feature.strand = 1

    def location(self, content):
        """Parse out location information from the location string.

        This uses Andrew's nice spark based parser to do the parsing
        for us, and translates the results of the parse into appropriate
        Location objects.
        """
        from Bio.GenBank import LocationParser
        # --- first preprocess the location for the spark parser

        # the parser chokes on newlines and other whitespace inside the
        # location, so clean it up first
        location_line = self._clean_location(content)

        # older records have junk like replace(266,"c") in the location
        # line; just grab out the position number before the comma and
        # feed that to the parser
        if location_line.find('replace') != -1:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        # feed everything into the scanner and parser
        try:
            parse_info = \
                LocationParser.parse(LocationParser.scan(location_line))
        # spark raises SystemExit errors when it can't parse
        except SystemExit:
            raise LocationParserError(location_line)

        # add the parsed information to the current feature
        self._set_location_info(parse_info, self._cur_feature)

    def _set_function(self, function, cur_feature):
        """Set the location information based on a function.

        This handles all of the location functions like 'join', 'complement'
        and 'order'.

        Arguments:
        o function - A LocationParser.Function object specifying the
        function we are acting on.
        o cur_feature - The feature to add information to.
        """
        from Bio import SeqFeature
        from Bio.GenBank import LocationParser
        assert isinstance(function, LocationParser.Function), \
            "Expected a Function object, got %s" % function

        if function.name == "complement":
            # mark the feature as being on the opposite strand
            cur_feature.strand = -1
            # recurse into the rest of the location information
            for inner_info in function.args:
                self._set_location_info(inner_info, cur_feature)
        # deal with functions that order a set of locations
        elif (function.name == "join" or function.name == "order" or
              function.name == "one-of" or function.name == "bond"):
            self._set_ordering_info(function, cur_feature)
        elif (function.name == "gap"):
            assert len(function.args) == 1, \
                "Unexpected number of arguments in gap %s" % function.args
            # make the location a gap object
            position = self._get_position(function.args[0].local_location)
            cur_feature.location = SeqFeature.PositionGap(position)
        else:
            raise ValueError("Unexpected function name: %s" % function.name)

    def _set_ordering_info(self, function, cur_feature):
        """Parse a join or order and all of the information in it.

        This deals with functions that order a bunch of locations,
        specifically 'join' and 'order'. The inner locations are
        added as subfeatures of the top level feature.
        """
        from Bio import SeqFeature

        # add the inner locations as subfeatures of the top level feature
        for inner_element in function.args:
            new_sub_feature = SeqFeature.SeqFeature()
            # inherit the type from the parent
            new_sub_feature.type = cur_feature.type
            # keep a record of the ordering function that was used
            cur_feature.location_operator = function.name
            new_sub_feature.location_operator = function.name
            # inherit references and strand from the parent
            new_sub_feature.ref = cur_feature.ref
            new_sub_feature.ref_db = cur_feature.ref_db
            new_sub_feature.strand = cur_feature.strand

            # set the information for the inner element
            self._set_location_info(inner_element, new_sub_feature)

            # now add the feature to the sub_features
            cur_feature.sub_features.append(new_sub_feature)

        # set the location of the top-level feature to span all of the
        # subfeatures, from the start of the first to the end of the last
        feature_start = cur_feature.sub_features[0].location.start
        feature_end = cur_feature.sub_features[-1].location.end
        cur_feature.location = SeqFeature.FeatureLocation(feature_start,
                                                          feature_end)

    def _set_location_info(self, parse_info, cur_feature):
        """Set the location information for a feature from the parse info.

        Arguments:
        o parse_info - The classes generated by the LocationParser.
        o cur_feature - The feature to add the information to.
        """
        from Bio.GenBank import LocationParser
        # base case -- we are out of information
        if parse_info is None:
            return
        # parse a location -- this is another base case
        elif isinstance(parse_info, LocationParser.AbsoluteLocation):
            self._set_location(parse_info, cur_feature)
            return
        # parse any of the functions (join, complement, etc)
        elif isinstance(parse_info, LocationParser.Function):
            self._set_function(parse_info, cur_feature)
        # otherwise we are stuck and should raise an error
        else:
            raise ValueError("Could not parse location info: %s"
                             % parse_info)

    def _set_location(self, location, cur_feature):
        """Set the location information for a feature.

        Arguments:
        o location - An AbsoluteLocation object specifying the info
        about the location.
        o cur_feature - The feature to add the information to.
        """
        # check to see if we have a cross reference to another accession
        # i.e. J12345.1:1..15
        if location.path is not None:
            cur_feature.ref = location.path.accession
            cur_feature.ref_db = location.path.database

        cur_feature.location = self._get_location(location.local_location)

    def _get_location(self, range_info):
        """Return a (possibly fuzzy) location from a Range object.

        Arguments:
        o range_info - A location range (ie. something like 67..100). This
        may also be a single position (ie 27).

        This returns a FeatureLocation object.
        If parser.use_fuzziness is set at one, the positions for the
        end points will possibly be fuzzy.
        """
        from Bio import SeqFeature
        from Bio.GenBank import LocationParser
        # check if we just have a single base
        if not(isinstance(range_info, LocationParser.Range)):
            pos = self._get_position(range_info)
            # move the single position back one to be consistent with
            # python counting
            pos.position = pos.position - 1
            return SeqFeature.FeatureLocation(pos, pos)
        # otherwise we need to get both sides of the range
        else:
            # get *Position objects for the start and end of the range
            start_pos = self._get_position(range_info.low)
            end_pos = self._get_position(range_info.high)

            start_pos.position, end_pos.position = \
                self._convert_to_python_numbers(start_pos.position,
                                                end_pos.position)

            return SeqFeature.FeatureLocation(start_pos, end_pos)

    def _get_position(self, position):
        """Return a (possibly fuzzy) position for a single coordinate.

        Arguments:
        o position - This is a LocationParser.* object that specifies
        a single coordinate. We will examine the object to determine
        the fuzziness of the position.

        This is used with _get_location to parse out a location of any
        end_point of arbitrary fuzziness.
        """
        from Bio import SeqFeature
        from Bio.GenBank import LocationParser
        # case 1 -- just a plain number
        if (isinstance(position, LocationParser.Integer)):
            final_pos = SeqFeature.ExactPosition(position.val)
        # case 2 -- a position with a lower bound (e.g. >10)
        elif isinstance(position, LocationParser.LowBound):
            final_pos = SeqFeature.AfterPosition(position.base.val)
        # case 3 -- a position with an upper bound (e.g. <10)
        elif isinstance(position, LocationParser.HighBound):
            final_pos = SeqFeature.BeforePosition(position.base.val)
        # case 4 -- a position between two bases (e.g. 10^11)
        elif isinstance(position, LocationParser.Between):
            final_pos = SeqFeature.BetweenPosition(position.low.val,
                                                   position.high.val)
        # case 5 -- a position within a range (e.g. (10.20))
        elif isinstance(position, LocationParser.TwoBound):
            final_pos = SeqFeature.WithinPosition(position.low.val,
                                                  position.high.val)
        # case 6 -- a one-of(10,12) set of possible positions
        elif isinstance(position, LocationParser.Function) and \
                position.name == "one-of":
            # take the possible positions and convert them into a list
            # of Position objects
            position_choices = []
            for arg in position.args:
                # we only handle AbsoluteLocations with no path right now;
                # not sure if other cases will pop up
                assert isinstance(arg, LocationParser.AbsoluteLocation), \
                    "Unhandled Location type %r" % arg
                assert arg.path is None, "Unhandled path in location"
                position = self._get_position(arg.local_location)
                position_choices.append(position)
            final_pos = SeqFeature.OneOfPosition(position_choices)
        # if it is none of the above, we've got a problem
        else:
            raise ValueError("Unexpected LocationParser object %r" %
                             position)

        # if we are using fuzziness, return what we've got
        if self._use_fuzziness:
            return final_pos
        # otherwise return an equivalent non-fuzzy position
        else:
            return SeqFeature.ExactPosition(final_pos.position)
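
    # Illustrative mapping of GenBank position syntax to the SeqFeature
    # position classes chosen by _get_position above:
    #
    #   102              -> ExactPosition
    #   >102             -> AfterPosition
    #   <102             -> BeforePosition
    #   102^103          -> BetweenPosition
    #   (102.110)        -> WithinPosition
    #   one-of(102,105)  -> OneOfPosition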

    def _add_qualifier(self):
        """Add a qualifier to the current feature without loss of info.

        If there are multiple qualifiers with the same name, their values
        are appended to a list under that key so that nothing is lost.
        """
        # if we've got a key from before, add it to the dictionary of
        # qualifiers
        if self._cur_qualifier_key:
            key = self._cur_qualifier_key
            value = "".join(self._cur_qualifier_value)
            if self._feature_cleaner is not None:
                value = self._feature_cleaner.clean_value(key, value)
            # if the qualifier name exists, append the value
            if key in self._cur_feature.qualifiers:
                self._cur_feature.qualifiers[key].append(value)
            # otherwise start a new list of values for this key
            else:
                self._cur_feature.qualifiers[key] = [value]

    def feature_qualifier_name(self, content_list):
        """When we get a qualifier key, use it as a dictionary key.

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file).
        """
        for content in content_list:
            # add on a qualifier if we've got one from before
            self._add_qualifier()

            # remove the / and = from the qualifier if they're present
            qual_key = content.replace('/', '')
            qual_key = qual_key.replace('=', '')
            qual_key = qual_key.strip()

            self._cur_qualifier_key = qual_key
            self._cur_qualifier_value = []

    def feature_qualifier_description(self, content):
        # get rid of the quotes surrounding the qualifier if we've got them
        qual_value = content.replace('"', '')

        self._cur_qualifier_value.append(qual_value)

    def sequence(self, content):
        """Add up sequence information as we get it.

        To try and make things speedier, this puts all of the strings
        into a list of strings, and then uses string.join later to put
        them together. Supposedly, this is a big time savings.
        """
        new_seq = content.replace(' ', '')
        new_seq = new_seq.upper()

        self._seq_data.append(new_seq)

    def record_end(self, content):
        """Clean up when we've finished the record.
        """
        from Bio import Alphabet
        from Bio.Alphabet import IUPAC
        from Bio.Seq import Seq

        # if the id does not already include a version, try to append the
        # sequence version so we end up with a versioned id
        if self.data.id.count('.') == 0:
            try:
                self.data.id += '.%i' % self.data.annotations['sequence_version']
            except KeyError:
                pass

        # add the last feature in the table which hasn't been added yet
        self._add_feature()

        # add the sequence information

        # first, determine the alphabet; we default to a generic alphabet
        # if we don't have a seq type or have strange sequence information
        seq_alphabet = Alphabet.generic_alphabet

        if self._seq_type:
            # mRNA is really also DNA, since it is actually cDNA
            if self._seq_type.find('DNA') != -1 or \
               self._seq_type.find('mRNA') != -1:
                seq_alphabet = IUPAC.ambiguous_dna
            # are there every really RNA sequences in GenBank?
            elif self._seq_type.find('RNA') != -1:
                seq_alphabet = IUPAC.ambiguous_rna
            elif self._seq_type.find('PROTEIN') != -1:
                seq_alphabet = IUPAC.protein
            # work around ugly GenBank records which have a molecule type
            # of just 'circular' or 'linear'
            elif self._seq_type in ["circular", "linear"]:
                pass
            # we have a bug if we get here
            else:
                raise ValueError("Could not determine alphabet for seq_type %s"
                                 % self._seq_type)

        # now set the sequence
        sequence = "".join(self._seq_data)
        self.data.seq = Seq(sequence, seq_alphabet)

class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information.
    """
    def __init__(self):
        _BaseGenBankConsumer.__init__(self)
        self.data = Record.Record()

        self._seq_data = []
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None

    def locus(self, content):
        self.data.locus = content

    def size(self, content):
        self.data.size = content

    def date(self, content):
        self.data.date = content

    def nid(self, content):
        self.data.nid = content

    def pid(self, content):
        self.data.pid = content

    def gi(self, content):
        self.data.gi = content

    def reference_num(self, content):
        """Grab the reference number and signal the start of a new reference.
        """
        # check if we have a reference to add to the list
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

        self._cur_reference = Record.Reference()
        self._cur_reference.number = content

    def reference_bases(self, content):
        self._cur_reference.bases = content

    def authors(self, content):
        self._cur_reference.authors = content

    def consrtm(self, content):
        self._cur_reference.consrtm = content

    def title(self, content):
        self._cur_reference.title = content

    def journal(self, content):
        self._cur_reference.journal = content

    def remark(self, content):
        self._cur_reference.remark = content

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # make sure we've added on our last reference object
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature.
        """
        # first add on feature information if we've got any
        self._add_feature()

        self._cur_feature = Record.Feature()
        self._cur_feature.key = content

    def _add_feature(self):
        """Utility function to add a feature to the Record.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature is not None:
            # if we have a left over qualifier, add it to the feature
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = None
            self.data.features.append(self._cur_feature)

    def feature_qualifier_name(self, content_list):
        """Deal with qualifier names.

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file).
        """
        for content in content_list:
            # the record parser keeps the leading /, so add it if missing
            if content.find("/") != 0:
                content = "/%s" % content
            # add on a qualifier if we've got one from before
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = Record.Qualifier()
            self._cur_qualifier.key = content

    def base_count(self, content):
        self.data.base_counts = content

    def origin_name(self, content):
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        self.data.contig = self._clean_location(content)

    def sequence(self, content):
        """Add sequence information to a list of sequence strings.

        This removes spaces in the data and uppercases the sequence, and
        then adds it to a list of sequences. Later on we'll join this
        list together to make the final sequence. This is faster than
        adding on the new string every time.
        """
        new_seq = content.replace(' ', '')
        self._seq_data.append(new_seq.upper())

    def record_end(self, content):
        """Signal the end of the record and do any necessary clean-up.
        """
        # add together all of the sequence parts to make the final sequence
        self.data.sequence = "".join(self._seq_data)
        # add on the last feature
        self._add_feature()

def _strip_and_combine(line_list):
    """Combine multiple lines of content separated by spaces.

    This function is used by the EventGenerator callback function to
    combine multiple lines of information. The lines are first
    stripped to remove whitespace, and then combined so they are separated
    by a space. This is a simple minded way to combine lines, but should
    work for most cases.
    """
    # first strip out extra whitespace
    stripped_line_list = [x.strip() for x in line_list]

    # now combine everything with spaces
    return ' '.join(stripped_line_list)

def index_file(filename, indexname, rec2key = None, use_berkeley = 0):
    """Index a GenBank file to prepare it for use as a dictionary. DEPRECATED

    Arguments:
    filename - The name of the GenBank file to be indexed.
    indexname - The name of the index to create.
    rec2key - A reference to a function object which, when called with a
    SeqRecord object, will return a key to be used for the record. If no
    function is specified then the records will be indexed by the 'id'
    attribute of the SeqRecord (the versioned GenBank id).
    use_berkeley - specifies whether to use the BerkeleyDB indexer, which
    uses the bsddb3 wrappers around the embedded database Berkeley DB. By
    default, the standard flat file (non-Berkeley) indexes are used.
    """
    import warnings
    warnings.warn("Bio.GenBank.index_file and Bio.GenBank.Dictionary are"
                  " deprecated. We hope an in memory dictionary, for"
                  " example using the Bio.SeqIO.to_dict() function, will"
                  " be suitable for most users. Please get in touch on"
                  " the mailing lists if this (or its removal) causes any"
                  " problems for you.",
                  DeprecationWarning)

    from Bio.Mindy import SimpleSeqRecord
    if rec2key:
        indexer = SimpleSeqRecord.FunctionIndexer(rec2key)
    else:
        indexer = SimpleSeqRecord.SimpleIndexer()

    if use_berkeley:
        SimpleSeqRecord.create_berkeleydb([filename], indexname, indexer)
    else:
        SimpleSeqRecord.create_flatdb([filename], indexname, indexer)
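
# Deprecated usage sketch for index_file plus the Dictionary class above;
# the filenames are illustrative only.
#
#   >>> from Bio import GenBank
#   >>> GenBank.index_file("cor6_6.gb", "cor6_6.idx")
#   >>> gb_dict = GenBank.Dictionary("cor6_6.idx", GenBank.FeatureParser())
#   >>> print len(gb_dict), gb_dict.keys()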

class NCBIDictionary:
    """Access GenBank using a read-only dictionary interface.
    """
    VALID_DATABASES = ['nucleotide', 'protein', 'genome']
    VALID_FORMATS = ['genbank', 'fasta']

    def __init__(self, database, format, parser = None):
        """Initialize an NCBI dictionary to retrieve sequences.

        Create a new Dictionary to access GenBank. Valid values for
        database are 'nucleotide' and 'protein'.
        Valid values for format are 'genbank' (for nucleotide genbank and
        protein genpept) and 'fasta'.
        delay and retmax are old options kept only for compatibility -- do not
        bother to set them.
        parser is an optional parser object
        to change the results into another form. If unspecified, then
        the raw contents of the file will be returned.
        """
        from Bio import db
        self.parser = parser
        if database not in self.__class__.VALID_DATABASES:
            raise ValueError("Invalid database %s, should be one of %s" %
                             (database, self.__class__.VALID_DATABASES))
        if format not in self.__class__.VALID_FORMATS:
            raise ValueError("Invalid format %s, should be one of %s" %
                             (format, self.__class__.VALID_FORMATS))

        if format == 'fasta':
            self.db = db["fasta-sequence-eutils"]
        elif format == 'genbank':
            if database == 'nucleotide':
                self.db = db["nucleotide-genbank-eutils"]
            elif database == 'protein':
                self.db = db["protein-genbank-eutils"]
            elif database == 'genome':
                self.db = db["genome-genbank-eutils"]

    def __len__(self):
        raise NotImplementedError("GenBank contains lots of entries")
    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")
    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")
    def update(self, d):
        raise NotImplementedError("This is a read-only dictionary")
    def copy(self):
        raise NotImplementedError("You don't need to do this...")
    def keys(self):
        raise NotImplementedError("You don't really want to do this...")
    def items(self):
        raise NotImplementedError("You don't really want to do this...")
    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        try:
            return self[id]
        except KeyError:
            return failobj
        raise RuntimeError("How did I get here?")

    def __getitem__(self, id):
        """Return the GenBank entry specified by the GenBank ID.

        Raises a KeyError if there's an error.
        """
        handle = self.db[id]
        # if we have a parser, use it; otherwise return the raw contents
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
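
# Usage sketch for NCBIDictionary (requires network access to NCBI; the
# accession shown is illustrative):
#
#   >>> from Bio import GenBank
#   >>> ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank',
#   ...                                    parser=GenBank.RecordParser())
#   >>> record = ncbi_dict['U49845']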

def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id = 0, max_ids = 50000000):
    """search_for(search[, reldate][, mindate][, maxdate]
    [, batchsize][, delay][, callback_fn][, start_id][, max_ids]) -> ids

    Search GenBank and return a list of the GenBank identifiers (gi's)
    that match the criteria. search is the search string used to
    search the database. Valid values for database are
    'nucleotide', 'protein', 'popset' and 'genome'. reldate is
    the number of days prior to the current date to restrict the
    search. mindate and maxdate are the dates to restrict the search,
    e.g. 2002/01/01. start_id is the number to begin retrieval on.
    max_ids specifies the maximum number of id's to retrieve.

    batchsize, delay and callback_fn are old parameters kept only for
    compatibility -- do not set them.
    """
    # work out any date restrictions for the search
    date_restrict = None
    if reldate:
        date_restrict = EUtils.WithinNDays(reldate)
    elif mindate:
        date_restrict = EUtils.DateRange(mindate, maxdate)

    # run the search and pull the gi numbers out of the results
    eutils_client = DBIdsClient.DBIdsClient()
    db_ids = eutils_client.search(search, database, daterange = date_restrict,
                                  retstart = start_id, retmax = max_ids)
    ids = []
    for db_id in db_ids:
        ids.append(db_id.dbids.ids[0])
    return ids
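
# Usage sketch for search_for (requires network access; the query and the
# record count are illustrative):
#
#   >>> from Bio import GenBank
#   >>> gi_list = GenBank.search_for("Opuntia[orgn] and rpl16", max_ids=5)
#   >>> print len(gi_list)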

def download_many(ids, database='nucleotide'):
    """download_many(ids, database) -> handle of results

    Download many records from GenBank. ids is a list of gis or
    accessions.

    callback_fn, broken_fn, delay, faildelay, batchsize, parser are old
    parameters kept only for compatibility. They should not be used.
    """
    db_ids = DBIds(database, ids)
    # choose the appropriate return format for the database
    if database in ['nucleotide']:
        format = 'gb'
    elif database in ['protein']:
        format = 'gp'
    else:
        raise ValueError("Unexpected database: %s" % database)

    eutils_client = DBIdsClient.from_dbids(db_ids)
    result_handle = eutils_client.efetch(retmode = "text", rettype = format)
    return cStringIO.StringIO(result_handle.read())
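
# Usage sketch for download_many, reusing a gi_list such as the one shown
# for search_for above (requires network access):
#
#   >>> handle = GenBank.download_many(gi_list, database='nucleotide')
#   >>> records = list(GenBank.Iterator(handle, GenBank.RecordParser()))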