1
2
3
4
5
6
7 """Code to work with GenBank formatted files.
8
9 Classes:
10 Iterator Iterate through a file of GenBank entries
11 Dictionary Access a GenBank file using a dictionary interface.
12 ErrorFeatureParser Catch errors caused during parsing.
13 FeatureParser Parse GenBank data in Seq and SeqFeature objects.
14 RecordParser Parse GenBank data into a Record object.
15 NCBIDictionary Access GenBank using a dictionary interface.
16
17 _BaseGenBankConsumer A base class for GenBank consumer that implements
18 some helpful functions that are in common between
19 consumers.
20 _FeatureConsumer Create SeqFeature objects from info generated by
21 the Scanner
22 _RecordConsumer Create a GenBank record object from Scanner info.
23 _PrintingConsumer A debugging consumer.
24
25 ParserFailureError Exception indicating a failure in the parser (ie.
26 scanner or consumer)
LocationParserError Exception indicating a problem with the spark based
28 location parser.
29
30 Functions:
31 search_for Do a query against GenBank.
32 download_many Download many GenBank records.
33
34 """
35 import cStringIO
36
37
38 from Bio.ParserSupport import AbstractConsumer
39 from utils import FeatureValueCleaner
40 from Bio import Entrez
41
42
43
44 from Scanner import GenBankScanner
45
46
47 from Bio import EUtils
48 from Bio.EUtils import DBIds, DBIdsClient
49
50
# GENBANK_SPACER is a run of GENBANK_INDENT spaces.
# NOTE(review): presumably the width of the keyword column on GenBank header
# lines -- confirm against the code that uses these names.
GENBANK_INDENT = 12
GENBANK_SPACER = GENBANK_INDENT * " "

# Each FEATURE_*_SPACER is a run of spaces as wide as the matching
# FEATURE_*_INDENT.
# NOTE(review): presumably the columns at which feature keys and feature
# qualifiers start in the feature table -- confirm against the scanner.
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = FEATURE_KEY_INDENT * " "
FEATURE_QUALIFIER_SPACER = FEATURE_QUALIFIER_INDENT * " "
59
61 """Iterator interface to move over a file of GenBank entries one at a time.
62 """
def __init__(self, handle, parser=None):
    """Set up iteration over a handle of GenBank entries.

    Arguments:
    o handle - A handle with GenBank entries to iterate through.
    o parser - An optional parser each entry is passed through before it
    is returned. If None, the raw text of the entry is returned instead.
    """
    self.handle, self._parser = handle, parser
73
75 """Return the next GenBank record from the handle.
76
77 Will return None if we ran out of records.
78 """
79 if self._parser is None :
80 lines = []
81 while True :
82 line = self.handle.readline()
83 if not line : return None
84 lines.append(line)
85 if line.rstrip() == "//" : break
86 return "".join(lines)
87 try :
88 return self._parser.parse(self.handle)
89 except StopIteration :
90 return None
91
93 return iter(self.next, None)
94
96 """Failure caused by some kind of problem in the parser.
97 """
98 pass
99
101 """Could not Properly parse out a location from a GenBank file.
102 """
103 pass
104
106 """Parse GenBank files into Seq + Feature objects.
107 """
110 """Initialize a GenBank parser and Feature consumer.
111
112 Arguments:
113 o debug_level - An optional argument that species the amount of
114 debugging information the parser should spit out. By default we have
115 no debugging info (the fastest way to do things), but if you want
116 you can set this as high as two and see exactly where a parse fails.
117 o use_fuzziness - Specify whether or not to use fuzzy representations.
118 The default is 1 (use fuzziness).
119 o feature_cleaner - A class which will be used to clean out the
120 values of features. This class must implement the function
121 clean_value. GenBank.utils has a "standard" cleaner class, which
122 is used by default.
123 """
124 self._scanner = GenBankScanner(debug_level)
125 self.use_fuzziness = use_fuzziness
126 self._cleaner = feature_cleaner
127
def parse(self, handle):
    """Scan the given handle and return the data the consumer collected.

    A new _FeatureConsumer is created on every call, configured with this
    parser's use_fuzziness setting and feature cleaner, and is kept on
    self._consumer.
    """
    consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
    self._consumer = consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
135
137 """Parse GenBank files into Record objects
138 """
140 """Initialize the parser.
141
142 Arguments:
143 o debug_level - An optional argument that species the amount of
144 debugging information the parser should spit out. By default we have
145 no debugging info (the fastest way to do things), but if you want
146 you can set this as high as two and see exactly where a parse fails.
147 """
148 self._scanner = GenBankScanner(debug_level)
149
def parse(self, handle):
    """Scan the given handle and return the resulting GenBank record.

    A new _RecordConsumer is created on every call and is kept on
    self._consumer; its data attribute is what gets returned.
    """
    consumer = _RecordConsumer()
    self._consumer = consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
156
158 """Abstract GenBank consumer providing useful general functions.
159
160 This just helps to eliminate some duplication in things that most
161 GenBank consumers want to do.
162 """
163
164
165
166
167 remove_space_keys = ["translation"]
168
171
173 """Split a string of keywords into a nice clean list.
174 """
175
176 if keyword_string == "" or keyword_string == "." :
177 keywords = ""
178 elif keyword_string[-1] == '.':
179 keywords = keyword_string[:-1]
180 else:
181 keywords = keyword_string
182 keyword_list = keywords.split(';')
183 clean_keyword_list = [x.strip() for x in keyword_list]
184 return clean_keyword_list
185
187 """Split a string of accession numbers into a list.
188 """
189
190
191 accession = accession_string.replace("\n", " ").replace(";"," ")
192
193 return [x.strip() for x in accession.split() if x.strip()]
194
196 """Split a string with taxonomy info into a list.
197 """
198 if not taxonomy_string or taxonomy_string=="." :
199
200 return []
201
202 if taxonomy_string[-1] == '.':
203 tax_info = taxonomy_string[:-1]
204 else:
205 tax_info = taxonomy_string
206 tax_list = tax_info.split(';')
207 new_tax_list = []
208 for tax_item in tax_list:
209 new_items = tax_item.split("\n")
210 new_tax_list.extend(new_items)
211 while '' in new_tax_list:
212 new_tax_list.remove('')
213 clean_tax_list = [x.strip() for x in new_tax_list]
214
215 return clean_tax_list
216
218 """Clean whitespace out of a location string.
219
220 The location parser isn't a fan of whitespace, so we clean it out
221 before feeding it into the parser.
222 """
223 import string
224 location_line = location_string
225 for ws in string.whitespace:
226 location_line = location_line.replace(ws, '')
227
228 return location_line
229
231 """Remove any newlines in the passed text, returning the new string.
232 """
233
234 newlines = ["\n", "\r"]
235 for ws in newlines:
236 text = text.replace(ws, "")
237
238 return text
239
241 """Replace multiple spaces in the passed text with single spaces.
242 """
243
244 text_parts = text.split(" ")
245 text_parts = filter(None, text_parts)
246 return ' '.join(text_parts)
247
249 """Remove all spaces from the passed text.
250 """
251 return text.replace(" ", "")
252
254 """Convert a start and end range to python notation.
255
256 In GenBank, starts and ends are defined in "biological" coordinates,
257 where 1 is the first base and [i, j] means to include both i and j.
258
259 In python, 0 is the first base and [i, j] means to include i, but
260 not j.
261
262 So, to convert "biological" to python coordinates, we need to
263 subtract 1 from the start, and leave the end and things should
264 be converted happily.
265 """
266 new_start = start - 1
267 new_end = end
268
269 return new_start, new_end
270
272 """Create a SeqRecord object with Features to return.
273
274 Attributes:
275 o use_fuzziness - specify whether or not to parse with fuzziness in
276 feature locations.
277 o feature_cleaner - a class that will be used to provide specialized
278 cleaning-up of feature values.
279 """
def __init__(self, use_fuzziness, feature_cleaner=None):
    """Create an empty SeqRecord and reset all parsing state.

    Arguments:
    o use_fuzziness - Stored as self._use_fuzziness; specifies whether
    feature locations are parsed with fuzziness.
    o feature_cleaner - Stored as self._feature_cleaner; an optional
    object used to clean up feature values.
    """
    from Bio.SeqRecord import SeqRecord

    _BaseGenBankConsumer.__init__(self)

    # The record starts out with no sequence, no id and no description.
    record = SeqRecord(None, id=None)
    record.id = None
    record.description = ""
    self.data = record

    self._use_fuzziness = use_fuzziness
    self._feature_cleaner = feature_cleaner

    # Sequence information collected so far.
    self._seq_type = ''
    self._seq_data = []

    # Nothing is in progress yet: no reference, feature or qualifier.
    self._current_ref = None
    self._cur_feature = None
    self._cur_qualifier_key = None
    self._cur_qualifier_value = None
296
def locus(self, locus_name):
    """Use the locus name as the name of the record being built.

    Arguments:
    o locus_name - The value stored in self.data.name.
    """
    record = self.data
    record.name = locus_name
301
302 - def size(self, content):
304
306 """Record the sequence type so we can choose an appropriate alphabet.
307 """
308 self._seq_type = type
309
312
313 - def date(self, submit_date):
315
325
327 """Set the accession number as the id of the sequence.
328
329 If we have multiple accession numbers, the first one passed is
330 used.
331 """
332 new_acc_nums = self._split_accessions(acc_num)
333
334
335 try :
336
337 for acc in new_acc_nums :
338
339 if acc not in self.data.annotations['accessions'] :
340 self.data.annotations['accessions'].append(acc)
341 except KeyError :
342 self.data.annotations['accessions'] = new_acc_nums
343
344
345 if self.data.id is None:
346 if len(new_acc_nums) > 0:
347
348
349 self.data.id = self.data.annotations['accessions'][0]
350
351
352 - def nid(self, content):
354
355 - def pid(self, content):
357
370
372 """Set the version to overwrite the id.
373
374 Since the verison provides the same information as the accession
375 number, plus some extra info, we set this as the id if we have
376 a version.
377 """
378
379
380
381
382
383
384
385
386
387
388 assert version.isdigit()
389 self.data.annotations['sequence_version'] = int(version)
390
393
394 - def gi(self, content):
396
399
402
404 if content[-1] == '.':
405 source_info = content[:-1]
406 else:
407 source_info = content
408 self.data.annotations['source'] = source_info
409
412
421
434
436 """Attempt to determine the sequence region the reference entails.
437
438 Possible types of information we may have to deal with:
439
440 (bases 1 to 86436)
441 (sites)
442 (bases 1 to 105654; 110423 to 111122)
443 1 (residues 1 to 182)
444 """
445
446 ref_base_info = content[1:-1]
447
448 all_locations = []
449
450 if ref_base_info.find('bases') != -1 and \
451 ref_base_info.find('to') != -1:
452
453 ref_base_info = ref_base_info[5:]
454 locations = self._split_reference_locations(ref_base_info)
455 all_locations.extend(locations)
456 elif (ref_base_info.find("residues") >= 0 and
457 ref_base_info.find("to") >= 0):
458 residues_start = ref_base_info.find("residues")
459
460 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
461 locations = self._split_reference_locations(ref_base_info)
462 all_locations.extend(locations)
463
464
465
466 elif (ref_base_info == 'sites' or
467 ref_base_info.strip() == 'bases'):
468 pass
469
470 else:
471 raise ValueError("Could not parse base info %s in record %s" %
472 (ref_base_info, self.data.id))
473
474 self._current_ref.location = all_locations
475
477 """Get reference locations out of a string of reference information
478
479 The passed string should be of the form:
480
481 1 to 20; 20 to 100
482
483 This splits the information out and returns a list of location objects
484 based on the reference locations.
485 """
486 from Bio import SeqFeature
487
488 all_base_info = location_string.split(';')
489
490 new_locations = []
491 for base_info in all_base_info:
492 start, end = base_info.split('to')
493 new_start, new_end = \
494 self._convert_to_python_numbers(int(start.strip()),
495 int(end.strip()))
496 this_location = SeqFeature.FeatureLocation(new_start, new_end)
497 new_locations.append(this_location)
498 return new_locations
499
501 self._current_ref.authors = content
502
504 self._current_ref.consrtm = content
505
def title(self, content):
    """Store content as the title of the reference currently being built."""
    reference = self._current_ref
    reference.title = content
508
510 self._current_ref.journal = content
511
514
517
519 self._current_ref.comment = content
520
526
528 """Get ready for the feature table when we reach the FEATURE line.
529 """
530 self.start_feature_table()
531
533 """Indicate we've got to the start of the feature table.
534 """
535
536 if self._current_ref is not None:
537 self.data.annotations['references'].append(self._current_ref)
538 self._current_ref = None
539
541 """Utility function to add a feature to the SeqRecord.
542
543 This does all of the appropriate checking to make sure we haven't
544 left any info behind, and that we are only adding info if it
545 exists.
546 """
547 if self._cur_feature:
548
549
550 self._add_qualifier()
551
552 self._cur_qualifier_key = ''
553 self._cur_qualifier_value = ''
554 self.data.features.append(self._cur_feature)
555
570
572 """Parse out location information from the location string.
573
574 This uses Andrew's nice spark based parser to do the parsing
575 for us, and translates the results of the parse into appropriate
576 Location objects.
577 """
578 from Bio.GenBank import LocationParser
579
580
581
582
583
584
585 location_line = self._clean_location(content)
586
587
588
589
590
591
592 if location_line.find('replace') != -1:
593 comma_pos = location_line.find(',')
594 location_line = location_line[8:comma_pos]
595
596
597 try:
598 parse_info = \
599 LocationParser.parse(LocationParser.scan(location_line))
600
601 except SystemExit:
602 raise LocationParserError(location_line)
603
604
605
606
607 self._set_location_info(parse_info, self._cur_feature)
608
610 """Set the location information based on a function.
611
612 This handles all of the location functions like 'join', 'complement'
613 and 'order'.
614
615 Arguments:
616 o function - A LocationParser.Function object specifying the
617 function we are acting on.
618 o cur_feature - The feature to add information to.
619 """
620 from Bio import SeqFeature
621 from Bio.GenBank import LocationParser
622 assert isinstance(function, LocationParser.Function), \
623 "Expected a Function object, got %s" % function
624
625 if function.name == "complement":
626
627 cur_feature.strand = -1
628
629 for inner_info in function.args:
630 self._set_location_info(inner_info, cur_feature)
631
632
633
634
635
636
637
638 elif (function.name == "join" or function.name == "order" or
639 function.name == "one-of" or function.name == "bond"):
640 self._set_ordering_info(function, cur_feature)
641 elif (function.name == "gap"):
642 assert len(function.args) == 1, \
643 "Unexpected number of arguments in gap %s" % function.args
644
645 position = self._get_position(function.args[0].local_location)
646 cur_feature.location = SeqFeature.PositionGap(position)
647 else:
648 raise ValueError("Unexpected function name: %s" % function.name)
649
651 """Parse a join or order and all of the information in it.
652
653 This deals with functions that order a bunch of locations,
654 specifically 'join' and 'order'. The inner locations are
655 added as subfeatures of the top level feature
656 """
657 from Bio import SeqFeature
658
659
660 for inner_element in function.args:
661 new_sub_feature = SeqFeature.SeqFeature()
662
663 new_sub_feature.type = cur_feature.type
664
665 cur_feature.location_operator = function.name
666 new_sub_feature.location_operator = function.name
667
668 new_sub_feature.ref = cur_feature.ref
669 new_sub_feature.ref_db = cur_feature.ref_db
670 new_sub_feature.strand = cur_feature.strand
671
672
673 self._set_location_info(inner_element, new_sub_feature)
674
675
676 cur_feature.sub_features.append(new_sub_feature)
677
678
679
680
681
682
683
684
685 feature_start = cur_feature.sub_features[0].location.start
686 feature_end = cur_feature.sub_features[-1].location.end
687 cur_feature.location = SeqFeature.FeatureLocation(feature_start,
688 feature_end)
689
691 """Set the location information for a feature from the parse info.
692
693 Arguments:
694 o parse_info - The classes generated by the LocationParser.
695 o cur_feature - The feature to add the information to.
696 """
697 from Bio.GenBank import LocationParser
698
699 if parse_info is None:
700 return
701
702
703 elif isinstance(parse_info, LocationParser.AbsoluteLocation):
704 self._set_location(parse_info, cur_feature)
705 return
706
707 elif isinstance(parse_info, LocationParser.Function):
708 self._set_function(parse_info, cur_feature)
709
710 else:
711 raise ValueError("Could not parse location info: %s"
712 % parse_info)
713
715 """Set the location information for a feature.
716
717 Arguments:
718 o location - An AbsoluteLocation object specifying the info
719 about the location.
720 o cur_feature - The feature to add the information to.
721 """
722
723
724 if location.path is not None:
725 cur_feature.ref = location.path.accession
726 cur_feature.ref_db = location.path.database
727
728 cur_feature.location = self._get_location(location.local_location)
729
731 """Return a (possibly fuzzy) location from a Range object.
732
733 Arguments:
734 o range_info - A location range (ie. something like 67..100). This
735 may also be a single position (ie 27).
736
737 This returns a FeatureLocation object.
738 If parser.use_fuzziness is set at one, the positions for the
739 end points will possibly be fuzzy.
740 """
741 from Bio import SeqFeature
742 from Bio.GenBank import LocationParser
743
744 if not(isinstance(range_info, LocationParser.Range)):
745 pos = self._get_position(range_info)
746
747
748 pos.position = pos.position - 1
749 return SeqFeature.FeatureLocation(pos, pos)
750
751 else:
752
753 start_pos = self._get_position(range_info.low)
754 end_pos = self._get_position(range_info.high)
755
756 start_pos.position, end_pos.position = \
757 self._convert_to_python_numbers(start_pos.position,
758 end_pos.position)
759
760 return SeqFeature.FeatureLocation(start_pos, end_pos)
761
763 """Return a (possibly fuzzy) position for a single coordinate.
764
765 Arguments:
766 o position - This is a LocationParser.* object that specifies
767 a single coordinate. We will examine the object to determine
768 the fuzziness of the position.
769
770 This is used with _get_location to parse out a location of any
771 end_point of arbitrary fuzziness.
772 """
773 from Bio import SeqFeature
774 from Bio.GenBank import LocationParser
775
776 if (isinstance(position, LocationParser.Integer)):
777 final_pos = SeqFeature.ExactPosition(position.val)
778
779 elif isinstance(position, LocationParser.LowBound):
780 final_pos = SeqFeature.AfterPosition(position.base.val)
781
782 elif isinstance(position, LocationParser.HighBound):
783 final_pos = SeqFeature.BeforePosition(position.base.val)
784
785 elif isinstance(position, LocationParser.Between):
786 final_pos = SeqFeature.BetweenPosition(position.low.val,
787 position.high.val)
788
789 elif isinstance(position, LocationParser.TwoBound):
790 final_pos = SeqFeature.WithinPosition(position.low.val,
791 position.high.val)
792
793 elif isinstance(position, LocationParser.Function) and \
794 position.name == "one-of":
795
796 position_choices = []
797 for arg in position.args:
798
799
800 assert isinstance(arg, LocationParser.AbsoluteLocation), \
801 "Unhandled Location type %r" % arg
802 assert arg.path is None, "Unhandled path in location"
803 position = self._get_position(arg.local_location)
804 position_choices.append(position)
805 final_pos = SeqFeature.OneOfPosition(position_choices)
806
807 else:
808 raise ValueError("Unexpected LocationParser object %r" %
809 position)
810
811
812 if self._use_fuzziness:
813 return final_pos
814
815 else:
816 return SeqFeature.ExactPosition(final_pos.location)
817
819 """Add a qualifier to the current feature without loss of info.
820
821 If there are multiple qualifier keys with the same name we
822 would lose some info in the dictionary, so we append a unique
823 number to the end of the name in case of conflicts.
824 """
825
826
827 if self._cur_qualifier_key:
828 key = self._cur_qualifier_key
829 value = "".join(self._cur_qualifier_value)
830 if self._feature_cleaner is not None:
831 value = self._feature_cleaner.clean_value(key, value)
832
833 if key in self._cur_feature.qualifiers:
834 self._cur_feature.qualifiers[key].append(value)
835
836 else:
837 self._cur_feature.qualifiers[key] = [value]
838
840 """When we get a qualifier key, use it as a dictionary key.
841
842 We receive a list of keys, since you can have valueless keys such as
843 /pseudo which would be passed in with the next key (since no other
844 tags separate them in the file)
845 """
846 for content in content_list:
847
848 self._add_qualifier()
849
850
851 qual_key = content.replace('/', '')
852 qual_key = qual_key.replace('=', '')
853 qual_key = qual_key.strip()
854
855 self._cur_qualifier_key = qual_key
856 self._cur_qualifier_value = []
857
859
860 qual_value = content.replace('"', '')
861
862 self._cur_qualifier_value.append(qual_value)
863
883
886
889
892
894 """Add up sequence information as we get it.
895
896 To try and make things speedier, this puts all of the strings
897 into a list of strings, and then uses string.join later to put
898 them together. Supposedly, this is a big time savings
899 """
900 new_seq = content.replace(' ', '')
901 new_seq = new_seq.upper()
902
903 self._seq_data.append(new_seq)
904
956
958 """Create a GenBank Record object from scanner generated information.
959 """
969
970 - def locus(self, content):
972
973 - def size(self, content):
975
978
981
982 - def date(self, content):
984
987
992
993 - def nid(self, content):
995
996 - def pid(self, content):
998
1001
1004
1005 - def gi(self, content):
1007
1010
1013
1016
1019
1022
1024 """Grab the reference number and signal the start of a new reference.
1025 """
1026
1027 if self._cur_reference is not None:
1028 self.data.references.append(self._cur_reference)
1029
1030 self._cur_reference = Record.Reference()
1031 self._cur_reference.number = content
1032
1034 self._cur_reference.bases = content
1035
1037 self._cur_reference.authors = content
1038
1040 self._cur_reference.consrtm = content
1041
def title(self, content):
    """Store content as the title of the reference currently being built."""
    reference = self._cur_reference
    reference.title = content
1044
1046 self._cur_reference.journal = content
1047
1050
1053
1055 self._cur_reference.remark = content
1056
1059
1063
1066
1068 """Get ready for the feature table when we reach the FEATURE line.
1069 """
1070 self.start_feature_table()
1071
1073 """Signal the start of the feature table.
1074 """
1075
1076 if self._cur_reference is not None:
1077 self.data.references.append(self._cur_reference)
1078
1080 """Grab the key of the feature and signal the start of a new feature.
1081 """
1082
1083 self._add_feature()
1084
1085 self._cur_feature = Record.Feature()
1086 self._cur_feature.key = content
1087
1089 """Utility function to add a feature to the Record.
1090
1091 This does all of the appropriate checking to make sure we haven't
1092 left any info behind, and that we are only adding info if it
1093 exists.
1094 """
1095 if self._cur_feature is not None:
1096
1097
1098 if self._cur_qualifier is not None:
1099 self._cur_feature.qualifiers.append(self._cur_qualifier)
1100
1101 self._cur_qualifier = None
1102 self.data.features.append(self._cur_feature)
1103
1106
1108 """Deal with qualifier names
1109
1110 We receive a list of keys, since you can have valueless keys such as
1111 /pseudo which would be passed in with the next key (since no other
1112 tags separate them in the file)
1113 """
1114 for content in content_list:
1115
1116 if content.find("/") != 0:
1117 content = "/%s" % content
1118
1119 if self._cur_qualifier is not None:
1120 self._cur_feature.qualifiers.append(self._cur_qualifier)
1121
1122 self._cur_qualifier = Record.Qualifier()
1123 self._cur_qualifier.key = content
1124
1136
1138 self.data.base_counts = content
1139
1141 self.data.origin = content
1142
1144 """Signal that we have contig information to add to the record.
1145 """
1146 self.data.contig = self._clean_location(content)
1147
1149 """Add sequence information to a list of sequence strings.
1150
1151 This removes spaces in the data and uppercases the sequence, and
1152 then adds it to a list of sequences. Later on we'll join this
1153 list together to make the final sequence. This is faster than
1154 adding on the new string every time.
1155 """
1156 new_seq = content.replace(' ', '')
1157 self._seq_data.append(new_seq.upper())
1158
1160 """Signal the end of the record and do any necessary clean-up.
1161 """
1162
1163
1164 self.data.sequence = "".join(self._seq_data)
1165
1166 self._add_feature()
1167
1168
1170 """Access GenBank using a read-only dictionary interface.
1171 """
1172 VALID_DATABASES = ['nucleotide', 'protein', 'genome']
1173 VALID_FORMATS = ['genbank', 'fasta']
def __init__(self, database, format, parser=None):
    """Initialize an NCBI dictionary to retrieve sequences.

    Arguments:
    o database - The database to query. Must be one of the class's
    VALID_DATABASES, otherwise a ValueError is raised.
    o format - The format to retrieve. Must be one of the class's
    VALID_FORMATS, otherwise a ValueError is raised. 'genbank' is stored
    as 'gb'; any other valid value is stored unchanged.
    o parser - An optional parser object, stored on self.parser, used to
    change the results into another form. If unspecified, the raw
    contents of the file are returned.
    """
    # The parser is stored before validation, so it is set even when one
    # of the checks below raises.
    self.parser = parser

    owner = self.__class__
    if database not in owner.VALID_DATABASES:
        raise ValueError("Invalid database %s, should be one of %s" %
                         (database, owner.VALID_DATABASES))
    if format not in owner.VALID_FORMATS:
        raise ValueError("Invalid format %s, should be one of %s" %
                         (format, owner.VALID_FORMATS))

    self.db = database
    # 'genbank' is the only format name that is translated.
    self.format = {"genbank": "gb"}.get(format, format)
1198
1200 raise NotImplementedError, "GenBank contains lots of entries"
1202 raise NotImplementedError, "This is a read-only dictionary"
1204 raise NotImplementedError, "This is a read-only dictionary"
1206 raise NotImplementedError, "This is a read-only dictionary"
1208 raise NotImplementedError, "You don't need to do this..."
1210 raise NotImplementedError, "You don't really want to do this..."
1212 raise NotImplementedError, "You don't really want to do this..."
1214 raise NotImplementedError, "You don't really want to do this..."
1215
1217 """S.has_key(id) -> bool"""
1218 try:
1219 self[id]
1220 except KeyError:
1221 return 0
1222 return 1
1223
def get(self, id, failobj=None):
    """Return the entry for id, or failobj if the lookup fails.

    Arguments:
    o id - The identifier to look up with self[id].
    o failobj - The value returned when self[id] raises a KeyError.
    Defaults to None.

    Only KeyError is handled; any other exception raised by the lookup
    propagates to the caller.
    """
    # Both paths return, so nothing can run after this try block. The old
    # trailing string raise was unreachable and not a valid exception.
    try:
        return self[id]
    except KeyError:
        return failobj
1230
1232 """Return the GenBank entry specified by the GenBank ID.
1233
1234 Raises a KeyError if there's an error.
1235 """
1236 handle = Entrez.efetch(db = self.db, id = id, rettype = self.format)
1237
1238 if self.parser is not None:
1239 return self.parser.parse(handle)
1240 return handle.read()
1241
def search_for(search, database='nucleotide',
               reldate=None, mindate=None, maxdate=None,
               start_id=0, max_ids=50000000):
    """search_for(search[, database][, reldate][, mindate][, maxdate]
    [, start_id][, max_ids]) -> ids

    Search GenBank and return a list of the GenBank identifiers (gi's)
    that match the criteria. search is the search string used to
    search the database. Valid values for database are
    'nucleotide', 'protein', 'popset' and 'genome'. reldate is
    the number of days prior to the current date to restrict the
    search. mindate and maxdate are the dates to restrict the search,
    e.g. 2002/01/01. start_id is the number to begin retrieval on.
    max_ids specifies the maximum number of id's to retrieve.

    If reldate is given it takes precedence over mindate and maxdate.
    A date range is only applied when mindate is given; maxdate on its
    own is ignored.
    """
    if reldate:
        date_restrict = EUtils.WithinNDays(reldate)
    elif mindate:
        date_restrict = EUtils.DateRange(mindate, maxdate)
    else:
        # No usable date information: search without a date restriction.
        date_restrict = None

    client = DBIdsClient.DBIdsClient()
    matches = client.search(search, database, daterange=date_restrict,
                            retstart=start_id, retmax=max_ids)
    # Each match contributes the first id of its dbids.
    return [match.dbids.ids[0] for match in matches]
1274
1276 """download_many(ids, database) -> handle of results
1277
1278 Download many records from GenBank. ids is a list of gis or
1279 accessions.
1280
1281 callback_fn, broken_fn, delay, faildelay, batchsize, parser are old
1282 parameter for compatibility. They should not be used.
1283 """
1284 db_ids = DBIds(database, ids)
1285 if database in ['nucleotide']:
1286 format = 'gb'
1287 elif database in ['protein']:
1288 format = 'gp'
1289 else:
1290 raise ValueError("Unexpected database: %s" % database)
1291
1292 eutils_client = DBIdsClient.from_dbids(db_ids)
1293 result_handle = eutils_client.efetch(retmode = "text", rettype = format)
1294 return cStringIO.StringIO(result_handle.read())
1295