1
2
3
4
5
6 """
7 This module provides code to work with the sprotXX.dat file from
8 SwissProt.
9 http://www.expasy.ch/sprot/sprot-top.html
10
11 Tested with:
12 Release 37, Release 38, Release 39
13
14 Limited testing with:
15 Release 51, 54
16
17
18 Classes:
19 Record Holds SwissProt data.
20 Reference Holds reference data from a SwissProt entry.
21 Iterator Iterates over entries in a SwissProt file.
22 Dictionary Accesses a SwissProt file using a dictionary interface.
23 ExPASyDictionary Accesses SwissProt records from ExPASy.
24 RecordParser Parses a SwissProt record into a Record object.
25 SequenceParser Parses a SwissProt record into a SeqRecord object.
26
27 _Scanner Scans SwissProt-formatted data.
28 _RecordConsumer Consumes SwissProt data to a Record object.
29 _SequenceConsumer Consumes SwissProt data to a Seq object.
30
31
32 Functions:
33 index_file Index a SwissProt file for a Dictionary.
34
35 """
36 from types import *
37 import os
38 from Bio import File
39 from Bio import Index
40 from Bio import Alphabet
41 from Bio import Seq
42 from Bio import SeqRecord
43 from Bio.ParserSupport import *
44 from Bio.WWW import ExPASy
45 from Bio.WWW import RequestLimiter
46
47 _CHOMP = " \n\r\t.,;"
48
50 """Holds information from a SwissProt record.
51
52 Members:
53 entry_name Name of this entry, e.g. RL1_ECOLI.
54 data_class Either 'STANDARD' or 'PRELIMINARY'.
55 molecule_type Type of molecule, 'PRT',
56 sequence_length Number of residues.
57
58 accessions List of the accession numbers, e.g. ['P00321']
59 created A tuple of (date, release).
60 sequence_update A tuple of (date, release).
61 annotation_update A tuple of (date, release).
62
63 description Free-format description.
64 gene_name Gene name. See userman.txt for description.
65 organism The source of the sequence.
66 organelle The origin of the sequence.
67 organism_classification The taxonomy classification. List of strings.
68 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
69 taxonomy_id A list of NCBI taxonomy id's.
70 host_organism A list of NCBI taxonomy id's of the hosts of a virus,
71 if any.
72 references List of Reference objects.
73 comments List of strings.
74 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
75 keywords List of the keywords.
76 features List of tuples (key name, from, to, description).
77 from and to can be either integers for the residue
78 numbers, '<', '>', or '?'
79
80 seqinfo tuple of (length, molecular weight, CRC32 value)
81 sequence The sequence.
82
83 """
85 self.entry_name = None
86 self.data_class = None
87 self.molecule_type = None
88 self.sequence_length = None
89
90 self.accessions = []
91 self.created = None
92 self.sequence_update = None
93 self.annotation_update = None
94
95 self.description = ''
96 self.gene_name = ''
97 self.organism = ''
98 self.organelle = ''
99 self.organism_classification = []
100 self.taxonomy_id = []
101 self.host_organism = []
102 self.references = []
103 self.comments = []
104 self.cross_references = []
105 self.keywords = []
106 self.features = []
107
108 self.seqinfo = None
109 self.sequence = ''
110
112 """Holds information from 1 references in a SwissProt entry.
113
114 Members:
115 number Number of reference in an entry.
116 positions Describes extent of work. list of strings.
117 comments Comments. List of (token, text).
118 references References. List of (dbname, identifier)
119 authors The authors of the work.
120 title Title of the work.
121 location A citation for the work.
122
123 """
125 self.number = None
126 self.positions = []
127 self.comments = []
128 self.references = []
129 self.authors = ''
130 self.title = ''
131 self.location = ''
132
134 """Returns one record at a time from a SwissProt file.
135
136 Methods:
137 next Return the next record from the stream, or None.
138
139 """
140 - def __init__(self, handle, parser=None):
141 """__init__(self, handle, parser=None)
142
143 Create a new iterator. handle is a file-like object. parser
144 is an optional Parser object to change the results into another form.
145 If set to None, then the raw contents of the file will be returned.
146
147 """
148 if type(handle) is not FileType and type(handle) is not InstanceType:
149 raise ValueError, "I expected a file handle or file-like object"
150 self._uhandle = File.UndoHandle(handle)
151 self._parser = parser
152
154 """next(self) -> object
155
156 Return the next swissprot record from the file. If no more records,
157 return None.
158
159 """
160 lines = []
161 while 1:
162 line = self._uhandle.readline()
163 if not line:
164 break
165 lines.append(line)
166 if line[:2] == '//':
167 break
168
169 if not lines:
170 return None
171
172 data = ''.join(lines)
173 if self._parser is not None:
174 return self._parser.parse(File.StringHandle(data))
175 return data
176
178 return iter(self.next, None)
179
181 """Accesses a SwissProt file using a dictionary interface.
182
183 """
184 __filename_key = '__filename'
185
186 - def __init__(self, indexname, parser=None):
187 """__init__(self, indexname, parser=None)
188
189 Open a SwissProt Dictionary. indexname is the name of the
190 index for the dictionary. The index should have been created
191 using the index_file function. parser is an optional Parser
192 object to change the results into another form. If set to None,
193 then the raw contents of the file will be returned.
194
195 """
196 self._index = Index.Index(indexname)
197 self._handle = open(self._index[self.__filename_key])
198 self._parser = parser
199
201 return len(self._index)
202
210
212 return getattr(self._index, name)
213
219
221 """Access SwissProt at ExPASy using a read-only dictionary interface.
222
223 """
224 - def __init__(self, delay=5.0, parser=None):
225 """__init__(self, delay=5.0, parser=None)
226
227 Create a new Dictionary to access SwissProt. parser is an optional
228 parser (e.g. SProt.RecordParser) object to change the results
229 into another form. If set to None, then the raw contents of the
230 file will be returned. delay is the number of seconds to wait
231 between each query.
232
233 """
234 self.parser = parser
235 self.limiter = RequestLimiter(delay)
236
238 raise NotImplementedError, "SwissProt contains lots of entries"
240 raise NotImplementedError, "This is a read-only dictionary"
242 raise NotImplementedError, "This is a read-only dictionary"
244 raise NotImplementedError, "This is a read-only dictionary"
246 raise NotImplementedError, "You don't need to do this..."
248 raise NotImplementedError, "You don't really want to do this..."
250 raise NotImplementedError, "You don't really want to do this..."
252 raise NotImplementedError, "You don't really want to do this..."
253
255 """has_key(self, id) -> bool"""
256 try:
257 self[id]
258 except KeyError:
259 return 0
260 return 1
261
262 - def get(self, id, failobj=None):
263 try:
264 return self[id]
265 except KeyError:
266 return failobj
267 raise "How did I get here?"
268
270 """__getitem__(self, id) -> object
271
272 Return a SwissProt entry. id is either the id or accession
273 for the entry. Raises a KeyError if there's an error.
274
275 """
276
277
278 self.limiter.wait()
279
280 try:
281 handle = ExPASy.get_sprot_raw(id)
282 except IOError:
283 raise KeyError, id
284
285 if self.parser is not None:
286 return self.parser.parse(handle)
287 return handle.read()
288
290 """Parses SwissProt data into a Record object.
291
292 """
296
297 - def parse(self, handle):
298 self._scanner.feed(handle, self._consumer)
299 return self._consumer.data
300
302 """Parses SwissProt data into a standard SeqRecord object.
303 """
305 """Initialize a SequenceParser.
306
307 Arguments:
308 o alphabet - The alphabet to use for the generated Seq objects. If
309 not supplied this will default to the generic protein alphabet.
310 """
311 self._scanner = _Scanner()
312 self._consumer = _SequenceConsumer(alphabet)
313
314 - def parse(self, handle):
315 self._scanner.feed(handle, self._consumer)
316 return self._consumer.data
317
319 """Scans SwissProt-formatted data.
320
321 Tested with:
322 Release 37
323 Release 38
324 """
325
326 - def feed(self, handle, consumer):
327 """feed(self, handle, consumer)
328
329 Feed in SwissProt data for scanning. handle is a file-like
330 object that contains swissprot data. consumer is a
331 Consumer object that will receive events as the report is scanned.
332
333 """
334 if isinstance(handle, File.UndoHandle):
335 uhandle = handle
336 else:
337 uhandle = File.UndoHandle(handle)
338
339 while uhandle.peekline():
340 self._scan_record(uhandle, consumer)
341
343 """Ignores any lines starting **"""
344
345
346
347
348
349
350
351 while "**" == uhandle.peekline()[:2] :
352 skip = uhandle.readline()
353
354
368
369 - def _scan_line(self, line_type, uhandle, event_fn,
370 exactly_one=None, one_or_more=None, any_number=None,
371 up_to_one=None):
389
392
398
404
408
411
415
418
422
426
430
448
452
456
460
464
478
482
488
491
495
498
501
504
507
510
513
514 _scan_fns = [
515 _scan_id,
516 _scan_ac,
517 _scan_dt,
518 _scan_de,
519 _scan_gn,
520 _scan_os,
521 _scan_og,
522 _scan_oc,
523 _scan_ox,
524 _scan_oh,
525 _scan_reference,
526 _scan_cc,
527 _scan_dr,
528 _scan_pe,
529 _scan_kw,
530 _scan_ft,
531 _scan_sq,
532 _scan_sequence_data,
533 _scan_terminator
534 ]
535
537 """Consumer that converts a SwissProt record to a Record object.
538
539 Members:
540 data Record with SwissProt data.
541
542 """
545
547 return "Bio.SwissProt.SProt._RecordConsumer()"
548
551
554
556 cols = line.split()
557
558
559
560
561
562
563
564
565 if len(cols) == 6 :
566 self.data.entry_name = cols[1]
567 self.data.data_class = cols[2].rstrip(_CHOMP)
568 self.data.molecule_type = cols[3].rstrip(_CHOMP)
569 self.data.sequence_length = int(cols[4])
570 elif len(cols) == 5 :
571 self.data.entry_name = cols[1]
572 self.data.data_class = cols[2].rstrip(_CHOMP)
573 self.data.molecule_type = None
574 self.data.sequence_length = int(cols[3])
575 else :
576
577 raise SyntaxError("ID line has unrecognised format:\n"+line)
578
579
580
581
582 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI',
583 'Reviewed', 'Unreviewed']:
584 raise SyntaxError, "Unrecognized data class %s in line\n%s" % \
585 (self.data.data_class, line)
586
587
588 if self.data.molecule_type is not None \
589 and self.data.molecule_type != 'PRT':
590 raise SyntaxError, "Unrecognized molecule type %s in line\n%s" % \
591 (self.data.molecule_type, line)
592
594 cols = line[5:].rstrip(_CHOMP).split(';')
595 for ac in cols:
596 self.data.accessions.append(ac.lstrip())
597
598 - def date(self, line):
599 uprline = string.upper(line)
600 cols = line.rstrip().split()
601
602 if uprline.find('CREATED') >= 0 \
603 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \
604 or uprline.find('LAST ANNOTATION UPDATE') >= 0:
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620 uprcols = uprline.split()
621 rel_index = -1
622 for index in range(len(uprcols)):
623 if uprcols[index].find("REL.") >= 0:
624 rel_index = index
625 assert rel_index >= 0, \
626 "Could not find Rel. in DT line: %s" % (line)
627 version_index = rel_index + 1
628
629 str_version = cols[version_index].rstrip(_CHOMP)
630
631 if str_version == '':
632 version = 0
633
634 elif str_version.find(".") >= 0:
635 version = str_version
636
637 else:
638 version = int(str_version)
639
640 if uprline.find('CREATED') >= 0:
641 self.data.created = cols[1], version
642 elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
643 self.data.sequence_update = cols[1], version
644 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0:
645 self.data.annotation_update = cols[1], version
646 else:
647 assert False, "Shouldn't reach this line!"
648 elif uprline.find('INTEGRATED INTO') >= 0 \
649 or uprline.find('SEQUENCE VERSION') >= 0 \
650 or uprline.find('ENTRY VERSION') >= 0:
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671 try:
672 version = int(cols[-1])
673 except ValueError :
674 version = 0
675
676
677
678 if uprline.find("INTEGRATED") >= 0:
679 self.data.created = cols[1], version
680 elif uprline.find('SEQUENCE VERSION') >= 0:
681 self.data.sequence_update = cols[1], version
682 elif uprline.find( 'ENTRY VERSION') >= 0:
683 self.data.annotation_update = cols[1], version
684 else:
685 assert False, "Shouldn't reach this line!"
686 else:
687 raise SyntaxError, "I don't understand the date line %s" % line
688
691
694
697
700
706
728
741
743 rn = line[5:].rstrip()
744 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
745 ref = Reference()
746 ref.number = int(rn[1:-1])
747 self.data.references.append(ref)
748
750 assert self.data.references, "RP: missing RN"
751 self.data.references[-1].positions.append(line[5:].rstrip())
752
780
782 assert self.data.references, "RX: missing RN"
783
784
785
786
787
788
789
790
791 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
792 if ind >= 0:
793 line = line[:ind]
794
795
796
797
798
799
800
801
802 if line.find("=") != -1:
803 cols = line[2:].split("; ")
804 cols = [x.strip() for x in cols]
805 cols = [x for x in cols if x]
806 for col in cols:
807 x = col.split("=")
808 assert len(x) == 2, "I don't understand RX line %s" % line
809 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
810 ref = self.data.references[-1].references
811 ref.append((key, value))
812
813 else:
814 cols = line.split()
815
816 assert len(cols) == 3, "I don't understand RX line %s" % line
817 self.data.references[-1].references.append(
818 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
819
821 assert self.data.references, "RA: missing RN"
822 ref = self.data.references[-1]
823 ref.authors = ref.authors + line[5:]
824
826 assert self.data.references, "RT: missing RN"
827 ref = self.data.references[-1]
828 ref.title = ref.title + line[5:]
829
831 assert self.data.references, "RL: missing RN"
832 ref = self.data.references[-1]
833 ref.location = ref.location + line[5:]
834
853
855
856
857
858
859 line = line[5:]
860
861 i = line.find('[')
862 if i >= 0:
863 line = line[:i]
864 cols = line.rstrip(_CHOMP).split(';')
865 cols = [col.lstrip() for col in cols]
866 self.data.cross_references.append(tuple(cols))
867
871
873 line = line[5:]
874 name = line[0:8].rstrip()
875 try:
876 from_res = int(line[9:15])
877 except ValueError:
878 from_res = line[9:15].lstrip()
879 try:
880 to_res = int(line[16:22])
881 except ValueError:
882 to_res = line[16:22].lstrip()
883 description = line[29:70].rstrip()
884
885 if line[29:35]==r"/FTId=":
886 ft_id = line[35:70].rstrip()[:-1]
887 else:
888 ft_id =""
889 if not name:
890 assert not from_res and not to_res
891 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1]
892 del self.data.features[-1]
893 description = "%s %s" % (old_description, description)
894
895
896 if name == "VARSPLIC":
897 description = self._fix_varsplic_sequences(description)
898 self.data.features.append((name, from_res, to_res, description,ft_id))
899
901 """Remove unwanted spaces in sequences.
902
903 During line carryover, the sequences in VARSPLIC can get mangled
904 with unwanted spaces like:
905 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
906 We want to check for this case and correct it as it happens.
907 """
908 descr_cols = description.split(" -> ")
909 if len(descr_cols) == 2:
910 first_seq = descr_cols[0]
911 second_seq = descr_cols[1]
912 extra_info = ''
913
914
915 extra_info_pos = second_seq.find(" (")
916 if extra_info_pos != -1:
917 extra_info = second_seq[extra_info_pos:]
918 second_seq = second_seq[:extra_info_pos]
919
920
921 first_seq = first_seq.replace(" ", "")
922 second_seq = second_seq.replace(" ", "")
923
924
925 description = first_seq + " -> " + second_seq + extra_info
926
927 return description
928
932
934 cols = line.split()
935 assert len(cols) == 8, "I don't understand SQ line %s" % line
936
937 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
938
942
945
946
947
948
949
950
952
953 members = ['description', 'gene_name', 'organism', 'organelle']
954 for m in members:
955 attr = getattr(rec, m)
956 setattr(rec, m, attr.rstrip())
957 for ref in rec.references:
958 self._clean_references(ref)
959
961
962 members = ['authors', 'title', 'location']
963 for m in members:
964 attr = getattr(ref, m)
965 setattr(ref, m, attr.rstrip())
966
968 """Consumer that converts a SwissProt record to a SeqRecord object.
969
970 Members:
971 data Record with SwissProt data.
972 alphabet The alphabet the generated Seq objects will have.
973 """
974
976 """Initialize a Sequence Consumer
977
978 Arguments:
979 o alphabet - The alphabet to use for the generated Seq objects. If
980 not supplied this will default to the generic protein alphabet.
981 """
982 self.data = None
983 self.alphabet = alphabet
984
990
993
997
1010
1011
1015
1020
1027
1036
1037
1038 - def date(self, line):
1039 date_str = line.split()[0]
1040 uprline = string.upper(line)
1041 if uprline.find('CREATED') >= 0 :
1042
1043
1044 self.data.annotations['date'] = date_str
1045 elif uprline.find('LAST SEQUENCE UPDATE') >= 0 :
1046
1047 self.data.annotations['date_last_sequence_update'] = date_str
1048 elif uprline.find('LAST ANNOTATION UPDATE') >= 0:
1049
1050 self.data.annotations['date_last_annotation_update'] = date_str
1051
1064
1074
1092
1111
1112 -def index_file(filename, indexname, rec2key=None):
1113 """index_file(filename, indexname, rec2key=None)
1114
1115 Index a SwissProt file. filename is the name of the file.
1116 indexname is the name of the dictionary. rec2key is an
1117 optional callback that takes a Record and generates a unique key
1118 (e.g. the accession number) for the record. If not specified,
1119 the entry name will be used.
1120
1121 """
1122 if not os.path.exists(filename):
1123 raise ValueError, "%s does not exist" % filename
1124
1125 index = Index.Index(indexname, truncate=1)
1126 index[Dictionary._Dictionary__filename_key] = filename
1127
1128 iter = Iterator(open(filename), parser=RecordParser())
1129 while 1:
1130 start = iter._uhandle.tell()
1131 rec = iter.next()
1132 length = iter._uhandle.tell() - start
1133
1134 if rec is None:
1135 break
1136 if rec2key is not None:
1137 key = rec2key(rec)
1138 else:
1139 key = rec.entry_name
1140
1141 if not key:
1142 raise KeyError, "empty sequence key was produced"
1143 elif index.has_key(key):
1144 raise KeyError, "duplicate key %s found" % key
1145
1146 index[key] = start, length
1147
1148 if __name__ == "__main__" :
1149 print "Quick self test..."
1150
1151 example_filename = "../../Tests/SwissProt/sp008"
1152
1153 import os
1154 if not os.path.isfile(example_filename) :
1155 print "Missing test file %s" % example_filename
1156 else :
1157
1158
1159 handle = open(example_filename)
1160 for record in Iterator(handle, RecordParser()) :
1161 print record.entry_name
1162 print ",".join(record.accessions)
1163 print record.keywords
1164 print repr(record.organism)
1165 print record.sequence[:20] + "..."
1166 handle.close()
1167
1168 handle = open(example_filename)
1169 for record in Iterator(handle, SequenceParser()) :
1170 print record.name
1171 print record.id
1172 print record.annotations['keywords']
1173 print repr(record.annotations['organism'])
1174 print record.seq.tostring()[:20] + "..."
1175 handle.close()
1176