1
2
3
4
5
6 """
7 This module provides code to work with the sprotXX.dat file from
8 SwissProt.
9 http://www.expasy.ch/sprot/sprot-top.html
10
11 Tested with:
12 Release 37, Release 38, Release 39
13
14 Limited testing with:
15 Release 51, 54
16
17
18 Classes:
19 Record Holds SwissProt data.
20 Reference Holds reference data from a SwissProt entry.
21 Iterator Iterates over entries in a SwissProt file.
22 Dictionary Accesses a SwissProt file using a dictionary interface.
23 RecordParser Parses a SwissProt record into a Record object.
24 SequenceParser Parses a SwissProt record into a SeqRecord object.
25
26 _Scanner Scans SwissProt-formatted data.
27 _RecordConsumer Consumes SwissProt data to a SProt.Record object.
28 _SequenceConsumer Consumes SwissProt data to a SeqRecord object.
29
30
31 Functions:
32 index_file Index a SwissProt file for a Dictionary.
33
34 """
35 from types import *
36 import os
37 from Bio import File
38 from Bio import Index
39 from Bio import Alphabet
40 from Bio import Seq
41 from Bio import SeqRecord
42 from Bio.ParserSupport import *
43
44 _CHOMP = " \n\r\t.,;"
45
47 """Holds information from a SwissProt record.
48
49 Members:
50 entry_name Name of this entry, e.g. RL1_ECOLI.
51 data_class Either 'STANDARD' or 'PRELIMINARY'.
52 molecule_type Type of molecule, 'PRT',
53 sequence_length Number of residues.
54
55 accessions List of the accession numbers, e.g. ['P00321']
56 created A tuple of (date, release).
57 sequence_update A tuple of (date, release).
58 annotation_update A tuple of (date, release).
59
60 description Free-format description.
61 gene_name Gene name. See userman.txt for description.
62 organism The source of the sequence.
63 organelle The origin of the sequence.
64 organism_classification The taxonomy classification. List of strings.
65 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
66 taxonomy_id A list of NCBI taxonomy id's.
67 host_organism A list of NCBI taxonomy id's of the hosts of a virus,
68 if any.
69 references List of Reference objects.
70 comments List of strings.
71 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
72 keywords List of the keywords.
73 features List of tuples (key name, from, to, description).
74 from and to can be either integers for the residue
75 numbers, '<', '>', or '?'
76
77 seqinfo tuple of (length, molecular weight, CRC32 value)
78 sequence The sequence.
79
80 """
82 self.entry_name = None
83 self.data_class = None
84 self.molecule_type = None
85 self.sequence_length = None
86
87 self.accessions = []
88 self.created = None
89 self.sequence_update = None
90 self.annotation_update = None
91
92 self.description = ''
93 self.gene_name = ''
94 self.organism = ''
95 self.organelle = ''
96 self.organism_classification = []
97 self.taxonomy_id = []
98 self.host_organism = []
99 self.references = []
100 self.comments = []
101 self.cross_references = []
102 self.keywords = []
103 self.features = []
104
105 self.seqinfo = None
106 self.sequence = ''
107
109 """Holds information from 1 references in a SwissProt entry.
110
111 Members:
112 number Number of reference in an entry.
113 positions Describes extent of work. list of strings.
114 comments Comments. List of (token, text).
115 references References. List of (dbname, identifier)
116 authors The authors of the work.
117 title Title of the work.
118 location A citation for the work.
119
120 """
122 self.number = None
123 self.positions = []
124 self.comments = []
125 self.references = []
126 self.authors = ''
127 self.title = ''
128 self.location = ''
129
131 """Returns one record at a time from a SwissProt file.
132
133 Methods:
134 next Return the next record from the stream, or None.
135
136 """
137 - def __init__(self, handle, parser=None):
138 """__init__(self, handle, parser=None)
139
140 Create a new iterator. handle is a file-like object. parser
141 is an optional Parser object to change the results into another form.
142 If set to None, then the raw contents of the file will be returned.
143
144 """
145 import warnings
146 warnings.warn("Bio.SwissProt.SProt.Iterator is deprecated. Please use the function Bio.SwissProt.parse instead if you want to get a SwissProt.SProt.Record, or Bio.SeqIO.parse if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).",
147 DeprecationWarning)
148
149 if type(handle) is not FileType and type(handle) is not InstanceType:
150 raise ValueError, "I expected a file handle or file-like object"
151 self._uhandle = File.UndoHandle(handle)
152 self._parser = parser
153
155 """next(self) -> object
156
157 Return the next swissprot record from the file. If no more records,
158 return None.
159
160 """
161 lines = []
162 while 1:
163 line = self._uhandle.readline()
164 if not line:
165 break
166 lines.append(line)
167 if line[:2] == '//':
168 break
169
170 if not lines:
171 return None
172
173 data = ''.join(lines)
174 if self._parser is not None:
175 return self._parser.parse(File.StringHandle(data))
176 return data
177
179 return iter(self.next, None)
180
182 """Accesses a SwissProt file using a dictionary interface.
183
184 """
185 __filename_key = '__filename'
186
187 - def __init__(self, indexname, parser=None):
188 """__init__(self, indexname, parser=None)
189
190 Open a SwissProt Dictionary. indexname is the name of the
191 index for the dictionary. The index should have been created
192 using the index_file function. parser is an optional Parser
193 object to change the results into another form. If set to None,
194 then the raw contents of the file will be returned.
195
196 """
197 self._index = Index.Index(indexname)
198 self._handle = open(self._index[self.__filename_key])
199 self._parser = parser
200
202 return len(self._index)
203
211
213 return getattr(self._index, name)
214
220
222 """Access SwissProt at ExPASy using a read-only dictionary interface.
223
224 """
225 - def __init__(self, delay=5.0, parser=None):
226 """__init__(self, delay=5.0, parser=None)
227
228 Create a new Dictionary to access SwissProt. parser is an optional
229 parser (e.g. SProt.RecordParser) object to change the results
230 into another form. If set to None, then the raw contents of the
231 file will be returned. delay is the number of seconds to wait
232 between each query.
233
234 """
235 import warnings
236 from Bio.WWW import RequestLimiter
237 warnings.warn("Bio.SwissProt.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_sprot_raw instead.",
238 DeprecationWarning)
239 self.parser = parser
240 self.limiter = RequestLimiter(delay)
241
243 raise NotImplementedError, "SwissProt contains lots of entries"
245 raise NotImplementedError, "This is a read-only dictionary"
247 raise NotImplementedError, "This is a read-only dictionary"
249 raise NotImplementedError, "This is a read-only dictionary"
251 raise NotImplementedError, "You don't need to do this..."
253 raise NotImplementedError, "You don't really want to do this..."
255 raise NotImplementedError, "You don't really want to do this..."
257 raise NotImplementedError, "You don't really want to do this..."
258
260 """has_key(self, id) -> bool"""
261 try:
262 self[id]
263 except KeyError:
264 return 0
265 return 1
266
267 - def get(self, id, failobj=None):
268 try:
269 return self[id]
270 except KeyError:
271 return failobj
272 raise "How did I get here?"
273
275 """__getitem__(self, id) -> object
276
277 Return a SwissProt entry. id is either the id or accession
278 for the entry. Raises a KeyError if there's an error.
279
280 """
281 from Bio.WWW import ExPASy
282
283
284 self.limiter.wait()
285
286 try:
287 handle = ExPASy.get_sprot_raw(id)
288 except IOError:
289 raise KeyError, id
290
291 if self.parser is not None:
292 return self.parser.parse(handle)
293 return handle.read()
294
296 """Parses SwissProt data into a Record object.
297
298 """
302
303 - def parse(self, handle):
304 self._scanner.feed(handle, self._consumer)
305 return self._consumer.data
306
308 """Parses SwissProt data into a standard SeqRecord object.
309 """
311 """Initialize a SequenceParser.
312
313 Arguments:
314 o alphabet - The alphabet to use for the generated Seq objects. If
315 not supplied this will default to the generic protein alphabet.
316 """
317 self._scanner = _Scanner()
318 self._consumer = _SequenceConsumer(alphabet)
319
320 - def parse(self, handle):
321 self._scanner.feed(handle, self._consumer)
322 return self._consumer.data
323
325 """Scans SwissProt-formatted data.
326
327 Tested with:
328 Release 37
329 Release 38
330 """
331
332 - def feed(self, handle, consumer):
333 """feed(self, handle, consumer)
334
335 Feed in SwissProt data for scanning. handle is a file-like
336 object that contains swissprot data. consumer is a
337 Consumer object that will receive events as the report is scanned.
338
339 """
340 if isinstance(handle, File.UndoHandle):
341 uhandle = handle
342 else:
343 uhandle = File.UndoHandle(handle)
344
345 self._scan_record(uhandle, consumer)
346
348 """Ignores any lines starting **"""
349
350
351
352
353
354
355
356 while "**" == uhandle.peekline()[:2] :
357 skip = uhandle.readline()
358
359
373
374 - def _scan_line(self, line_type, uhandle, event_fn,
375 exactly_one=None, one_or_more=None, any_number=None,
376 up_to_one=None):
394
397
403
409
413
416
420
423
427
431
435
453
457
461
465
469
483
487
493
496
500
503
506
509
512
515
518
519 _scan_fns = [
520 _scan_id,
521 _scan_ac,
522 _scan_dt,
523 _scan_de,
524 _scan_gn,
525 _scan_os,
526 _scan_og,
527 _scan_oc,
528 _scan_ox,
529 _scan_oh,
530 _scan_reference,
531 _scan_cc,
532 _scan_dr,
533 _scan_pe,
534 _scan_kw,
535 _scan_ft,
536 _scan_sq,
537 _scan_sequence_data,
538 _scan_terminator
539 ]
540
542 """Consumer that converts a SwissProt record to a Record object.
543
544 Members:
545 data Record with SwissProt data.
546
547 """
550
552 return "Bio.SwissProt.SProt._RecordConsumer()"
553
555 self.data = Record()
556 self._sequence_lines = []
557
561
563 cols = line.split()
564
565
566
567
568
569
570
571
572 if len(cols) == 6 :
573 self.data.entry_name = cols[1]
574 self.data.data_class = cols[2].rstrip(_CHOMP)
575 self.data.molecule_type = cols[3].rstrip(_CHOMP)
576 self.data.sequence_length = int(cols[4])
577 elif len(cols) == 5 :
578 self.data.entry_name = cols[1]
579 self.data.data_class = cols[2].rstrip(_CHOMP)
580 self.data.molecule_type = None
581 self.data.sequence_length = int(cols[3])
582 else :
583
584 raise ValueError("ID line has unrecognised format:\n"+line)
585
586
587
588
589 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI',
590 'Reviewed', 'Unreviewed']:
591 raise ValueError, "Unrecognized data class %s in line\n%s" % \
592 (self.data.data_class, line)
593
594
595 if self.data.molecule_type is not None \
596 and self.data.molecule_type != 'PRT':
597 raise ValueError, "Unrecognized molecule type %s in line\n%s" % \
598 (self.data.molecule_type, line)
599
606
607 - def date(self, line):
608 uprline = string.upper(line)
609 cols = line.rstrip().split()
610
611 if uprline.find('CREATED') >= 0 \
612 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \
613 or uprline.find('LAST ANNOTATION UPDATE') >= 0:
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629 uprcols = uprline.split()
630 rel_index = -1
631 for index in range(len(uprcols)):
632 if uprcols[index].find("REL.") >= 0:
633 rel_index = index
634 assert rel_index >= 0, \
635 "Could not find Rel. in DT line: %s" % (line)
636 version_index = rel_index + 1
637
638 str_version = cols[version_index].rstrip(_CHOMP)
639
640 if str_version == '':
641 version = 0
642
643 elif str_version.find(".") >= 0:
644 version = str_version
645
646 else:
647 version = int(str_version)
648
649 if uprline.find('CREATED') >= 0:
650 self.data.created = cols[1], version
651 elif uprline.find('LAST SEQUENCE UPDATE') >= 0:
652 self.data.sequence_update = cols[1], version
653 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0:
654 self.data.annotation_update = cols[1], version
655 else:
656 assert False, "Shouldn't reach this line!"
657 elif uprline.find('INTEGRATED INTO') >= 0 \
658 or uprline.find('SEQUENCE VERSION') >= 0 \
659 or uprline.find('ENTRY VERSION') >= 0:
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680 try:
681 version = int(cols[-1])
682 except ValueError :
683 version = 0
684
685
686
687 if uprline.find("INTEGRATED") >= 0:
688 self.data.created = cols[1], version
689 elif uprline.find('SEQUENCE VERSION') >= 0:
690 self.data.sequence_update = cols[1], version
691 elif uprline.find( 'ENTRY VERSION') >= 0:
692 self.data.annotation_update = cols[1], version
693 else:
694 assert False, "Shouldn't reach this line!"
695 else:
696 raise ValueError, "I don't understand the date line %s" % line
697
700
703
706
709
715
737
750
752 rn = line[5:].rstrip()
753 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
754 ref = Reference()
755 ref.number = int(rn[1:-1])
756 self.data.references.append(ref)
757
759 assert self.data.references, "RP: missing RN"
760 self.data.references[-1].positions.append(line[5:].rstrip())
761
789
791 assert self.data.references, "RX: missing RN"
792
793
794
795
796
797
798
799
800 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
801 if ind >= 0:
802 line = line[:ind]
803
804
805
806
807
808
809
810
811 if line.find("=") != -1:
812 cols = line[2:].split("; ")
813 cols = [x.strip() for x in cols]
814 cols = [x for x in cols if x]
815 for col in cols:
816 x = col.split("=")
817 assert len(x) == 2, "I don't understand RX line %s" % line
818 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
819 ref = self.data.references[-1].references
820 ref.append((key, value))
821
822 else:
823 cols = line.split()
824
825 assert len(cols) == 3, "I don't understand RX line %s" % line
826 self.data.references[-1].references.append(
827 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
828
830 assert self.data.references, "RA: missing RN"
831 ref = self.data.references[-1]
832 ref.authors += line[5:]
833
835 assert self.data.references, "RT: missing RN"
836 ref = self.data.references[-1]
837 ref.title += line[5:]
838
840 assert self.data.references, "RL: missing RN"
841 ref = self.data.references[-1]
842 ref.location += line[5:]
843
862
864
865
866
867
868 line = line[5:]
869
870 i = line.find('[')
871 if i >= 0:
872 line = line[:i]
873 cols = line.rstrip(_CHOMP).split(';')
874 cols = [col.lstrip() for col in cols]
875 self.data.cross_references.append(tuple(cols))
876
880
882 line = line[5:]
883 name = line[0:8].rstrip()
884 try:
885 from_res = int(line[9:15])
886 except ValueError:
887 from_res = line[9:15].lstrip()
888 try:
889 to_res = int(line[16:22])
890 except ValueError:
891 to_res = line[16:22].lstrip()
892 description = line[29:70].rstrip()
893
894 if line[29:35]==r"/FTId=":
895 ft_id = line[35:70].rstrip()[:-1]
896 else:
897 ft_id =""
898 if not name:
899 assert not from_res and not to_res
900 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1]
901 del self.data.features[-1]
902 description = "%s %s" % (old_description, description)
903
904
905 if name == "VARSPLIC":
906 description = self._fix_varsplic_sequences(description)
907 self.data.features.append((name, from_res, to_res, description,ft_id))
908
910 """Remove unwanted spaces in sequences.
911
912 During line carryover, the sequences in VARSPLIC can get mangled
913 with unwanted spaces like:
914 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
915 We want to check for this case and correct it as it happens.
916 """
917 descr_cols = description.split(" -> ")
918 if len(descr_cols) == 2:
919 first_seq = descr_cols[0]
920 second_seq = descr_cols[1]
921 extra_info = ''
922
923
924 extra_info_pos = second_seq.find(" (")
925 if extra_info_pos != -1:
926 extra_info = second_seq[extra_info_pos:]
927 second_seq = second_seq[:extra_info_pos]
928
929
930 first_seq = first_seq.replace(" ", "")
931 second_seq = second_seq.replace(" ", "")
932
933
934 description = first_seq + " -> " + second_seq + extra_info
935
936 return description
937
941
943 cols = line.split()
944 assert len(cols) == 8, "I don't understand SQ line %s" % line
945
946 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
947
949
950 self._sequence_lines.append(line.replace(" ", "").rstrip())
951
954
955
956
957
958
959
961
962 members = ['description', 'gene_name', 'organism', 'organelle']
963 for m in members:
964 attr = getattr(rec, m)
965 setattr(rec, m, attr.rstrip())
966 for ref in rec.references:
967 self._clean_references(ref)
968
970
971 members = ['authors', 'title', 'location']
972 for m in members:
973 attr = getattr(ref, m)
974 setattr(ref, m, attr.rstrip())
975
977 """Consumer that converts a SwissProt record to a SeqRecord object.
978
979 Members:
980 data Record with SwissProt data.
981 alphabet The alphabet the generated Seq objects will have.
982 """
983
985 """Initialize a Sequence Consumer
986
987 Arguments:
988 o alphabet - The alphabet to use for the generated Seq objects. If
989 not supplied this will default to the generic protein alphabet.
990 """
991 self.data = None
992 self.alphabet = alphabet
993
1001
1008
1012
1027
1028
1032
1034
1035 self._sequence_lines.append(line.replace(" ", "").rstrip())
1036
1043
1052
1053
1072
1073
1074
1075
1076 - def date(self, line):
1077 date_str = line.split()[0]
1078 uprline = string.upper(line)
1079 if uprline.find('CREATED') >= 0 :
1080
1081
1082 self.data.annotations['date'] = date_str
1083 elif uprline.find('LAST SEQUENCE UPDATE') >= 0 :
1084
1085 self.data.annotations['date_last_sequence_update'] = date_str
1086 elif uprline.find('LAST ANNOTATION UPDATE') >= 0:
1087
1088 self.data.annotations['date_last_annotation_update'] = date_str
1089
1102
1112
1129
1142
1163
1175
1177 """RP line, reference position."""
1178 assert self._current_ref is not None, "RP: missing RN"
1179
1180
1181
1182 pass
1183
1185 """RX line, reference cross-references."""
1186 assert self._current_ref is not None, "RX: missing RN"
1187
1188
1189
1190
1191
1192
1193 if line.find("=") != -1:
1194 cols = line[2:].split("; ")
1195 cols = [x.strip() for x in cols]
1196 cols = [x for x in cols if x]
1197 for col in cols:
1198 x = col.split("=")
1199 assert len(x) == 2, "I don't understand RX line %s" % line
1200 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
1201 if key == "MEDLINE" :
1202 self._current_ref.medline_id = value
1203 elif key == "PubMed" :
1204 self._current_ref.pubmed_id = value
1205 else :
1206
1207
1208 pass
1209
1210 else:
1211
1212
1213
1214
1215 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
1216 if ind >= 0:
1217 line = line[:ind]
1218 cols = line.split()
1219
1220 assert len(cols) == 3, "I don't understand RX line %s" % line
1221 key = cols[1].rstrip(_CHOMP)
1222 value = cols[2].rstrip(_CHOMP)
1223 if key == "MEDLINE" :
1224 self._current_ref.medline_id = value
1225 elif key == "PubMed" :
1226 self._current_ref.pubmed_id = value
1227 else :
1228
1229
1230 pass
1231
1233 """RA line, reference author(s)."""
1234 assert self._current_ref is not None, "RA: missing RN"
1235 self._current_ref.authors += line[5:].rstrip("\n")
1236
1238 """RT line, reference title."""
1239 assert self._current_ref is not None, "RT: missing RN"
1240 self._current_ref.title += line[5:].rstrip("\n")
1241
1243 """RL line, reference 'location' - journal, volume, pages, year."""
1244 assert self._current_ref is not None, "RL: missing RN"
1245 self._current_ref.journal += line[5:].rstrip("\n")
1246
1253
1254 -def index_file(filename, indexname, rec2key=None):
1255 """index_file(filename, indexname, rec2key=None)
1256
1257 Index a SwissProt file. filename is the name of the file.
1258 indexname is the name of the dictionary. rec2key is an
1259 optional callback that takes a Record and generates a unique key
1260 (e.g. the accession number) for the record. If not specified,
1261 the entry name will be used.
1262
1263 """
1264 from Bio.SwissProt import parse
1265 if not os.path.exists(filename):
1266 raise ValueError, "%s does not exist" % filename
1267
1268 index = Index.Index(indexname, truncate=1)
1269 index[Dictionary._Dictionary__filename_key] = filename
1270
1271 handle = open(filename)
1272 records = parse(handle)
1273 end = 0L
1274 for record in records:
1275 start = end
1276 end = long(handle.tell())
1277 length = end - start
1278
1279 if rec2key is not None:
1280 key = rec2key(record)
1281 else:
1282 key = record.entry_name
1283
1284 if not key:
1285 raise KeyError, "empty sequence key was produced"
1286 elif index.has_key(key):
1287 raise KeyError, "duplicate key %s found" % key
1288
1289 index[key] = start, length
1290