1
2
3
4
5
6
7 """
8 This module provides code to work with the prosite dat file from
9 Prosite.
10 http://www.expasy.ch/prosite/
11
12 Tested with:
13 Release 15.0, July 1998
14 Release 16.0, July 1999
15 Release 17.0, Dec 2001
16 Release 19.0, Mar 2006
17
18
19 Functions:
20 parse Iterates over entries in a Prosite file.
21 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
22 index_file Index a Prosite file for a Dictionary.
23 _extract_record Extract Prosite data from a web page.
24 _extract_pattern_hits Extract Prosite patterns from a web page.
25
26
27 Classes:
28 Record Holds Prosite data.
29 PatternHit Holds data from a hit against a Prosite pattern.
30 Dictionary Accesses a Prosite file using a dictionary interface.
31 RecordParser Parses a Prosite record into a Record object.
32 Iterator Iterates over entries in a Prosite file; DEPRECATED.
33
34 _Scanner Scans Prosite-formatted data.
35 _RecordConsumer Consumes Prosite data to a Record object.
36
37 """
38 from types import *
39 import re
40 import sgmllib
41 from Bio import File
42 from Bio import Index
43 from Bio.ParserSupport import *
44
45
46
47
48
62
77
79 """Holds information from a Prosite record.
80
81 Members:
82 name ID of the record. e.g. ADH_ZINC
83 type Type of entry. e.g. PATTERN, MATRIX, or RULE
84 accession e.g. PS00387
85 created Date the entry was created. (MMM-YYYY)
86 data_update Date the 'primary' data was last updated.
87 info_update Date data other than 'primary' data was last updated.
88 pdoc ID of the PROSITE DOCumentation.
89
90 description Free-format description.
91 pattern The PROSITE pattern. See docs.
92 matrix List of strings that describes a matrix entry.
93 rules List of rule definitions (from RU lines). (strings)
94 prorules List of prorules (from PR lines). (strings)
95
96 NUMERICAL RESULTS
97 nr_sp_release SwissProt release.
98 nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
99 nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
100 nr_positive True positives. tuple of (hits, seqs)
101 nr_unknown Could be positives. tuple of (hits, seqs)
102 nr_false_pos False positives. tuple of (hits, seqs)
103 nr_false_neg False negatives. (int)
104 nr_partial False negatives, because they are fragments. (int)
105
106 COMMENTS
107 cc_taxo_range Taxonomic range. See docs for format
108 cc_max_repeat Maximum number of repetitions in a protein
109 cc_site Interesting site. list of tuples (pattern pos, desc.)
110 cc_skip_flag Can this entry be ignored?
111 cc_matrix_type
112 cc_scaling_db
113 cc_author
114 cc_ft_key
115 cc_ft_desc
116 cc_version version number (introduced in release 19.0)
117
118 DATA BANK REFERENCES - The following are all
119 lists of tuples (swiss-prot accession,
120 swiss-prot name)
121 dr_positive
122 dr_false_neg
123 dr_false_pos
124 dr_potential Potential hits, but fingerprint region not yet available.
125 dr_unknown Could possibly belong
126
127 pdb_structs List of PDB entries.
128
129 """
def __init__(self):
    """__init__(self)

    Create an empty Prosite record.  Every member listed in the class
    documentation is given a neutral default so that reading any of them
    on a freshly created record never raises AttributeError.

    """
    # Header information.
    self.name = ''
    self.type = ''
    self.accession = ''
    self.created = ''
    self.data_update = ''
    self.info_update = ''
    self.pdoc = ''

    # Description of the entry itself.
    self.description = ''
    self.pattern = ''
    self.matrix = []
    self.rules = []
    self.prorules = []
    self.postprocessing = []

    # Numerical results.  The tuple members are (hits, seqs).
    self.nr_sp_release = ''
    self.nr_sp_seqs = ''
    self.nr_total = (None, None)
    self.nr_positive = (None, None)
    self.nr_unknown = (None, None)
    self.nr_false_pos = (None, None)
    self.nr_false_neg = None
    self.nr_partial = None

    # Comments.
    self.cc_taxo_range = ''
    self.cc_max_repeat = ''
    self.cc_site = []
    self.cc_skip_flag = ''
    # These are documented members of the record but were previously
    # never initialised here.
    self.cc_matrix_type = ''
    self.cc_scaling_db = ''
    self.cc_author = ''
    self.cc_ft_key = ''
    self.cc_ft_desc = ''
    self.cc_version = ''

    # Data bank references: lists of (accession, name) tuples.
    self.dr_positive = []
    self.dr_false_neg = []
    self.dr_false_pos = []
    self.dr_potential = []
    self.dr_unknown = []

    self.pdb_structs = []
167
169 """Holds information from a hit against a Prosite pattern.
170
171 Members:
172 name ID of the record. e.g. ADH_ZINC
173 accession e.g. PS00387
174 pdoc ID of the PROSITE DOCumentation.
175 description Free-format description.
176 matches List of tuples (start, end, sequence) where
177 start and end are indexes of the match, and sequence is
178 the sequence matched.
179
180 """
def __str__(self):
    """__str__(self) -> str

    Build a multi-line text report of the hit: a header line with the
    accession, pdoc and name, the description, a blank line, and then
    one line per match.  Matches are numbered only when there is more
    than one.

    """
    hits = self.matches
    several = len(hits) > 1
    report = [
        "%s %s %s" % (self.accession, self.pdoc, self.name),
        self.description,
        '',
    ]
    if several:
        report.append("Number of matches: %s" % len(hits))
    for number, (start, end, seq) in enumerate(hits):
        span = "%d-%d" % (start, end)
        if several:
            report.append("%7d %10s %s" % (number + 1, span, seq))
        else:
            # A lone match gets a blank in place of its number.
            report.append("%7s %10s %s" % (' ', span, seq))
    return "\n".join(report)
202
204 """Returns one record at a time from a Prosite file.
205
206 Methods:
207 next Return the next record from the stream, or None.
208
209 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Create a new iterator.  handle is a file-like object.  parser
    is an optional Parser object to change the results into another form.
    If set to None, then the raw contents of the file will be returned.

    Raises a ValueError if handle does not look like a file-like object
    (i.e. it has no readline method).

    """
    import warnings
    warnings.warn("Bio.Prosite.Iterator is deprecated; we recommend using the function Bio.Prosite.parse instead. Please contact the Biopython developers at biopython-dev@biopython.org if you cannot use Bio.Prosite.parse instead of Bio.Prosite.Iterator.",
                  DeprecationWarning)
    # Duck-type the handle instead of comparing exact types: the old
    # FileType/InstanceType test rejected perfectly good file-like
    # objects that are neither real files nor old-style instances.
    # NOTE(review): assumes File.UndoHandle only needs readline() from
    # the wrapped handle -- confirm against Bio.File.
    if not hasattr(handle, "readline"):
        raise ValueError("I expected a file handle or file-like object")
    self._uhandle = File.UndoHandle(handle)
    self._parser = parser
225
def next(self):
    """next(self) -> object

    Return the next Prosite record from the file.  If no more records,
    return None.  A leading block of 'CC' (copyright) lines, up to and
    including its '//' terminator, is skipped first; a ValueError is
    raised if that block contains a line that is neither 'CC' nor '//'.

    """
    readline = self._uhandle.readline

    # Skip the copyright notice, if the stream is positioned on one.
    if self._uhandle.peekline()[:2] == 'CC':
        while True:
            line = readline()
            if not line or line[:2] == '//':
                break
            if line[:2] != 'CC':
                raise ValueError("Oops, where's the copyright?")

    # Collect everything up to and including the record terminator.
    record_lines = []
    while True:
        line = readline()
        if not line:
            break
        record_lines.append(line)
        if line[:2] == '//':
            break

    if not record_lines:
        return None

    text = "".join(record_lines)
    if self._parser is None:
        return text
    return self._parser.parse(File.StringHandle(text))
262
264 return iter(self.next, None)
265
267 """Accesses a Prosite file using a dictionary interface.
268
269 """
270 __filename_key = '__filename'
271
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Prosite Dictionary backed by the index called indexname
    (as produced by the index_file function).  The data file whose
    name is stored in the index is opened for reading.  parser is an
    optional Parser object used to convert the results; when it is
    None the raw contents of the file are returned.

    """
    index = Index.Index(indexname)
    self._index = index
    # The index stores the name of the data file under a reserved key.
    data_filename = index[Dictionary.__filename_key]
    self._handle = open(data_filename)
    self._parser = parser
285
287 return len(self._index)
288
296
298 return getattr(self._index, name)
299
301 """Access PROSITE at ExPASy using a read-only dictionary interface.
302
303 """
def __init__(self, delay=5.0, parser=None):
    """__init__(self, delay=5.0, parser=None)

    Create a new Dictionary to access PROSITE.  delay is the number of
    seconds to wait between each query.  parser is an optional parser
    (e.g. Prosite.RecordParser) object used to convert the results;
    when it is None the raw contents of the file are returned.

    Emits a DeprecationWarning.

    """
    import warnings
    from Bio.WWW import RequestLimiter
    message = "Bio.Prosite.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_prosite_raw instead."
    warnings.warn(message, DeprecationWarning)
    self.parser = parser
    self.limiter = RequestLimiter(delay)
320
322 raise NotImplementedError, "Prosite contains lots of entries"
324 raise NotImplementedError, "This is a read-only dictionary"
326 raise NotImplementedError, "This is a read-only dictionary"
328 raise NotImplementedError, "This is a read-only dictionary"
330 raise NotImplementedError, "You don't need to do this..."
332 raise NotImplementedError, "You don't really want to do this..."
334 raise NotImplementedError, "You don't really want to do this..."
336 raise NotImplementedError, "You don't really want to do this..."
337
def has_key(self, id):
    """has_key(self, id) -> bool

    Return 1 if looking up id on this object succeeds, 0 if the lookup
    raises a KeyError.

    """
    try:
        self[id]
    except KeyError:
        found = 0
    else:
        found = 1
    return found
345
def get(self, id, failobj=None):
    """get(self, id, failobj=None) -> object

    Return the entry for id, or failobj if looking it up raises a
    KeyError.

    """
    # The old trailing 'raise "How did I get here?"' was unreachable
    # (both paths above it return) and used a string exception, so it
    # has been dropped.
    try:
        return self[id]
    except KeyError:
        return failobj
352
def __getitem__(self, id):
    """__getitem__(self, id) -> object

    Return a Prosite entry.  id is either the id or accession
    for the entry.  Raises a KeyError if there's an error, i.e. when
    fetching the entry raises IOError or when no record can be
    extracted from the returned page.

    """
    from Bio.WWW import ExPASy

    # Respect the configured delay between queries.
    self.limiter.wait()

    try:
        page = ExPASy.get_prosite_entry(id)
    except IOError:
        raise KeyError(id)
    try:
        record_handle = File.StringHandle(_extract_record(page))
    except ValueError:
        raise KeyError(id)

    if self.parser is None:
        return record_handle.read()
    return self.parser.parse(record_handle)
377
379 """Parses Prosite data into a Record object.
380
381 """
385
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle through this parser's scanner, using this parser's
    consumer to receive the events, and return the consumer's data
    attribute afterwards.

    """
    self._scanner.feed(handle, self._consumer)
    return self._consumer.data
389
391 """Scans Prosite-formatted data.
392
393 Tested with:
394 Release 15.0, July 1998
395
396 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Feed in Prosite data for scanning.  handle is a file-like
    object that contains prosite data.  consumer is a
    Consumer object that will receive events as the report is scanned.

    Scanning stops at end of input or as soon as consumer.finished
    becomes true.  Raises a ValueError on a non-blank line that starts
    neither a record ('ID') nor a copyright block ('CC').

    """
    # Wrap the handle only if the caller has not done so already.
    if not isinstance(handle, File.UndoHandle):
        handle = File.UndoHandle(handle)
    uhandle = handle

    consumer.finished = False
    while not consumer.finished:
        line = uhandle.peekline()
        if not line:
            break
        if is_blank_line(line):
            # Consume and ignore blank lines between entries.
            uhandle.readline()
            continue
        tag = line[:2]
        if tag == 'ID':
            self._scan_record(uhandle, consumer)
        elif tag == 'CC':
            self._scan_copyrights(uhandle, consumer)
        else:
            raise ValueError("There doesn't appear to be a record")
425
427 consumer.start_copyrights()
428 self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
429 self._scan_terminator(uhandle, consumer)
430 consumer.end_copyrights()
431
444
445 - def _scan_line(self, line_type, uhandle, event_fn,
446 exactly_one=None, one_or_more=None, any_number=None,
447 up_to_one=None):
465
468
471
474
477
480
483
484
485
486
487
488
489
490
491
492
493
494
495
496
500
503
507
510
514
518
522
525
528
529
530
531
# Scanner methods, listed in the order in which they are tried.
_scan_fns = [
    _scan_id,
    _scan_ac,
    _scan_dt,
    _scan_de,
    _scan_pa,
    _scan_ma,
    _scan_pp,
    _scan_ru,
    _scan_nr,
    _scan_cc,

    # NOTE(review): _scan_ma, _scan_nr and _scan_cc are deliberately
    # listed a second time here -- presumably so that records whose
    # MA/NR/CC lines appear out of the usual order can still be
    # scanned.  Confirm before removing the repetition.
    _scan_ma,
    _scan_nr,
    _scan_cc,

    _scan_dr,
    _scan_3d,
    _scan_pr,
    _scan_do,
    _scan_terminator
    ]
558
560 """Consumer that converts a Prosite record to a Record object.
561
562 Members:
563 data Record with Prosite data.
564
565 """
568
571
574
def identification(self, line):
    """identification(self, line)

    Store the record name and type from an identification line.  The
    line must consist of exactly three whitespace-separated fields;
    the second and third are stored (with trailing punctuation
    chomped) as data.name and data.type.  Raises a ValueError
    otherwise.

    """
    fields = line.split()
    if len(fields) != 3:
        raise ValueError("I don't understand identification line\n%s" %
                         line)
    name, entry_type = fields[1:]
    self.data.name = self._chomp(name)
    self.data.type = self._chomp(entry_type)
582
def accession(self, line):
    """accession(self, line)

    Store the accession from an accession line.  The line must consist
    of exactly two whitespace-separated fields; the second is stored
    (with trailing punctuation chomped) as data.accession.  Raises a
    ValueError otherwise.

    """
    fields = line.split()
    if len(fields) != 2:
        raise ValueError("I don't understand accession line\n%s" % line)
    self.data.accession = self._chomp(fields[-1])
588
def date(self, line):
    """date(self, line)

    Store the three dates found on a date line as data.created,
    data.data_update and data.info_update.  The line is upper-cased
    before it is examined, so the stored dates are upper case.

    Raises a ValueError if the line does not have the expected
    '... (CREATED); ... (DATA UPDATE); ... (INFO UPDATE).' layout,
    including when it has too few fields.

    """
    cols = line.upper().split()

    # Check the length first: a truncated line used to escape as an
    # IndexError from the field tests below instead of a ValueError.
    if len(cols) < 9 or \
       cols[2] != '(CREATED);' or \
       cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
       cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
        raise ValueError("I don't understand date line\n%s" % line)

    self.data.created = cols[1]
    self.data.data_update = cols[3]
    self.data.info_update = cols[6]
602
605
608
611
612 - def postprocessing(self, line):
615
616 - def rule(self, line):
618
def numerical_results(self, line):
    """numerical_results(self, line)

    Parse the ';'-separated '/QUALIFIER=value' items of a numerical
    results line and store them on the record:

    /RELEASE      'release,seqs' -> data.nr_sp_release (str) and
                  data.nr_sp_seqs (int)
    /FALSE_NEG    data.nr_false_neg (int)
    /PARTIAL      data.nr_partial (int)
    /TOTAL, /POSITIVE, /UNKNOWN, /FALSE_POS
                  'hits(seqs)' -> tuple of two ints stored in
                  data.nr_total, data.nr_positive, data.nr_unknown and
                  data.nr_false_pos respectively

    Raises a ValueError for an unknown qualifier or a value that
    cannot be understood.

    """
    # Qualifiers whose value has the form hits(seqs), and the record
    # attribute each one is stored in.
    hit_fields = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    for col in self._clean(line).split(";"):
        if not col:
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/RELEASE':
            release, seqs = data.split(",")
            self.data.nr_sp_release = release
            self.data.nr_sp_seqs = int(seqs)
        elif qual == '/FALSE_NEG':
            self.data.nr_false_neg = int(data)
        elif qual == '/PARTIAL':
            self.data.nr_partial = int(data)
        elif qual in hit_fields:
            m = re.match(r'(\d+)\((\d+)\)', data)
            if not m:
                # This used to raise the undefined name 'error'.
                raise ValueError(
                    "Broken data %s in numerical results line\n%s" %
                    (repr(data), line))
            setattr(self.data, hit_fields[qual],
                    tuple(map(int, m.groups())))
        else:
            raise ValueError(
                "Unknown qual %s in numerical results line\n%s" %
                (repr(qual), line))
650
692
711
716
721
724
727
def _chomp(self, word, to_chomp='.,;'):
    """_chomp(self, word, to_chomp='.,;') -> str

    Return word without its last character if that character is one
    of those in to_chomp; otherwise return word unchanged.  An empty
    word is returned as is.

    """
    # Guard against the empty string: word[-1] would raise IndexError.
    if word and word[-1] in to_chomp:
        return word[:-1]
    return word
733
def _clean(self, line, rstrip=1):
    """_clean(self, line, rstrip=1) -> str

    Return line without its first five characters.  Trailing
    whitespace is stripped as well unless rstrip is false.

    """
    payload = line[5:]
    if not rstrip:
        return payload
    return payload.rstrip()
739
def scan_sequence_expasy(seq=None, id=None, exclude_frequent=None):
    """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
    list of PatternHit's

    Search a sequence for occurrences of Prosite patterns.  You can
    specify either a sequence in seq or a SwissProt/trEMBL ID or accession
    in id.  Only one of those should be given.  If exclude_frequent
    is true, then the patterns with the high probability of occurring
    will be excluded.

    Raises a ValueError when both or neither of seq and id are given.

    """
    from Bio import ExPASy
    # Exactly one of the two must be supplied.
    if bool(seq) == bool(id):
        raise ValueError("Please specify either a sequence or an id")
    results = ExPASy.scanprosite1(seq, id, exclude_frequent)
    return _extract_pattern_hits(results)
756
def _extract_pattern_hits(handle):
    """_extract_pattern_hits(handle) -> list of PatternHit's

    Extract hits from a web page.  Raises a ValueError if there
    was an error in the query.

    """
    class parser(sgmllib.SGMLParser):
        # Collects one PatternHit per <hr> seen inside a <pre> section.
        # _last_found records which field of the current hit was filled
        # most recently, so the next piece of text goes to the next field.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.hits = []
            self.broken_message = 'Some error occurred'
            self._in_pre = 0
            self._current_hit = None
            self._last_found = None

        def handle_data(self, data):
            # Error reports can show up anywhere in the page.
            if data.find('try again') >= 0:
                self.broken_message = data
                return
            if data == 'illegal':
                self.broken_message = 'Sequence contains illegal characters'
                return
            # Only non-blank text inside <pre> carries hit information.
            if not self._in_pre or not data.strip():
                return

            previous = self._last_found
            if previous is None and data[:4] == 'PDOC':
                self._current_hit.pdoc = data
                self._last_found = 'pdoc'
            elif previous == 'pdoc':
                if data[:2] != 'PS':
                    raise ValueError("Expected accession but got:\n%s" % data)
                self._current_hit.accession = data
                self._last_found = 'accession'
            elif previous == 'accession':
                self._current_hit.name = data
                self._last_found = 'name'
            elif previous == 'name':
                self._current_hit.description = data
                self._last_found = 'description'
            elif previous == 'description':
                for start, end, seq in re.findall(r'(\d+)-(\d+) (\w+)', data):
                    self._current_hit.matches.append(
                        (int(start), int(end), seq))

        def do_hr(self, attrs):
            # A rule inside <pre> starts a new hit.
            if self._in_pre:
                self._current_hit = PatternHit()
                self.hits.append(self._current_hit)
                self._last_found = None

        def start_pre(self, attrs):
            self._in_pre = 1
            # Seeing a <pre> section clears the default error message.
            self.broken_message = None

        def end_pre(self):
            self._in_pre = 0

    page_parser = parser()
    page_parser.feed(handle.read())
    if page_parser.broken_message:
        raise ValueError(page_parser.broken_message)
    return page_parser.hits
819
820
821
822
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    Raises a ValueError if filename does not exist, and a KeyError if
    a record yields an empty key or a key that was already indexed.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    # Make sure the data file is closed again, even when a record
    # cannot be parsed or keyed; it used to be left open.
    try:
        end = long(0)
        for record in parse(handle):
            # Each record spans from where the previous one ended to the
            # current position of the handle.
            start = end
            end = long(handle.tell())
            length = end - start

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.name

            if not key:
                raise KeyError("empty key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        handle.close()
859
860
861
def _extract_record(handle):
    """_extract_record(handle) -> str

    Extract PROSITE data from a web page.  Raises a ValueError if no
    data was found in the web page.

    """
    class parser(sgmllib.SGMLParser):
        # Gathers the text found inside <pre> sections, turning each
        # <br> met there into a newline.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self._in_pre = 0
            self.data = []

        def handle_data(self, data):
            if not self._in_pre:
                return
            self.data.append(data)

        def do_br(self, attrs):
            if not self._in_pre:
                return
            self.data.append('\n')

        def start_pre(self, attrs):
            self._in_pre = 1

        def end_pre(self):
            self._in_pre = 0

    page_parser = parser()
    page_parser.feed(handle.read())
    pieces = page_parser.data
    if not pieces:
        raise ValueError("No data found in web page.")
    return "".join(pieces)
892