1
2
3
4
5
6
7
8
9
10 """
11 This module provides code to work with the prosite dat file from
12 Prosite.
13 http://www.expasy.ch/prosite/
14
15 Tested with:
16 Release 15.0, July 1998
17 Release 16.0, July 1999
18 Release 17.0, Dec 2001
19 Release 19.0, Mar 2006
20
21
22 Classes:
23 Record Holds Prosite data.
24 PatternHit Holds data from a hit against a Prosite pattern.
25 Iterator Iterates over entries in a Prosite file.
26 Dictionary Accesses a Prosite file using a dictionary interface.
27 ExPASyDictionary Accesses Prosite records from ExPASy.
28 RecordParser Parses a Prosite record into a Record object.
29
30 _Scanner Scans Prosite-formatted data.
31 _RecordConsumer Consumes Prosite data to a Record object.
32
33
34 Functions:
35 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
36 index_file Index a Prosite file for a Dictionary.
37 _extract_record Extract Prosite data from a web page.
38 _extract_pattern_hits Extract Prosite patterns from a web page.
39
40 """
41 from types import *
42 import string
43 import re
44 import sgmllib
45 from Bio import File
46 from Bio import Index
47 from Bio.ParserSupport import *
48 from Bio.WWW import ExPASy
49 from Bio.WWW import RequestLimiter
50
class Record:
    """Holds information from a Prosite record.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions.  (strings)

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type
    cc_scaling_db
    cc_author
    cc_ft_key
    cc_ft_desc
    cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    """
    def __init__(self):
        """__init__(self)

        Create an empty record.  String members start as '', list
        members as fresh empty lists, and the numerical results as
        None / (None, None) until a value is assigned.

        """
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These members are documented above; give them defaults so that
        # reading them on a record that lacks the corresponding comment
        # line does not raise AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
137
139 """Holds information from a hit against a Prosite pattern.
140
141 Members:
142 name ID of the record. e.g. ADH_ZINC
143 accession e.g. PS00387
144 pdoc ID of the PROSITE DOCumentation.
145 description Free-format description.
146 matches List of tuples (start, end, sequence) where
147 start and end are indexes of the match, and sequence is
148 the sequence matched.
149
150 """
158 lines = []
159 lines.append("%s %s %s" % (self.accession, self.pdoc, self.name))
160 lines.append(self.description)
161 lines.append('')
162 if len(self.matches) > 1:
163 lines.append("Number of matches: %s" % len(self.matches))
164 for i in range(len(self.matches)):
165 start, end, seq = self.matches[i]
166 range_str = "%d-%d" % (start, end)
167 if len(self.matches) > 1:
168 lines.append("%7d %10s %s" % (i+1, range_str, seq))
169 else:
170 lines.append("%7s %10s %s" % (' ', range_str, seq))
171 return string.join(lines, '\n')
172
class Iterator:
    """Returns one record at a time from a Prosite file.

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Raises a ValueError if handle does not look like a file.

        """
        # Duck-type the handle instead of comparing against FileType /
        # InstanceType, so new-style file-like objects are accepted too.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next Prosite record from the file.  If no more records,
        return None.

        Raises a SyntaxError if a leading block of CC lines is interrupted
        by a line that is neither 'CC' nor the '//' terminator.

        """
        # A leading block of CC lines is consumed, up to and including
        # its '//' terminator, before the record itself is read.
        line = self._uhandle.peekline()
        if line[:2] == 'CC':
            while 1:
                line = self._uhandle.readline()
                if not line:
                    break
                if line[:2] == '//':
                    break
                if line[:2] != 'CC':
                    raise SyntaxError("Oops, where's the copyright?")

        # Gather lines up to and including the '//' that ends the record.
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            lines.append(line)
            if line[:2] == '//':
                break

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        """__iter__(self) -> iterator

        Iterate by calling next() until it returns None.

        """
        return iter(self.next, None)
232
234 """Accesses a Prosite file using a dictionary interface.
235
236 """
237 __filename_key = '__filename'
238
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Prosite Dictionary.  indexname is the name of the index
    for the dictionary; the index should have been created with the
    index_file function.  parser is an optional Parser object used to
    change the results into another form.  If it is None, the raw
    contents of the file are returned.

    """
    index = Index.Index(indexname)
    # The data file path is stored in the index under the class's
    # private __filename_key (spelled here in its name-mangled form).
    data_path = index[Dictionary._Dictionary__filename_key]
    self._index = index
    self._handle = open(data_path)
    self._parser = parser
252
254 return len(self._index)
255
263
265 return getattr(self._index, name)
266
268 """Access PROSITE at ExPASy using a read-only dictionary interface.
269
270 """
def __init__(self, delay=5.0, parser=None):
    """__init__(self, delay=5.0, parser=None)

    Create a new Dictionary to access PROSITE.  delay is the number
    of seconds to wait between each query.  parser is an optional
    parser (e.g. Prosite.RecordParser) object used to change the
    results into another form; if it is None, the raw contents of the
    file are returned.

    """
    limiter = RequestLimiter(delay)
    self.parser = parser
    self.limiter = limiter
283
285 raise NotImplementedError, "Prosite contains lots of entries"
287 raise NotImplementedError, "This is a read-only dictionary"
289 raise NotImplementedError, "This is a read-only dictionary"
291 raise NotImplementedError, "This is a read-only dictionary"
293 raise NotImplementedError, "You don't need to do this..."
295 raise NotImplementedError, "You don't really want to do this..."
297 raise NotImplementedError, "You don't really want to do this..."
299 raise NotImplementedError, "You don't really want to do this..."
300
def has_key(self, id):
    """has_key(self, id) -> bool

    Return True if self[id] can be retrieved, or False if the lookup
    raises a KeyError.  The entry itself is discarded.

    """
    try:
        self[id]
    except KeyError:
        return False
    return True
308
def get(self, id, failobj=None):
    """get(self, id, failobj=None) -> object

    Return self[id], or failobj if the lookup raises a KeyError.

    """
    try:
        return self[id]
    except KeyError:
        return failobj
315
def __getitem__(self, id):
    """__getitem__(self, id) -> object

    Return a Prosite entry.  id is either the id or accession
    for the entry.  Raises a KeyError if there's an error.

    """
    # Let the request limiter enforce its delay before querying.
    self.limiter.wait()

    # Both a failed fetch (IOError) and a page with no extractable
    # record (ValueError) are reported to the caller as KeyError.
    try:
        handle = ExPASy.get_prosite_entry(id)
    except IOError:
        raise KeyError(id)
    try:
        handle = File.StringHandle(_extract_record(handle))
    except ValueError:
        raise KeyError(id)

    if self.parser is not None:
        return self.parser.parse(handle)
    return handle.read()
339
341 """Parses Prosite data into a Record object.
342
343 """
347
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle through the scanner into the consumer and return the
    consumer's data attribute.

    """
    scanner, consumer = self._scanner, self._consumer
    scanner.feed(handle, consumer)
    return consumer.data
351
353 """Scans Prosite-formatted data.
354
355 Tested with:
356 Release 15.0, July 1998
357
358 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Feed in Prosite data for scanning.  handle is a file-like
    object that contains prosite data.  consumer is a
    Consumer object that will receive events as the report is scanned.

    Raises a SyntaxError on a non-blank line that starts neither a
    record ('ID') nor a copyright block ('CC').

    """
    # Wrap the handle unless it already supports peeking.
    if not isinstance(handle, File.UndoHandle):
        handle = File.UndoHandle(handle)
    uhandle = handle

    while True:
        line = uhandle.peekline()
        if not line:
            break
        if is_blank_line(line):
            # Consume the blank line and look at the next one.
            uhandle.readline()
            continue
        tag = line[:2]
        if tag == 'ID':
            self._scan_record(uhandle, consumer)
        elif tag == 'CC':
            self._scan_copyrights(uhandle, consumer)
        else:
            raise SyntaxError("There doesn't appear to be a record")
386
388 consumer.start_copyrights()
389 self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
390 self._scan_terminator(uhandle, consumer)
391 consumer.end_copyrights()
392
405
406 - def _scan_line(self, line_type, uhandle, event_fn,
407 exactly_one=None, one_or_more=None, any_number=None,
408 up_to_one=None):
426
429
432
435
438
441
444
445
446
447
448
449
450
451
452
453
454
455
456
457
460
464
467
471
475
478
481
482 _scan_fns = [
483 _scan_id,
484 _scan_ac,
485 _scan_dt,
486 _scan_de,
487 _scan_pa,
488 _scan_ma,
489 _scan_ru,
490 _scan_nr,
491 _scan_cc,
492
493
494
495
496
497 _scan_ma,
498 _scan_nr,
499 _scan_cc,
500
501 _scan_dr,
502 _scan_3d,
503 _scan_do,
504 _scan_terminator
505 ]
506
508 """Consumer that converts a Prosite record to a Record object.
509
510 Members:
511 data Record with Prosite data.
512
513 """
516
519
522
524 cols = string.split(line)
525 if len(cols) != 3:
526 raise SyntaxError, "I don't understand identification line\n%s" % \
527 line
528 self.data.name = self._chomp(cols[1])
529 self.data.type = self._chomp(cols[2])
530
532 cols = string.split(line)
533 if len(cols) != 2:
534 raise SyntaxError, "I don't understand accession line\n%s" % line
535 self.data.accession = self._chomp(cols[1])
536
def date(self, line):
    """date(self, line)

    Parse a date line and store its three dates on self.data as
    created, data_update and info_update.  The line is upper-cased
    before it is split, so the stored values are upper case.

    Raises a SyntaxError if the line does not have the expected layout.

    """
    cols = line.upper().split()

    # Check the column count first so a truncated line is reported as a
    # SyntaxError rather than an IndexError.  Only the first four
    # characters of column 7 are compared, so both '(INFO' and '(INF'
    # are accepted.
    if len(cols) < 9 or \
       cols[2] != '(CREATED);' or \
       cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
       cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
        raise SyntaxError("I don't understand date line\n%s" % line)

    self.data.created = cols[1]
    self.data.data_update = cols[3]
    self.data.info_update = cols[6]
550
553
556
559
560 - def rule(self, line):
562
564 cols = string.split(self._clean(line), ';')
565 for col in cols:
566 if not col:
567 continue
568 qual, data = map(string.lstrip, string.split(col, '='))
569 if qual == '/RELEASE':
570 release, seqs = string.split(data, ',')
571 self.data.nr_sp_release = release
572 self.data.nr_sp_seqs = int(seqs)
573 elif qual == '/FALSE_NEG':
574 self.data.nr_false_neg = int(data)
575 elif qual == '/PARTIAL':
576 self.data.nr_partial = int(data)
577 elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
578 m = re.match(r'(\d+)\((\d+)\)', data)
579 if not m:
580 raise error, "Broken data %s in comment line\n%s" % \
581 (repr(data), line)
582 hits = tuple(map(int, m.groups()))
583 if(qual == "/TOTAL"):
584 self.data.nr_total = hits
585 elif(qual == "/POSITIVE"):
586 self.data.nr_positive = hits
587 elif(qual == "/UNKNOWN"):
588 self.data.nr_unknown = hits
589 elif(qual == "/FALSE_POS"):
590 self.data.nr_false_pos = hits
591 else:
592 raise SyntaxError, "Unknown qual %s in comment line\n%s" % \
593 (repr(qual), line)
594
628
630 refs = string.split(self._clean(line), ';')
631 for ref in refs:
632 if not ref:
633 continue
634 acc, name, type = map(string.strip, string.split(ref, ','))
635 if type == 'T':
636 self.data.dr_positive.append((acc, name))
637 elif type == 'F':
638 self.data.dr_false_pos.append((acc, name))
639 elif type == 'N':
640 self.data.dr_false_neg.append((acc, name))
641 elif type == 'P':
642 self.data.dr_potential.append((acc, name))
643 elif type == '?':
644 self.data.dr_unknown.append((acc, name))
645 else:
646 raise SyntaxError, "I don't understand type flag %s" % type
647
652
655
658
659 - def _chomp(self, word, to_chomp='.,;'):
660
661 if word[-1] in to_chomp:
662 return word[:-1]
663 return word
664
665 - def _clean(self, line, rstrip=1):
666
667 if rstrip:
668 return string.rstrip(line[5:])
669 return line[5:]
670
def scan_sequence_expasy(seq=None, id=None, exclude_frequent=None):
    """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
    list of PatternHit's

    Search a sequence for occurrences of Prosite patterns.  You can
    specify either a sequence in seq or a SwissProt/trEMBL ID or accession
    in id.  Only one of those should be given.  If exclude_frequent
    is true, then the patterns with the high probability of occurring
    will be excluded.

    Raises a ValueError if both or neither of seq and id are given.

    """
    # Exactly one of seq / id must be supplied.
    if (seq and id) or not (seq or id):
        raise ValueError("Please specify either a sequence or an id")
    handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
    return _extract_pattern_hits(handle)
686
def _extract_pattern_hits(handle):
    """_extract_pattern_hits(handle) -> list of PatternHit's

    Extract hits from a web page.  Raises a ValueError if there
    was an error in the query.

    """
    class parser(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.hits = []
            # Assume failure until a <pre> section is seen.
            self.broken_message = 'Some error occurred'
            self._in_pre = 0
            self._current_hit = None
            # Name of the field stored most recently for the current hit.
            self._last_found = None

        def handle_data(self, data):
            # Error texts are recorded whether or not we are inside <pre>.
            if 'try again' in data:
                self.broken_message = data
                return
            elif data == 'illegal':
                self.broken_message = 'Sequence contains illegal characters'
                return
            if not self._in_pre:
                return
            elif not data.strip():
                return
            # Fields are expected in the order pdoc, accession, name,
            # description, followed by any number of match chunks.
            if self._last_found is None and data[:4] == 'PDOC':
                self._current_hit.pdoc = data
                self._last_found = 'pdoc'
            elif self._last_found == 'pdoc':
                if data[:2] != 'PS':
                    raise SyntaxError("Expected accession but got:\n%s" % data)
                self._current_hit.accession = data
                self._last_found = 'accession'
            elif self._last_found == 'accession':
                self._current_hit.name = data
                self._last_found = 'name'
            elif self._last_found == 'name':
                self._current_hit.description = data
                self._last_found = 'description'
            elif self._last_found == 'description':
                m = re.findall(r'(\d+)-(\d+) (\w+)', data)
                for start, end, seq in m:
                    self._current_hit.matches.append(
                        (int(start), int(end), seq))

        def do_hr(self, attrs):
            # An <hr> inside <pre> starts a new hit.
            if self._in_pre:
                self._current_hit = PatternHit()
                self.hits.append(self._current_hit)
                self._last_found = None

        def start_pre(self, attrs):
            self._in_pre = 1
            # Reaching a <pre> section clears the default error message.
            self.broken_message = None

        def end_pre(self):
            self._in_pre = 0

    p = parser()
    p.feed(handle.read())
    if p.broken_message:
        raise ValueError(p.broken_message)
    return p.hits
749
750
751
752
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    Raises a ValueError if filename does not exist, and a KeyError if
    a record produces an empty or duplicate key.

    """
    # Imported here because the module-level import block has no "import os".
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Store the data file's path under Dictionary's private key
    # (name-mangled form of Dictionary.__filename_key).
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # Record where each entry starts and how many bytes it spans.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                key = rec.name

            if not key:
                raise KeyError("empty key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Close the data file whether indexing succeeded or failed.
        handle.close()
788
def _extract_record(handle):
    """_extract_record(handle) -> str

    Extract PROSITE data from a web page.  Raises a ValueError if no
    data was found in the web page.

    """
    # Collect the text found inside <pre>...</pre>; a <br> inside that
    # section is turned into a newline.
    class parser(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self._in_pre = 0
            self.data = []

        def handle_data(self, data):
            if self._in_pre:
                self.data.append(data)

        def do_br(self, attrs):
            if self._in_pre:
                self.data.append('\n')

        def start_pre(self, attrs):
            self._in_pre = 1

        def end_pre(self):
            self._in_pre = 0

    p = parser()
    p.feed(handle.read())
    if not p.data:
        raise ValueError("No data found in web page.")
    return ''.join(p.data)
819