1
2
3
4
5
6 """
7 This module provides code to work with the prosite.doc file from
8 Prosite.
9 http://www.expasy.ch/prosite/
10
11 Tested with:
12 Release 15.0, July 1998
13 Release 16.0, July 1999
14
15
16 Classes:
17 Record Holds Prodoc data.
18 Reference Holds data from a Prodoc reference.
19 Iterator Iterates over entries in a Prodoc file.
20 Dictionary Accesses a Prodoc file using a dictionary interface.
21 ExPASyDictionary Accesses Prodoc records from ExPASy.
22 RecordParser Parses a Prodoc record into a Record object.
23
24 _Scanner Scans Prodoc-formatted data.
25 _RecordConsumer Consumes Prodoc data to a Record object.
26
27
28 Functions:
29 index_file Index a Prodoc file for a Dictionary.
30 _extract_record Extract Prodoc data from a web page.
31
32 """
33 from types import *
34 import string
35 import re
36 import sgmllib
37 import time
38 from Bio import File
39 from Bio import Index
40 from Bio.ParserSupport import *
41 from Bio.WWW import ExPASy
42
class Record:
    """Holds information from a Prodoc record.

    Members:
    accession      Accession number of the record.
    prosite_refs   List of tuples (prosite accession, prosite name).
    text           Free format text.
    references     List of reference objects.

    """
    def __init__(self):
        """Create an empty record; every member starts out blank."""
        self.accession = ''
        # Fresh lists per instance, so records never share state.
        self.prosite_refs = []
        self.text = ''
        self.references = []
58
class Reference:
    """Holds information from a Prodoc citation.

    Members:
    number      Number of the reference. (string)
    authors     Names of the authors.
    citation    Describes the citation.

    """
    def __init__(self):
        """Create an empty reference; all three members are empty strings."""
        self.number = ''
        self.authors = ''
        self.citation = ''
72
74 """Returns one record at a time from a Prodoc file.
75
76 Methods:
77 next Return the next record from the stream, or None.
78
79 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Create a new iterator.  handle is a file-like object.  parser
    is an optional Parser object to change the results into another form.
    If set to None, then the raw contents of the file will be returned.

    Raises a ValueError if handle is neither a file object nor a
    class instance.

    """
    # FileType / InstanceType come from "from types import *"; anything
    # that is not a real file or an (old-style) instance is rejected.
    if type(handle) is not FileType and type(handle) is not InstanceType:
        raise ValueError("I expected a file handle or file-like object")
    # Records are read through an UndoHandle wrapped around the caller's
    # handle; the parser (possibly None) is kept for later use.
    self._uhandle = File.UndoHandle(handle)
    self._parser = parser
92
def next(self):
    """next(self) -> object

    Return the next Prodoc record from the file.  If no more records,
    return None.

    The record is returned as one raw string when no parser was given,
    otherwise as whatever the parser's parse method returns.

    """
    lines = []
    while 1:
        line = self._uhandle.readline()
        if not line:
            # End of input; whatever was collected so far is the record.
            break
        lines.append(line)
        # A line beginning with {END} closes the current record.
        if line[:5] == '{END}':
            break

    if not lines:
        return None

    data = ''.join(lines)
    if self._parser is not None:
        return self._parser.parse(File.StringHandle(data))
    return data
116
118 return iter(self.next, None)
119
121 """Accesses a Prodoc file using a dictionary interface.
122
123 """
124 __filename_key = '__filename'
125
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Prodoc Dictionary backed by an existing index.

    indexname is the name of the index, which should have been built
    with the index_file function.  parser is an optional Parser object
    used to convert entries into another form; when it is None the raw
    contents of the file are returned.

    """
    prodoc_index = Index.Index(indexname)
    self._index = prodoc_index
    # The index stores the name of the indexed data file under the
    # class's private filename key (spelled here in its mangled form,
    # the same attribute that Dictionary.__filename_key names inside
    # the class body).
    data_filename = prodoc_index[Dictionary._Dictionary__filename_key]
    self._handle = open(data_filename)
    self._parser = parser
139
141 return len(self._index)
142
150
152 return getattr(self._index, name)
153
155 """Access PRODOC at ExPASy using a read-only dictionary interface.
156
157 """
def __init__(self, delay=5.0, parser=None):
    """__init__(self, delay=5.0, parser=None)

    Set up a dictionary that looks PRODOC entries up at ExPASy.

    delay is the number of seconds to wait between two queries.
    parser is an optional parser object (e.g. Prodoc.RecordParser) that
    turns each entry into another form; with None the raw contents
    are returned.

    """
    # No query has been made yet, so there is no timestamp to wait on.
    self.last_query_time = None
    self.parser = parser
    self.delay = delay
171
173 raise NotImplementedError, "Prodoc contains lots of entries"
175 raise NotImplementedError, "This is a read-only dictionary"
177 raise NotImplementedError, "This is a read-only dictionary"
179 raise NotImplementedError, "This is a read-only dictionary"
181 raise NotImplementedError, "You don't need to do this..."
183 raise NotImplementedError, "You don't really want to do this..."
185 raise NotImplementedError, "You don't really want to do this..."
187 raise NotImplementedError, "You don't really want to do this..."
188
def has_key(self, id):
    """has_key(self, id) -> bool

    Return 1 if looking up id on this object succeeds, 0 if the
    lookup raises a KeyError.

    """
    try:
        self[id]
    except KeyError:
        return 0
    return 1
196
def get(self, id, failobj=None):
    """get(self, id, failobj=None) -> object

    Return the entry for id, or failobj if the lookup raises a
    KeyError.

    """
    # Both paths return, so nothing can follow the try block; the old
    # trailing string-exception raise was unreachable and is removed.
    try:
        return self[id]
    except KeyError:
        return failobj
203
def __getitem__(self, id):
    """__getitem__(self, id) -> object

    Return a Prodoc entry.  id is either the id or accession
    for the entry.  Raises a KeyError if there's an error.

    With a parser set, the parser's result is returned; otherwise the
    extracted record text is returned.

    """
    # Throttle: if a query was made before, sleep for whatever is left
    # of self.delay seconds since that query.
    if self.last_query_time is not None:
        delay = self.last_query_time + self.delay - time.time()
        if delay > 0.0:
            time.sleep(delay)
    self.last_query_time = time.time()

    # A failed fetch (IOError) and a page without record data
    # (ValueError from _extract_record) are both reported as KeyError.
    try:
        handle = ExPASy.get_prodoc_entry(id)
    except IOError:
        raise KeyError(id)
    try:
        handle = File.StringHandle(_extract_record(handle))
    except ValueError:
        raise KeyError(id)

    if self.parser is not None:
        return self.parser.parse(handle)
    return handle.read()
231
233 """Parses Prodoc data into a Record object.
234
235 """
239
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle through this parser's scanner, with its consumer
    receiving the events, and return the consumer's data attribute.

    """
    scanner = self._scanner
    scanner.feed(handle, self._consumer)
    result = self._consumer.data
    return result
243
245 """Scans Prodoc-formatted data.
246
247 Tested with:
248 Release 15.0, July 1998
249
250 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Scan Prodoc data.  handle is a file-like object that contains
    prosite data; consumer is a Consumer object that receives events
    as the data is scanned.

    """
    # Reuse the handle when it already is an UndoHandle, otherwise wrap it.
    uhandle = handle
    if not isinstance(uhandle, File.UndoHandle):
        uhandle = File.UndoHandle(handle)

    upcoming = uhandle.peekline()
    while upcoming:
        if is_blank_line(upcoming):
            # Blank line outside a record: consume it and move on.
            uhandle.readline()
        else:
            self._scan_record(uhandle, consumer)
        upcoming = uhandle.peekline()
274
287
290
295
def _scan_text(self, uhandle, consumer):
    """_scan_text(self, uhandle, consumer)

    Pass lines from uhandle to consumer.text until a line shaped like
    '[xx] ' (an opening bracket, two characters, a closing bracket and
    a space) or a line starting with '{END}' is read.  That line is
    pushed back onto uhandle with saveline and scanning stops.

    """
    while 1:
        line = safe_readline(uhandle)
        # Slices rather than single indexes, so a short text line that
        # merely starts with '[' is treated as text, not an IndexError.
        if (line[:1] == '[' and line[3:5] == '] ') or \
           line[:5] == '{END}':
            uhandle.saveline(line)
            break
        consumer.text(line)
304
312
320
322 """Consumer that converts a Prodoc record to a Record object.
323
324 Members:
325 data Record with Prodoc data.
326
327 """
330
333
336
338 line = string.rstrip(line)
339 if line[0] != '{' or line[-1] != '}':
340 raise SyntaxError, "I don't understand accession line\n%s" % line
341 acc = line[1:-1]
342 if acc[:4] != 'PDOC':
343 raise SyntaxError, "Invalid accession in line\n%s" % line
344 self.data.accession = acc
345
347 line = string.rstrip(line)
348 if line[0] != '{' or line[-1] != '}':
349 raise SyntaxError, "I don't understand accession line\n%s" % line
350 acc, name = string.split(line[1:-1], '; ')
351 self.data.prosite_refs.append((acc, name))
352
def text(self, line):
    """text(self, line)

    Append line to the text of the record held in self.data.

    """
    record = self.data
    record.text += line
355
357 if line[0] == '[' and line[3] == ']':
358 self._ref = Reference()
359 self._ref.number = string.strip(line[1:3])
360 if line[1] == 'E':
361
362
363 self._ref.citation = string.strip(line[4:])
364 else:
365 self._ref.authors = string.strip(line[4:])
366 self.data.references.append(self._ref)
367 elif line[:4] == ' ':
368 if not self._ref:
369 raise SyntaxError, "Unnumbered reference lines\n%s" % line
370 self._ref.citation = self._ref.citation + line[5:]
371 else:
372 raise "I don't understand the reference line\n%s" % line
373
375
376 for ref in self.data.references:
377 ref.citation = string.rstrip(ref.citation)
378 ref.authors = string.rstrip(ref.authors)
379
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prodoc file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's accession will be used.

    Raises a ValueError if filename does not exist, and a KeyError if
    a record yields an empty key or a key that is already indexed.

    """
    # Imported here because os is not among this module's visible imports.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Remember which data file this index describes.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # Positions are taken before and after reading one record,
            # so that (start, length) delimits it in the data file.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                key = rec.accession

            if not key:
                raise KeyError("empty key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Close the data file whether indexing succeeded or failed.
        handle.close()
415
def _extract_record(handle):
    """_extract_record(handle) -> str

    Extract PRODOC data from a web page.  Raises a ValueError if no
    data was found in the web page.

    """
    # Collect the text found inside <pre>...</pre>; a <br> seen there
    # is recorded as a newline.  Everything outside <pre> is ignored.
    class parser(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self._in_pre = 0
            self.data = []
        def handle_data(self, data):
            if self._in_pre:
                self.data.append(data)
        def do_br(self, attrs):
            if self._in_pre:
                self.data.append('\n')
        def start_pre(self, attrs):
            self._in_pre = 1
        def end_pre(self):
            self._in_pre = 0
    p = parser()
    p.feed(handle.read())
    # Join the collected pieces and drop leading whitespace.
    data = ''.join(p.data).lstrip()
    if not data:
        raise ValueError("No data found in web page.")
    return data
447