Package Bio :: Package Prosite :: Module Prodoc
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Prosite.Prodoc

  1  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with the prosite.doc file from 
  8  Prosite. 
  9  http://www.expasy.ch/prosite/ 
 10   
 11  Tested with: 
 12  Release 15.0, July 1998 
 13  Release 16.0, July 1999 
 14   
 15   
 16  Classes: 
 17  Record             Holds Prodoc data. 
 18  Reference          Holds data from a Prodoc reference. 
 19  Iterator           Iterates over entries in a Prodoc file. 
 20  Dictionary         Accesses a Prodoc file using a dictionary interface. 
 21  ExPASyDictionary   Accesses Prodoc records from ExPASy. 
 22  RecordParser       Parses a Prodoc record into a Record object. 
 23   
 24  _Scanner           Scans Prodoc-formatted data. 
 25  _RecordConsumer    Consumes Prodoc data to a Record object. 
 26   
 27   
 28  Functions: 
 29  index_file         Index a Prodoc file for a Dictionary. 
 30  _extract_record    Extract Prodoc data from a web page. 
 31   
 32  """ 
 33  from types import * 
 34  import string 
 35  import re 
 36  import sgmllib 
 37  import time 
 38  from Bio import File 
 39  from Bio import Index 
 40  from Bio.ParserSupport import * 
 41  from Bio.WWW import ExPASy 
 42   
class Record:
    """Container for the contents of one Prodoc record.

    Members:
    accession      Accession number of the record.
    prosite_refs   List of tuples (prosite accession, prosite name).
    text           Free format text.
    references     List of reference objects.

    """
    def __init__(self):
        # A fresh record is completely empty; the two lists are created
        # separately so they are never shared.
        self.accession = self.text = ''
        self.prosite_refs = []
        self.references = []
58
class Reference:
    """Container for one citation taken from a Prodoc record.

    Members:
    number     Number of the reference. (string)
    authors    Names of the authors.
    citation   Describes the citation.

    """
    def __init__(self):
        # All three fields are strings and start out empty.
        self.number = self.authors = self.citation = ''
72
class Iterator:
    """Returns one record at a time from a Prodoc file.

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another
        form.  If set to None, then the raw contents of the file will be
        returned.

        Raises a ValueError if handle does not provide a readline method.

        """
        # Duck-type the handle instead of comparing its exact type: a check
        # against the built-in file type / classic-instance type turns away
        # perfectly good file-like objects that happen to be new-style
        # classes or C extension types.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next Prodoc record from the file.  If no more records,
        return None.

        """
        lines = []
        while True:
            line = self._uhandle.readline()
            if not line:
                # End of the stream; whatever was collected is returned.
                break
            lines.append(line)
            if line[:5] == '{END}':
                # The terminator line belongs to the record it closes.
                break

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)
119
class Dictionary:
    """Accesses a Prodoc file using a dictionary interface.

    Records are looked up through an index that maps each key to a
    (start offset, length) pair inside the data file.  Attributes that
    are not defined here are forwarded to the underlying index object.

    """
    # Key under which the index stores the name of the data file.
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prodoc Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        """Return the number of entries held by the index."""
        return len(self._index)

    def __getitem__(self, key):
        """Return the record stored under key.

        The raw text is returned unless a parser was supplied, in which
        case the parser's result is returned instead.

        """
        start, length = self._index[key]
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __getattr__(self, name):
        """Forward unknown attribute lookups to the index object."""
        # __getattr__ only runs when normal lookup has failed.  If _index
        # itself is missing, touching self._index below would re-enter this
        # method without end, so fail cleanly instead.
        if name == '_index':
            raise AttributeError(name)
        return getattr(self._index, name)
153
class ExPASyDictionary:
    """Access PRODOC at ExPASy using a read-only dictionary interface.

    Only lookups are supported; the mutating and enumerating dictionary
    methods raise NotImplementedError.

    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PRODOC.  parser is an optional
        parser (e.g. Prodoc.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        self.delay = delay
        self.parser = parser
        # Time of the most recent query, or None before the first one.
        self.last_query_time = None

    def __len__(self):
        raise NotImplementedError("Prodoc contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """has_key(self, id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """get(self, id, failobj=None) -> object

        Return the entry for id, or failobj if the lookup raises a
        KeyError.

        """
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a Prodoc entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.

        """
        # First, check to see if enough time has passed since my
        # last query.
        if self.last_query_time is not None:
            delay = self.last_query_time + self.delay - time.time()
            if delay > 0.0:
                time.sleep(delay)
        self.last_query_time = time.time()

        # Both a failed fetch and a page without data are reported to the
        # caller as a missing key.
        try:
            handle = ExPASy.get_prodoc_entry(id)
        except IOError:
            raise KeyError(id)
        try:
            handle = File.StringHandle(_extract_record(handle))
        except ValueError:
            raise KeyError(id)

        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
231
class RecordParser(AbstractParser):
    """Turns Prodoc-formatted data into a Record object.

    """
    def __init__(self):
        # One scanner/consumer pair is created up front and reused by
        # every call to parse.
        self._consumer = _RecordConsumer()
        self._scanner = _Scanner()

    def parse(self, handle):
        """parse(self, handle) -> object

        Feed handle through the scanner and return whatever the consumer
        holds in its data attribute afterwards.

        """
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
243
class _Scanner:
    """Scans Prodoc-formatted data.

    Tested with:
    Release 15.0, July 1998

    """
    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prodoc data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        # The scanner needs to look ahead, so make sure the handle supports
        # peeking and pushing lines back.
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        while True:
            line = uhandle.peekline()
            if not line:
                break
            elif is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
                continue
            else:
                self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        """Scan one complete record, bracketed by start/end events."""
        consumer.start_record()

        self._scan_accession(uhandle, consumer)
        self._scan_prosite_refs(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{BEGIN}')
        self._scan_text(uhandle, consumer)
        self._scan_refs(uhandle, consumer)
        # NOTE(review): _scan_copyright is not defined in the visible part
        # of this listing (the embedded line numbers skip from 312 to 320)
        # -- confirm that the method exists in the real module.
        self._scan_copyright(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{END}')

        consumer.end_record()

    def _scan_accession(self, uhandle, consumer):
        """Pass the '{PDOC...' line to the consumer."""
        read_and_call(uhandle, consumer.accession, start='{PDOC')

    def _scan_prosite_refs(self, uhandle, consumer):
        """Pass every consecutive '{PS...' line to the consumer."""
        while attempt_read_and_call(uhandle, consumer.prosite_reference,
                                    start='{PS'):
            pass

    def _scan_text(self, uhandle, consumer):
        """Pass text lines on until a reference line or '{END}' shows up."""
        while True:
            line = safe_readline(uhandle)
            # A reference line looks like '[ 1] ...'.  Slices are used
            # rather than single indexes so that a short line starting with
            # '[' is treated as ordinary text instead of raising IndexError.
            starts_reference = line[:1] == '[' and line[3:5] == '] '
            if starts_reference or line[:5] == '{END}':
                # Not text any more; leave the line for the next stage.
                uhandle.saveline(line)
                break
            consumer.text(line)

    def _scan_refs(self, uhandle, consumer):
        """Pass reference lines on until a blank line or '{END}' shows up."""
        while True:
            line = safe_readline(uhandle)
            if line[:5] == '{END}' or is_blank_line(line):
                uhandle.saveline(line)
                break
            consumer.reference(line)
312
320
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prodoc record to a Record object.

    Members:
    data    Record with Prodoc data.

    """
    def __init__(self):
        self.data = None
        # Reference currently being assembled, or None if there is none.
        self._ref = None

    def start_record(self):
        """Begin a new, empty Record."""
        self.data = Record()
        # Forget any reference left over from an earlier record so that a
        # stray continuation line cannot be attached to it.
        self._ref = None

    def end_record(self):
        """Finish the current Record."""
        self._clean_data()

    def accession(self, line):
        """Store the accession from a line of the form '{PDOC...}'.

        Raises a SyntaxError if the line is not wrapped in braces or the
        accession does not start with 'PDOC'.

        """
        line = line.rstrip()
        # startswith/endswith also cope with an empty line, which direct
        # indexing would turn into an IndexError.
        if not (line.startswith('{') and line.endswith('}')):
            raise SyntaxError("I don't understand accession line\n%s" % line)
        acc = line[1:-1]
        if acc[:4] != 'PDOC':
            raise SyntaxError("Invalid accession in line\n%s" % line)
        self.data.accession = acc

    def prosite_reference(self, line):
        """Store an (accession, name) pair from a '{acc; name}' line.

        Raises a SyntaxError if the line is not wrapped in braces.

        """
        line = line.rstrip()
        if not (line.startswith('{') and line.endswith('}')):
            raise SyntaxError(
                "I don't understand prosite reference line\n%s" % line)
        acc, name = line[1:-1].split('; ')
        self.data.prosite_refs.append((acc, name))

    def text(self, line):
        """Append a line to the free format text of the record."""
        self.data.text = self.data.text + line

    def reference(self, line):
        """Handle one line of the reference section.

        A line with '[' in column 0 and ']' in column 3 starts a new
        reference; a line starting with four blanks continues the
        citation of the current one.  Anything else raises a SyntaxError.

        """
        if line[:1] == '[' and line[3:4] == ']':  # new reference
            self._ref = Reference()
            self._ref.number = line[1:3].strip()
            if line[1] == 'E':
                # If it's an electronic reference, then the URL is on the
                # line, instead of the author.
                self._ref.citation = line[4:].strip()
            else:
                self._ref.authors = line[4:].strip()
            self.data.references.append(self._ref)
        elif line[:4] == '    ':
            # The slice is four characters long, so it has to be compared
            # with a four-character literal to ever match.
            if self._ref is None:
                raise SyntaxError("Unnumbered reference lines\n%s" % line)
            self._ref.citation = self._ref.citation + line[5:]
        else:
            raise SyntaxError(
                "I don't understand the reference line\n%s" % line)

    def _clean_data(self):
        """Strip trailing whitespace from the collected references."""
        # get rid of trailing newlines
        for ref in self.data.references:
            ref.citation = ref.citation.rstrip()
            ref.authors = ref.authors.rstrip()
379
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prodoc file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the accession of the record will be used.

    Raises a ValueError if filename does not exist, and a KeyError if a
    record yields an empty key or a key that is already in the index.

    """
    # os is not among the imports visible at the top of this module, so
    # bring it into scope here.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while True:
            # Bracket the read with tell() so the index can store where
            # the record starts and how long it is.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                key = rec.accession

            if not key:
                raise KeyError("empty key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Release the data file whether or not indexing succeeded.
        handle.close()
415
416 -def _extract_record(handle):
417 """_extract_record(handle) -> str 418 419 Extract PRODOC data from a web page. Raises a ValueError if no 420 data was found in the web page. 421 422 """ 423 # All the data appears between tags: 424 # <pre width = 80>ID NIR_SIR; PATTERN. 425 # </PRE> 426 class parser(sgmllib.SGMLParser): 427 def __init__(self): 428 sgmllib.SGMLParser.__init__(self) 429 self._in_pre = 0 430 self.data = []
431 def handle_data(self, data): 432 if self._in_pre: 433 self.data.append(data) 434 def do_br(self, attrs): 435 if self._in_pre: 436 self.data.append('\n') 437 def start_pre(self, attrs): 438 self._in_pre = 1 439 def end_pre(self): 440 self._in_pre = 0 441 p = parser() 442 p.feed(handle.read()) 443 data = string.lstrip(string.join(p.data, '')) 444 if not data: 445 raise ValueError, "No data found in web page." 446 return data 447