Package Bio :: Package Prosite :: Module Prodoc
[hide private]
[frames | no frames]

Source Code for Module Bio.Prosite.Prodoc

  1  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with the prosite.doc file from 
  8  Prosite. 
  9  http://www.expasy.ch/prosite/ 
 10   
 11  Tested with: 
 12  Release 15.0, July 1998 
 13  Release 16.0, July 1999 
 14  Release 20.22, 13 November 2007 
 15   
 16   
 17  Functions: 
 18  parse              Iterates over entries in a Prodoc file. 
 19  index_file         Index a Prodoc file for a Dictionary. 
 20  _extract_record    Extract Prodoc data from a web page. 
 21   
 22   
 23  Classes: 
 24  Record             Holds Prodoc data. 
 25  Reference          Holds data from a Prodoc reference. 
 26  Dictionary         Accesses a Prodoc file using a dictionary interface. 
 27  RecordParser       Parses a Prodoc record into a Record object. 
 28   
 29  _Scanner           Scans Prodoc-formatted data. 
 30  _RecordConsumer    Consumes Prodoc data to a Record object. 
 31  Iterator           Iterates over entries in a Prodoc file; DEPRECATED. 
 32  """ 
 33   
 34  from types import * 
 35  import os 
 36  import sgmllib 
 37  from Bio import File 
 38  from Bio import Index 
 39  from Bio.ParserSupport import * 
 40   
def parse(handle):
    """Iterate over the records in a Prodoc file.

    handle is a file-like object that can be iterated line by line.
    Lines are collected until one starting with '{END}' is seen; the
    collected text is then run through a RecordParser and the parsed
    record is yielded.  Lines following the last '{END}' line are
    dropped without being parsed.

    """
    import cStringIO
    parser = RecordParser()
    lines = []
    for line in handle:
        lines.append(line)
        if line[:5] == '{END}':
            # Join once per record instead of growing a string line by
            # line, and keep the caller's handle name untouched.
            record_handle = cStringIO.StringIO("".join(lines))
            record = parser.parse(record_handle)
            lines = []
            yield record
52
def read(handle):
    """Parse the Prodoc data on handle and return the resulting record.

    handle is a file-like object; it is handed to a fresh RecordParser.
    Raises ValueError if the handle still has unread data once parsing
    has finished.

    """
    record = RecordParser().parse(handle)
    # We should have reached the end of the record by now.
    # NOTE(review): _Scanner.feed appears to keep scanning until the
    # handle runs out of lines, so this check may never fire - confirm.
    leftover = handle.read()
    if leftover:
        raise ValueError("More than one Prodoc record found")
    return record
61 62 63 # It may be a good idea to rewrite read(), parse() at some point to avoid 64 # using the old-style "parser = RecordParser(); parser.parse(handle)" approach. 65
class Record:
    """Container for the data of a single Prodoc record.

    Members:
    accession      Accession number of the record.
    prosite_refs   List of tuples (prosite accession, prosite name).
    text           Free format text.
    references     List of reference objects.

    """
    def __init__(self):
        # Every record starts out empty; each instance gets its own lists.
        self.accession = self.text = ''
        self.prosite_refs, self.references = [], []
81
class Reference:
    """Container for one citation of a Prodoc record.

    Members:
    number     Number of the reference. (string)
    authors    Names of the authors.
    citation   Describes the citation.

    """
    def __init__(self):
        # All three members are plain strings and start out empty.
        self.number = self.authors = self.citation = ''
95
class Iterator:
    """Returns one record at a time from a Prodoc file; DEPRECATED.

    Methods:
    next   Return the next record from the stream, or None.

    Iterating over the object calls next() until it returns None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Emits a DeprecationWarning.  Raises ValueError if handle is
        neither a file, a classic class instance, nor an object with a
        readline attribute.

        """
        import warnings
        warnings.warn("Bio.Prosite.Prodoc.Iterator is deprecated; we recommend using the function Bio.Prosite.Prodoc.parse instead. Please contact the Biopython developers at biopython-dev@biopython.org if you cannot use Bio.Prosite.Prodoc.parse instead of Bio.Prosite.Prodoc.Iterator.",
                      DeprecationWarning)
        # The original type test is kept, but anything exposing readline
        # is accepted too, so handles such as cStringIO objects (which
        # are neither FileType nor InstanceType) are no longer rejected.
        is_classic_handle = (type(handle) is FileType or
                             type(handle) is InstanceType)
        if not is_classic_handle and not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next Prodoc record from the file.  If no more records,
        return None.

        Lines are read up to and including the next line that starts
        with '{END}' (or to the end of the data).  The joined text is
        returned as is when no parser was given, otherwise the result
        of parser.parse() on that text is returned.

        """
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            lines.append(line)
            if line[:5] == '{END}':
                break

        if not lines:
            return None

        data = "".join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        # Two-argument iter(): call next() until it returns None.
        return iter(self.next, None)
145
class Dictionary:
    """Accesses a Prodoc file using a dictionary interface.

    Lookups go through an index that maps each key to a (start, length)
    pair locating the record inside the indexed file.  Attributes not
    found on this object are looked up on the index.

    """
    # Index key under which the name of the indexed file is stored.
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prodoc Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        # The index remembers which file it was built from.
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        return len(self._index)

    def __getitem__(self, key):
        offset, size = self._index[key]
        self._handle.seek(offset)
        raw = self._handle.read(size)
        if self._parser is None:
            return raw
        return self._parser.parse(File.StringHandle(raw))

    def __getattr__(self, name):
        # Only reached for attributes missing on this object.
        return getattr(self._index, name)
179
class ExPASyDictionary:
    """Access PRODOC at ExPASy using a read-only dictionary interface.

    DEPRECATED; creating an instance emits a DeprecationWarning.  Only
    item lookup, get() and has_key() are supported; every other
    dictionary method raises NotImplementedError.

    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PRODOC.  parser is an optional
        parser (e.g. Prodoc.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        warnings.warn("Bio.Prosite.Prodoc.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_prosite_raw instead.",
                      DeprecationWarning)

        self.delay = delay
        self.parser = parser
        self.last_query_time = None

    def __len__(self):
        raise NotImplementedError("Prodoc contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """has_key(self, id) -> bool

        Return True if looking up id succeeds, False if it raises
        KeyError.  The lookup itself is performed, so this costs a query.

        """
        try:
            self[id]
        except KeyError:
            return False
        return True

    def get(self, id, failobj=None):
        """get(self, id, failobj=None) -> object

        Return the entry for id, or failobj if the lookup raises KeyError.

        """
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a Prodoc entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.

        """
        import time
        from Bio.WWW import ExPASy
        # First, check to see if enough time has passed since my
        # last query.
        if self.last_query_time is not None:
            delay = self.last_query_time + self.delay - time.time()
            if delay > 0.0:
                time.sleep(delay)
        self.last_query_time = time.time()

        # Both a failed download and a page without data are reported
        # to the caller as a missing key.
        try:
            handle = ExPASy.get_prodoc_entry(id)
        except IOError:
            raise KeyError(id)
        try:
            handle = File.StringHandle(_extract_record(handle))
        except ValueError:
            raise KeyError(id)

        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
263
class RecordParser(AbstractParser):
    """Parser that turns Prodoc data into a Record object.

    The work is split between a _Scanner, which reads the handle, and a
    _RecordConsumer, which builds the record from the scanner's events.

    """
    def __init__(self):
        self._consumer = _RecordConsumer()
        self._scanner = _Scanner()

    def parse(self, handle):
        """Scan handle and return whatever the consumer holds afterwards."""
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
275
class _Scanner:
    """Scans Prodoc-formatted data.

    Tested with:
    Release 15.0, July 1998

    """
    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prodoc data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        # Keep scanning records until the handle has no lines left.
        while 1:
            line = uhandle.peekline()
            if not line:
                break
            elif is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
                continue
            else:
                self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        """Scan one record, from its accession line to its '{END}' line."""
        consumer.start_record()

        self._scan_accession(uhandle, consumer)
        self._scan_prosite_refs(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{BEGIN}')
        self._scan_text(uhandle, consumer)
        self._scan_refs(uhandle, consumer)
        # NOTE(review): _scan_copyright is not defined anywhere in the
        # visible part of this class - confirm it exists in the full file.
        self._scan_copyright(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{END}')

        consumer.end_record()

    def _scan_accession(self, uhandle, consumer):
        """Pass the line starting with '{PDOC' to consumer.accession."""
        read_and_call(uhandle, consumer.accession, start='{PDOC')

    def _scan_prosite_refs(self, uhandle, consumer):
        """Pass each leading '{PS' line to consumer.prosite_reference."""
        while attempt_read_and_call(uhandle, consumer.prosite_reference,
                                    start='{PS'):
            pass

    def _scan_text(self, uhandle, consumer):
        """Pass free text lines to consumer.text.

        Stops at the first line that looks like the start of a
        reference ('[' in column 0 and '] ' in columns 3-4) or that
        starts with '{END}'; that line is pushed back onto the handle.

        """
        while 1:
            line = safe_readline(uhandle)
            # Slices instead of indexing: a short line starting with '['
            # is then treated as text rather than raising IndexError.
            starts_reference = line[:1] == '[' and line[3:5] == '] '
            if starts_reference or line[:5] == '{END}':
                uhandle.saveline(line)
                break
            consumer.text(line)

    def _scan_refs(self, uhandle, consumer):
        """Pass reference lines to consumer.reference.

        Stops at the first blank line or line starting with '{END}';
        that line is pushed back onto the handle.

        """
        while 1:
            line = safe_readline(uhandle)
            if line[:5] == '{END}' or is_blank_line(line):
                uhandle.saveline(line)
                break
            consumer.reference(line)
344
353
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prodoc record to a Record object.

    Members:
    data   Record with Prodoc data.

    """
    def __init__(self):
        self.data = None
        # Reference that continuation lines are appended to.
        self._ref = None

    def start_record(self):
        self.data = Record()
        # A continuation line must never extend a reference that
        # belongs to the previous record.
        self._ref = None

    def end_record(self):
        self._clean_data()

    def accession(self, line):
        """Store the accession from a line of the form '{PDOCnnnnn}'.

        Raises ValueError if the line is not wrapped in braces or the
        text inside them does not start with 'PDOC'.

        """
        line = line.rstrip()
        # Slices rather than indexing, so an empty line is reported as
        # a ValueError instead of an IndexError.
        if line[:1] != '{' or line[-1:] != '}':
            raise ValueError("I don't understand accession line\n%s" % line)
        acc = line[1:-1]
        if acc[:4] != 'PDOC':
            raise ValueError("Invalid accession in line\n%s" % line)
        self.data.accession = acc

    def prosite_reference(self, line):
        """Store an (accession, name) pair from a '{acc; name}' line.

        Raises ValueError if the line is not wrapped in braces or does
        not split into exactly two parts on '; '.

        """
        line = line.rstrip()
        if line[:1] != '{' or line[-1:] != '}':
            raise ValueError(
                "I don't understand prosite reference line\n%s" % line)
        acc, name = line[1:-1].split('; ')
        self.data.prosite_refs.append((acc, name))

    def text(self, line):
        self.data.text = self.data.text + line

    def reference(self, line):
        """Add a reference line to the record.

        A line with '[' in column 0 and ']' in column 3 starts a new
        Reference.  A line starting with four spaces continues the
        citation of the current one.  Anything else raises ValueError,
        as does a continuation line seen before any reference was
        started.

        """
        if line[:1] == '[' and line[3:4] == ']':  # new reference
            ref = Reference()
            ref.number = line[1:3].strip()
            if line[1] == 'E':
                # If it's an electronic reference, then the URL is on the
                # line, instead of the author.
                ref.citation = line[4:].strip()
            else:
                ref.authors = line[4:].strip()
            self._ref = ref
            self.data.references.append(ref)
        elif line[:4] == ' ' * 4:
            # The four-character prefix has to be compared with four
            # spaces; a single space can never match it.
            if self._ref is None:
                raise ValueError("Unnumbered reference lines\n%s" % line)
            self._ref.citation = self._ref.citation + line[5:]
        else:
            # A bare string is not a valid exception; raise ValueError
            # like the other parse errors in this class.
            raise ValueError(
                "I don't understand the reference line\n%s" % line)

    def _clean_data(self):
        # get rid of trailing newlines
        for ref in self.data.references:
            ref.citation = ref.citation.rstrip()
            ref.authors = ref.authors.rstrip()
412
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prodoc file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the accession of the record will be used.

    Each key is mapped to a (start, length) pair.  The span runs from
    the end of the previous record (or the start of the file) to the
    end of the record's '{END}' line.

    Raises ValueError if filename does not exist, and KeyError if a
    record yields an empty key or a key that is already in the index.

    """
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    parser = RecordParser()
    handle = open(filename)
    try:
        # The file is read with readline() rather than by iterating over
        # it: iteration uses a read-ahead buffer, so tell() would not
        # report where the record just read actually ends.
        start = long(handle.tell())
        lines = []
        while 1:
            line = handle.readline()
            if not line:
                # Lines after the last '{END}' do not form a record.
                break
            lines.append(line)
            if line[:5] != '{END}':
                continue

            end = long(handle.tell())
            record = parser.parse(File.StringHandle("".join(lines)))
            lines = []

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.accession

            if not key:
                raise KeyError("empty key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, end - start
            start = end
    finally:
        # Close the data file even when a bad key aborts the indexing.
        handle.close()
449 450 # This function can be deprecated once Bio.Prosite.Prodoc.ExPASyDictionary 451 # is removed.
def _extract_record(handle):
    """_extract_record(handle) -> str

    Pull the PRODOC data out of a web page read from handle.  Raises a
    ValueError if the page holds no such data.

    """
    # All the data appears between tags:
    # <pre width = 80>ID   NIR_SIR; PATTERN.
    # </PRE>
    class _PreTextCollector(sgmllib.SGMLParser):
        """Collects the text found inside <pre> elements."""
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.pieces = []
            self._inside_pre = 0

        def start_pre(self, attrs):
            self._inside_pre = 1

        def end_pre(self):
            self._inside_pre = 0

        def handle_data(self, data):
            if self._inside_pre:
                self.pieces.append(data)

        def do_br(self, attrs):
            # A <br> inside the block stands for a line break.
            if self._inside_pre:
                self.pieces.append('\n')

    collector = _PreTextCollector()
    collector.feed(handle.read())
    text = ''.join(collector.pieces).lstrip()
    if not text:
        raise ValueError("No data found in web page.")
    return text