Package Bio :: Package Prosite
[hide private]
[frames | no frames]

Source Code for Package Bio.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  7  # This code is part of the Biopython distribution and governed by its 
  8  # license.  Please see the LICENSE file that should have been included 
  9  # as part of this package. 
 10  """ 
 11  This module provides code to work with the prosite dat file from 
 12  Prosite. 
 13  http://www.expasy.ch/prosite/ 
 14   
 15  Tested with: 
 16  Release 15.0, July 1998 
 17  Release 16.0, July 1999 
 18  Release 17.0, Dec 2001 
 19  Release 19.0, Mar 2006 
 20   
 21   
 22  Classes: 
 23  Record                Holds Prosite data. 
 24  PatternHit            Holds data from a hit against a Prosite pattern. 
 25  Iterator              Iterates over entries in a Prosite file. 
 26  Dictionary            Accesses a Prosite file using a dictionary interface. 
 27  ExPASyDictionary      Accesses Prosite records from ExPASy. 
 28  RecordParser          Parses a Prosite record into a Record object. 
 29   
 30  _Scanner              Scans Prosite-formatted data. 
 31  _RecordConsumer       Consumes Prosite data to a Record object. 
 32   
 33   
 34  Functions: 
 35  scan_sequence_expasy  Scan a sequence for occurrences of Prosite patterns. 
 36  index_file            Index a Prosite file for a Dictionary. 
 37  _extract_record       Extract Prosite data from a web page. 
 38  _extract_pattern_hits Extract Prosite patterns from a web page. 
 39   
 40  """ 
 41  from types import * 
 42  import string 
 43  import re 
 44  import sgmllib 
 45  from Bio import File 
 46  from Bio import Index 
 47  from Bio.ParserSupport import * 
 48  from Bio.WWW import ExPASy 
 49  from Bio.WWW import RequestLimiter 
 50   
class Record:
    """Holds information from a Prosite record.

    Every member listed here is created by __init__, so it can always be
    read even when the corresponding line was absent from the entry.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions.  (strings)

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type
    cc_scaling_db
    cc_author
    cc_ft_key
    cc_ft_desc
    cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    """
    def __init__(self):
        """Create an empty record: '' for text, [] for lists, None for counts."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These six are documented members but used to be created only when
        # the matching CC qualifier was parsed; give them a default so that
        # reading them never raises AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
137
class PatternHit:
    """Holds information from a hit against a Prosite pattern.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    accession      e.g. PS00387
    pdoc           ID of the PROSITE DOCumentation.
    description    Free-format description.
    matches        List of tuples (start, end, sequence) where
                   start and end are indexes of the match, and sequence is
                   the sequence matched.

    """
    def __init__(self):
        """Create a hit with no identifying data and no matches."""
        self.matches = []
        self.name = None
        self.accession = None
        self.pdoc = None
        self.description = None

    def __str__(self):
        """Return a multi-line report: header, description, then the matches.

        When there is more than one match a count line is added and each
        match is numbered from 1; a single match is listed unnumbered.
        """
        total = len(self.matches)
        numbered = total > 1
        report = [
            "%s %s %s" % (self.accession, self.pdoc, self.name),
            self.description,
            '',
        ]
        if numbered:
            report.append("Number of matches: %s" % total)
        for position, (start, end, seq) in enumerate(self.matches):
            span = "%d-%d" % (start, end)
            if numbered:
                report.append("%7d %10s %s" % (position + 1, span, seq))
            else:
                report.append("%7s %10s %s" % (' ', span, seq))
        return '\n'.join(report)
172
class Iterator:
    """Returns one record at a time from a Prosite file.

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Raises a ValueError if handle does not look like a file.

        """
        # Duck-type the handle instead of comparing its exact type, so that
        # any file-like object is accepted, not only builtin files and
        # old-style class instances.
        # NOTE(review): assumes File.UndoHandle reads through
        # handle.readline() -- confirm against Bio.File.
        if not hasattr(handle, 'readline'):
            raise ValueError("I expected a file handle or file-like object")
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next Prosite record from the file.  If no more records,
        return None.

        A leading block of 'CC' lines (the copyright notice) is skipped up
        to and including its '//' terminator.  Raises a SyntaxError if that
        block contains a line that is neither 'CC' nor '//'.

        """
        # Skip the copyright info, if it's the first record.
        line = self._uhandle.peekline()
        if line[:2] == 'CC':
            while 1:
                line = self._uhandle.readline()
                if not line:
                    break
                if line[:2] == '//':
                    break
                if line[:2] != 'CC':
                    raise SyntaxError("Oops, where's the copyright?")

        # Collect everything up to and including the '//' terminator.
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            lines.append(line)
            if line[:2] == '//':
                break

        if not lines:
            return None

        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)
232
class Dictionary:
    """Accesses a Prosite file using a dictionary interface.

    The index maps each key to a (start, length) pair locating the record
    in the data file; the data file's name is stored in the index under a
    reserved key.

    """
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prosite Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        data_filename = self._index[Dictionary.__filename_key]
        self._handle = open(data_filename)
        self._parser = parser

    def __len__(self):
        """Return the number of entries held by the index."""
        return len(self._index)

    def __getitem__(self, key):
        """Return the record stored under key, parsed if a parser was given."""
        offset, size = self._index[key]
        self._handle.seek(offset)
        raw = self._handle.read(size)
        if self._parser is None:
            return raw
        return self._parser.parse(File.StringHandle(raw))

    def __getattr__(self, name):
        """Forward any other attribute lookup to the underlying index."""
        return getattr(self._index, name)
266
class ExPASyDictionary:
    """Access PROSITE at ExPASy using a read-only dictionary interface.

    Only lookups (d[id], get, has_key) are supported.  Every mutating or
    enumerating dictionary method raises NotImplementedError.

    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PROSITE.  parser is an optional
        parser (e.g. Prosite.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        self.parser = parser
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError("Prosite contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self, *args, **kwargs):
        # Accept (and ignore) whatever a dict-style caller passes, so the
        # caller gets NotImplementedError rather than a TypeError about the
        # argument count.
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """has_key(self, id) -> bool

        Return 1 if the entry can be retrieved, 0 if the lookup raises a
        KeyError.  This performs a full lookup of the entry.

        """
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """get(self, id, failobj=None) -> object

        Return the entry for id, or failobj if the lookup raises a KeyError.

        """
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a Prosite entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.

        """
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()

        try:
            handle = ExPASy.get_prosite_entry(id)
        except IOError:
            raise KeyError(id)
        # A page without any record data is reported as a missing key too.
        try:
            handle = File.StringHandle(_extract_record(handle))
        except ValueError:
            raise KeyError(id)

        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
339
class RecordParser(AbstractParser):
    """Parses Prosite data into a Record object.

    Pairs a _Scanner with a _RecordConsumer; both are created once and
    reused for every call to parse.

    """
    def __init__(self):
        """Create the scanner and the consumer that receives its events."""
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Scan the Prosite data in handle and return the consumer's data.

        handle is a file-like object containing Prosite data.
        """
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
351
class _Scanner:
    """Scans Prosite-formatted data.

    Tested with:
    Release 15.0, July 1998

    """
    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prosite data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        Raises a SyntaxError if a non-blank line starts neither a record
        ('ID') nor a copyright block ('CC').

        """
        # Reuse an UndoHandle as-is; wrap anything else so lines can be
        # peeked at before they are consumed.
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        line = uhandle.peekline()
        while line:
            tag = line[:2]
            if is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
            elif tag == 'ID':
                self._scan_record(uhandle, consumer)
            elif tag == 'CC':
                self._scan_copyrights(uhandle, consumer)
            else:
                raise SyntaxError("There doesn't appear to be a record")
            line = uhandle.peekline()

    def _scan_copyrights(self, uhandle, consumer):
        """Scan a block of 'CC' lines followed by a '//' terminator."""
        consumer.start_copyrights()
        self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
        self._scan_terminator(uhandle, consumer)
        consumer.end_copyrights()

    def _scan_record(self, uhandle, consumer):
        """Scan one record by running every function in _scan_fns in order."""
        consumer.start_record()
        for scan_fn in self._scan_fns:
            scan_fn(self, uhandle, consumer)

            # In Release 15.0, C_TYPE_LECTIN_1 has the DO line before
            # the 3D lines, instead of the other way around.
            # Thus, I'll give the 3D lines another chance after the DO lines
            # are finished.
            # _scan_fns holds plain functions, so compare against the
            # function behind the bound method.
            if scan_fn is self._scan_do.im_func:
                self._scan_3d(uhandle, consumer)
        consumer.end_record()

    def _scan_line(self, line_type, uhandle, event_fn,
                   exactly_one=None, one_or_more=None, any_number=None,
                   up_to_one=None):
        """Read lines starting with line_type, passing each to event_fn.

        The keyword flags select how many lines are expected:
        exactly_one   one mandatory line
        one_or_more   one mandatory line, then any further matching lines
        any_number    zero or more matching lines
        up_to_one     at most one matching line

        Callers are expected to set a single flag to a true value; this
        is not checked.
        """
        # This does not guarantee any parameter safety, but the keyword
        # style reads well at the call sites.  The alternative considered
        # was a pair of min_lines / max_lines parameters.
        if exactly_one or one_or_more:
            read_and_call(uhandle, event_fn, start=line_type)
        if one_or_more or any_number:
            while attempt_read_and_call(uhandle, event_fn, start=line_type):
                pass
        if up_to_one:
            attempt_read_and_call(uhandle, event_fn, start=line_type)

    def _scan_id(self, uhandle, consumer):
        self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)

    def _scan_ac(self, uhandle, consumer):
        self._scan_line('AC', uhandle, consumer.accession, exactly_one=1)

    def _scan_dt(self, uhandle, consumer):
        self._scan_line('DT', uhandle, consumer.date, exactly_one=1)

    def _scan_de(self, uhandle, consumer):
        self._scan_line('DE', uhandle, consumer.description, exactly_one=1)

    def _scan_pa(self, uhandle, consumer):
        self._scan_line('PA', uhandle, consumer.pattern, any_number=1)

    def _scan_ma(self, uhandle, consumer):
        # ZN2_CY6_FUNGAL_2 and DNAJ_2 in Release 15 contain a CC line buried
        # within the 'MA' lines.  That case is handled by scanning MA/NR/CC
        # a second time (see _scan_fns) rather than by look-ahead here.
        self._scan_line('MA', uhandle, consumer.matrix, any_number=1)

    def _scan_ru(self, uhandle, consumer):
        self._scan_line('RU', uhandle, consumer.rule, any_number=1)

    def _scan_nr(self, uhandle, consumer):
        self._scan_line('NR', uhandle, consumer.numerical_results,
                        any_number=1)

    def _scan_cc(self, uhandle, consumer):
        self._scan_line('CC', uhandle, consumer.comment, any_number=1)

    def _scan_dr(self, uhandle, consumer):
        self._scan_line('DR', uhandle, consumer.database_reference,
                        any_number=1)

    def _scan_3d(self, uhandle, consumer):
        self._scan_line('3D', uhandle, consumer.pdb_reference,
                        any_number=1)

    def _scan_do(self, uhandle, consumer):
        self._scan_line('DO', uhandle, consumer.documentation, exactly_one=1)

    def _scan_terminator(self, uhandle, consumer):
        self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)

    # Order in which the line types of one record are scanned.
    _scan_fns = [
        _scan_id,
        _scan_ac,
        _scan_dt,
        _scan_de,
        _scan_pa,
        _scan_ma,
        _scan_ru,
        _scan_nr,
        _scan_cc,

        # This is a really dirty hack, and should be fixed properly at
        # some point.  ZN2_CY6_FUNGAL_2, DNAJ_2 in Rel 15 and PS50309
        # in Rel 17 have lines out of order.  Thus, I have to rescan
        # these, which decreases performance.
        _scan_ma,
        _scan_nr,
        _scan_cc,

        _scan_dr,
        _scan_3d,
        _scan_do,
        _scan_terminator,
    ]
506
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prosite record to a Record object.

    Members:
    data    Record with Prosite data.

    Each event method receives one raw line of the entry.  Lines that
    cannot be understood raise a SyntaxError.

    """
    # 'CC' qualifiers whose value is stored unchanged -> Record attribute.
    _comment_attrs = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }

    # 'NR' qualifiers of the form hits(seqs) -> Record attribute.
    _hit_count_attrs = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }

    # 'DR' type flag -> Record list that collects (accession, name).
    _reference_lists = {
        'T': 'dr_positive',
        'F': 'dr_false_pos',
        'N': 'dr_false_neg',
        'P': 'dr_potential',
        '?': 'dr_unknown',
    }

    def __init__(self):
        self.data = None

    def start_record(self):
        self.data = Record()

    def end_record(self):
        # NOTE(review): _clean_record is not defined in this class;
        # presumably AbstractConsumer resolves it -- confirm.
        self._clean_record(self.data)

    def identification(self, line):
        """Parse an 'ID' line into the record's name and type."""
        cols = line.split()
        if len(cols) != 3:
            raise SyntaxError("I don't understand identification line\n%s" %
                              line)
        self.data.name = self._chomp(cols[1])    # don't want ';'
        self.data.type = self._chomp(cols[2])    # don't want '.'

    def accession(self, line):
        """Parse an 'AC' line into the record's accession."""
        cols = line.split()
        if len(cols) != 2:
            raise SyntaxError("I don't understand accession line\n%s" % line)
        self.data.accession = self._chomp(cols[1])

    def date(self, line):
        """Parse a 'DT' line into created, data_update and info_update.

        The dates are stored upper-cased.
        """
        cols = line.upper().split()

        # Release 15.0 contains both 'INFO UPDATE' and 'INF UPDATE'
        # Check the length first so that a truncated line is reported as a
        # SyntaxError, not as an IndexError.
        if len(cols) < 9 or \
           cols[2] != '(CREATED);' or \
           cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
           cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
            raise SyntaxError("I don't understand date line\n%s" % line)

        self.data.created = cols[1]
        self.data.data_update = cols[3]
        self.data.info_update = cols[6]

    def description(self, line):
        self.data.description = self._clean(line)

    def pattern(self, line):
        # A pattern may span several 'PA' lines; concatenate them.
        self.data.pattern = self.data.pattern + self._clean(line)

    def matrix(self, line):
        self.data.matrix.append(self._clean(line))

    def rule(self, line):
        self.data.rules.append(self._clean(line))

    def numerical_results(self, line):
        """Parse the ';'-separated /QUAL=value items of an 'NR' line."""
        for col in self._clean(line).split(';'):
            if not col:
                continue
            qual, data = [part.lstrip() for part in col.split('=', 1)]
            if qual == '/RELEASE':
                release, seqs = data.split(',')
                self.data.nr_sp_release = release
                self.data.nr_sp_seqs = int(seqs)
            elif qual == '/FALSE_NEG':
                self.data.nr_false_neg = int(data)
            elif qual == '/PARTIAL':
                self.data.nr_partial = int(data)
            elif qual in self._hit_count_attrs:
                m = re.match(r'(\d+)\((\d+)\)', data)
                if not m:
                    raise SyntaxError("Broken data %s in comment line\n%s" %
                                      (repr(data), line))
                hits = tuple(map(int, m.groups()))
                setattr(self.data, self._hit_count_attrs[qual], hits)
            else:
                raise SyntaxError("Unknown qual %s in comment line\n%s" %
                                  (repr(qual), line))

    def comment(self, line):
        """Parse the ';'-separated /QUAL=value items of a 'CC' line."""
        for col in self._clean(line).split(';'):
            # DNAJ_2 in Release 15 has a non-standard comment line:
            # CC   Automatic scaling using reversed database
            # Throw it away.  (Should I keep it?)
            if not col or col[:17] == 'Automatic scaling':
                continue
            # Split on the first '=' only, so a value may contain '='.
            qual, data = [part.lstrip() for part in col.split('=', 1)]
            if qual == '/SITE':
                # Split on the first ',' only, so the description may
                # contain commas.
                pos, desc = data.split(',', 1)
                self.data.cc_site.append((int(pos), desc))
            elif qual in self._comment_attrs:
                setattr(self.data, self._comment_attrs[qual], data)
            else:
                raise SyntaxError("Unknown qual %s in comment line\n%s" %
                                  (repr(qual), line))

    def database_reference(self, line):
        """Parse the ';'-separated 'accession, name, flag' items of a 'DR' line."""
        for ref in self._clean(line).split(';'):
            if not ref:
                continue
            acc, name, flag = [part.strip() for part in ref.split(',')]
            if flag not in self._reference_lists:
                raise SyntaxError("I don't understand type flag %s" % flag)
            getattr(self.data, self._reference_lists[flag]).append((acc, name))

    def pdb_reference(self, line):
        cols = line.split()
        for pdb_id in cols[1:]:    # get all but the '3D' col
            self.data.pdb_structs.append(self._chomp(pdb_id))

    def documentation(self, line):
        self.data.pdoc = self._chomp(self._clean(line))

    def terminator(self, line):
        pass

    def _chomp(self, word, to_chomp='.,;'):
        # Remove the punctuation at the end of a word.  An empty word is
        # returned unchanged.
        if word and word[-1] in to_chomp:
            return word[:-1]
        return word

    def _clean(self, line, rstrip=1):
        # Clean up a line: drop the first five characters (the line code
        # and its padding) and, unless rstrip is false, trailing whitespace.
        if rstrip:
            return line[5:].rstrip()
        return line[5:]
670
def scan_sequence_expasy(seq=None, id=None, exclude_frequent=None):
    """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
    list of PatternHit's

    Search a sequence for occurrences of Prosite patterns.  You can
    specify either a sequence in seq or a SwissProt/trEMBL ID or accession
    in id.  Only one of those should be given.  If exclude_frequent
    is true, then the patterns with the high probability of occurring
    will be excluded.

    Raises a ValueError if both or neither of seq and id are given.

    """
    # Exactly one of the two must be supplied.
    if bool(seq) == bool(id):
        raise ValueError("Please specify either a sequence or an id")
    results_page = ExPASy.scanprosite1(seq, id, exclude_frequent)
    return _extract_pattern_hits(results_page)
686
def _extract_pattern_hits(handle):
    """_extract_pattern_hits(handle) -> list of PatternHit's

    Extract hits from a web page.  Raises a ValueError if there
    was an error in the query.

    """
    class HitPageParser(sgmllib.SGMLParser):
        # Collects PatternHit objects from the text inside <PRE> sections.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.hits = []
            # Assume failure until a <PRE> section is seen.
            self.broken_message = 'Some error occurred'
            self._in_pre = 0
            self._current_hit = None
            self._last_found = None    # Save state of parsing

        def handle_data(self, data):
            # Error texts are recognised anywhere on the page.
            if 'try again' in data:
                self.broken_message = data
                return
            if data == 'illegal':
                self.broken_message = 'Sequence contains illegal characters'
                return
            # Everything else only matters inside <PRE>, and only if it is
            # not pure whitespace.
            if not self._in_pre or not data.strip():
                return

            # The fields of a hit arrive in a fixed order: pdoc, accession,
            # name, description, then any number of match lines.
            state = self._last_found
            hit = self._current_hit
            if state is None:
                if data[:4] == 'PDOC':
                    hit.pdoc = data
                    self._last_found = 'pdoc'
            elif state == 'pdoc':
                if data[:2] != 'PS':
                    raise SyntaxError("Expected accession but got:\n%s" % data)
                hit.accession = data
                self._last_found = 'accession'
            elif state == 'accession':
                hit.name = data
                self._last_found = 'name'
            elif state == 'name':
                hit.description = data
                self._last_found = 'description'
            elif state == 'description':
                for start, end, seq in re.findall(r'(\d+)-(\d+) (\w+)', data):
                    hit.matches.append((int(start), int(end), seq))

        def do_hr(self, attrs):
            # <HR> inside a <PRE> section means a new hit.
            if self._in_pre:
                self._current_hit = PatternHit()
                self.hits.append(self._current_hit)
                self._last_found = None

        def start_pre(self, attrs):
            self._in_pre = 1
            self.broken_message = None   # Probably not broken

        def end_pre(self):
            self._in_pre = 0

    page = HitPageParser()
    page.feed(handle.read())
    if page.broken_message:
        raise ValueError(page.broken_message)
    return page.hits
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    Raises a ValueError if filename does not exist, and a KeyError if a
    record produces an empty key or a key that was already indexed.

    """
    # Imported here because the module does not import os at the top.
    import os

    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # The data file name is stored under Dictionary's private key, which
    # has to be spelled in its name-mangled form outside that class.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        records = Iterator(handle, parser=RecordParser())
        while 1:
            # Remember where the record starts and how many bytes it spans.
            start = records._uhandle.tell()
            rec = records.next()
            length = records._uhandle.tell() - start

            if rec is None:
                break
            if rec2key is not None:
                key = rec2key(rec)
            else:
                key = rec.name

            if not key:
                raise KeyError("empty key was produced")
            elif index.has_key(key):
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Always release the data file, including when a key is rejected.
        handle.close()
788
def _extract_record(handle):
    """_extract_record(handle) -> str

    Extract PROSITE data from a web page.  Raises a ValueError if no
    data was found in the web page.

    """
    # All the data appears between tags:
    # <pre width = 80>ID   NIR_SIR; PATTERN.
    # </PRE>
    class PreTextParser(sgmllib.SGMLParser):
        # Gathers the text found inside <PRE> sections; a <BR> there is
        # turned into a newline.
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.data = []
            self._in_pre = 0

        def handle_data(self, data):
            if not self._in_pre:
                return
            self.data.append(data)

        def do_br(self, attrs):
            if not self._in_pre:
                return
            self.data.append('\n')

        def start_pre(self, attrs):
            self._in_pre = 1

        def end_pre(self):
            self._in_pre = 0

    page = PreTextParser()
    page.feed(handle.read())
    pieces = page.data
    if not pieces:
        raise ValueError("No data found in web page.")
    return ''.join(pieces)