Package Bio :: Package SwissProt :: Module SProt
[hide private]
[frames] | no frames]

Source Code for Module Bio.SwissProt.SProt

   1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """ 
   7  This module provides code to work with the sprotXX.dat file from 
   8  SwissProt. 
   9  http://www.expasy.ch/sprot/sprot-top.html 
  10   
  11  Tested with: 
  12  Release 37, Release 38, Release 39 
  13   
  14  Limited testing with: 
  15  Release 51, 54 
  16   
  17   
  18  Classes: 
  19  Record             Holds SwissProt data. 
  20  Reference          Holds reference data from a SwissProt entry. 
  21  Iterator           Iterates over entries in a SwissProt file. 
  22  Dictionary         Accesses a SwissProt file using a dictionary interface. 
  23  ExPASyDictionary   Accesses SwissProt records from ExPASy. 
  24  RecordParser       Parses a SwissProt record into a Record object. 
  25  SequenceParser     Parses a SwissProt record into a SeqRecord object. 
  26   
  27  _Scanner           Scans SwissProt-formatted data. 
  28  _RecordConsumer    Consumes SwissProt data to a Record object. 
  29  _SequenceConsumer  Consumes SwissProt data to a Seq object. 
  30   
  31   
  32  Functions: 
  33  index_file         Index a SwissProt file for a Dictionary. 
  34   
  35  """ 
  36  from types import * 
  37  import os 
  38  from Bio import File 
  39  from Bio import Index 
  40  from Bio import Alphabet 
  41  from Bio import Seq 
  42  from Bio import SeqRecord 
  43  from Bio.ParserSupport import * 
  44  from Bio.WWW import ExPASy 
  45  from Bio.WWW import RequestLimiter 
  46   
  47  _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation 
  48   
49 -class Record:
50 """Holds information from a SwissProt record. 51 52 Members: 53 entry_name Name of this entry, e.g. RL1_ECOLI. 54 data_class Either 'STANDARD' or 'PRELIMINARY'. 55 molecule_type Type of molecule, 'PRT', 56 sequence_length Number of residues. 57 58 accessions List of the accession numbers, e.g. ['P00321'] 59 created A tuple of (date, release). 60 sequence_update A tuple of (date, release). 61 annotation_update A tuple of (date, release). 62 63 description Free-format description. 64 gene_name Gene name. See userman.txt for description. 65 organism The source of the sequence. 66 organelle The origin of the sequence. 67 organism_classification The taxonomy classification. List of strings. 68 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 69 taxonomy_id A list of NCBI taxonomy id's. 70 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 71 if any. 72 references List of Reference objects. 73 comments List of strings. 74 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 75 keywords List of the keywords. 76 features List of tuples (key name, from, to, description). 77 from and to can be either integers for the residue 78 numbers, '<', '>', or '?' 79 80 seqinfo tuple of (length, molecular weight, CRC32 value) 81 sequence The sequence. 82 83 """
84 - def __init__(self):
85 self.entry_name = None 86 self.data_class = None 87 self.molecule_type = None 88 self.sequence_length = None 89 90 self.accessions = [] 91 self.created = None 92 self.sequence_update = None 93 self.annotation_update = None 94 95 self.description = '' 96 self.gene_name = '' 97 self.organism = '' 98 self.organelle = '' 99 self.organism_classification = [] 100 self.taxonomy_id = [] 101 self.host_organism = [] 102 self.references = [] 103 self.comments = [] 104 self.cross_references = [] 105 self.keywords = [] 106 self.features = [] 107 108 self.seqinfo = None 109 self.sequence = ''
110
111 -class Reference:
112 """Holds information from 1 references in a SwissProt entry. 113 114 Members: 115 number Number of reference in an entry. 116 positions Describes extent of work. list of strings. 117 comments Comments. List of (token, text). 118 references References. List of (dbname, identifier) 119 authors The authors of the work. 120 title Title of the work. 121 location A citation for the work. 122 123 """
124 - def __init__(self):
125 self.number = None 126 self.positions = [] 127 self.comments = [] 128 self.references = [] 129 self.authors = '' 130 self.title = '' 131 self.location = ''
132
133 -class Iterator:
134 """Returns one record at a time from a SwissProt file. 135 136 Methods: 137 next Return the next record from the stream, or None. 138 139 """
140 - def __init__(self, handle, parser=None):
141 """__init__(self, handle, parser=None) 142 143 Create a new iterator. handle is a file-like object. parser 144 is an optional Parser object to change the results into another form. 145 If set to None, then the raw contents of the file will be returned. 146 147 """ 148 if type(handle) is not FileType and type(handle) is not InstanceType: 149 raise ValueError, "I expected a file handle or file-like object" 150 self._uhandle = File.UndoHandle(handle) 151 self._parser = parser
152
153 - def next(self):
154 """next(self) -> object 155 156 Return the next swissprot record from the file. If no more records, 157 return None. 158 159 """ 160 lines = [] 161 while 1: 162 line = self._uhandle.readline() 163 if not line: 164 break 165 lines.append(line) 166 if line[:2] == '//': 167 break 168 169 if not lines: 170 return None 171 172 data = ''.join(lines) 173 if self._parser is not None: 174 return self._parser.parse(File.StringHandle(data)) 175 return data
176
177 - def __iter__(self):
178 return iter(self.next, None)
179
180 -class Dictionary:
181 """Accesses a SwissProt file using a dictionary interface. 182 183 """ 184 __filename_key = '__filename' 185
186 - def __init__(self, indexname, parser=None):
187 """__init__(self, indexname, parser=None) 188 189 Open a SwissProt Dictionary. indexname is the name of the 190 index for the dictionary. The index should have been created 191 using the index_file function. parser is an optional Parser 192 object to change the results into another form. If set to None, 193 then the raw contents of the file will be returned. 194 195 """ 196 self._index = Index.Index(indexname) 197 self._handle = open(self._index[self.__filename_key]) 198 self._parser = parser
199
200 - def __len__(self):
201 return len(self._index)
202
203 - def __getitem__(self, key):
204 start, len = self._index[key] 205 self._handle.seek(start) 206 data = self._handle.read(len) 207 if self._parser is not None: 208 return self._parser.parse(File.StringHandle(data)) 209 return data
210
211 - def __getattr__(self, name):
212 return getattr(self._index, name)
213
214 - def keys(self):
215 # I only want to expose the keys for SwissProt. 216 k = self._index.keys() 217 k.remove(self.__filename_key) 218 return k
219
220 -class ExPASyDictionary:
221 """Access SwissProt at ExPASy using a read-only dictionary interface. 222 223 """
224 - def __init__(self, delay=5.0, parser=None):
225 """__init__(self, delay=5.0, parser=None) 226 227 Create a new Dictionary to access SwissProt. parser is an optional 228 parser (e.g. SProt.RecordParser) object to change the results 229 into another form. If set to None, then the raw contents of the 230 file will be returned. delay is the number of seconds to wait 231 between each query. 232 233 """ 234 self.parser = parser 235 self.limiter = RequestLimiter(delay)
236
237 - def __len__(self):
238 raise NotImplementedError, "SwissProt contains lots of entries"
239 - def clear(self):
240 raise NotImplementedError, "This is a read-only dictionary"
241 - def __setitem__(self, key, item):
242 raise NotImplementedError, "This is a read-only dictionary"
243 - def update(self):
244 raise NotImplementedError, "This is a read-only dictionary"
245 - def copy(self):
246 raise NotImplementedError, "You don't need to do this..."
247 - def keys(self):
248 raise NotImplementedError, "You don't really want to do this..."
249 - def items(self):
250 raise NotImplementedError, "You don't really want to do this..."
251 - def values(self):
252 raise NotImplementedError, "You don't really want to do this..."
253
254 - def has_key(self, id):
255 """has_key(self, id) -> bool""" 256 try: 257 self[id] 258 except KeyError: 259 return 0 260 return 1
261
262 - def get(self, id, failobj=None):
263 try: 264 return self[id] 265 except KeyError: 266 return failobj 267 raise "How did I get here?"
268
269 - def __getitem__(self, id):
270 """__getitem__(self, id) -> object 271 272 Return a SwissProt entry. id is either the id or accession 273 for the entry. Raises a KeyError if there's an error. 274 275 """ 276 # First, check to see if enough time has passed since my 277 # last query. 278 self.limiter.wait() 279 280 try: 281 handle = ExPASy.get_sprot_raw(id) 282 except IOError: 283 raise KeyError, id 284 285 if self.parser is not None: 286 return self.parser.parse(handle) 287 return handle.read()
288
289 -class RecordParser(AbstractParser):
290 """Parses SwissProt data into a Record object. 291 292 """
293 - def __init__(self):
294 self._scanner = _Scanner() 295 self._consumer = _RecordConsumer()
296
297 - def parse(self, handle):
298 self._scanner.feed(handle, self._consumer) 299 return self._consumer.data
300
301 -class SequenceParser(AbstractParser):
302 """Parses SwissProt data into a standard SeqRecord object. 303 """
304 - def __init__(self, alphabet = Alphabet.generic_protein):
305 """Initialize a SequenceParser. 306 307 Arguments: 308 o alphabet - The alphabet to use for the generated Seq objects. If 309 not supplied this will default to the generic protein alphabet. 310 """ 311 self._scanner = _Scanner() 312 self._consumer = _SequenceConsumer(alphabet)
313
314 - def parse(self, handle):
315 self._scanner.feed(handle, self._consumer) 316 return self._consumer.data
317
318 -class _Scanner:
319 """Scans SwissProt-formatted data. 320 321 Tested with: 322 Release 37 323 Release 38 324 """ 325
326 - def feed(self, handle, consumer):
327 """feed(self, handle, consumer) 328 329 Feed in SwissProt data for scanning. handle is a file-like 330 object that contains swissprot data. consumer is a 331 Consumer object that will receive events as the report is scanned. 332 333 """ 334 if isinstance(handle, File.UndoHandle): 335 uhandle = handle 336 else: 337 uhandle = File.UndoHandle(handle) 338 339 while uhandle.peekline(): 340 self._scan_record(uhandle, consumer)
341
342 - def _skip_starstar(self, uhandle) :
343 """Ignores any lines starting **""" 344 #See Bug 2353, some files from the EBI have extra lines 345 #starting "**" (two asterisks/stars), usually between the 346 #features and sequence but not all the time. They appear 347 #to be unofficial automated annotations. e.g. 348 #** 349 #** ################# INTERNAL SECTION ################## 350 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 351 while "**" == uhandle.peekline()[:2] : 352 skip = uhandle.readline()
353 #print "Skipping line: %s" % skip.rstrip() 354
355 - def _scan_record(self, uhandle, consumer):
356 consumer.start_record() 357 for fn in self._scan_fns: 358 self._skip_starstar(uhandle) 359 fn(self, uhandle, consumer) 360 361 # In Release 38, ID N33_HUMAN has a DR buried within comments. 362 # Check for this and do more comments, if necessary. 363 # XXX handle this better 364 if fn is self._scan_dr.im_func: 365 self._scan_cc(uhandle, consumer) 366 self._scan_dr(uhandle, consumer) 367 consumer.end_record()
368
369 - def _scan_line(self, line_type, uhandle, event_fn, 370 exactly_one=None, one_or_more=None, any_number=None, 371 up_to_one=None):
372 # Callers must set exactly one of exactly_one, one_or_more, or 373 # any_number to a true value. I do not explicitly check to 374 # make sure this function is called correctly. 375 376 # This does not guarantee any parameter safety, but I 377 # like the readability. The other strategy I tried was have 378 # parameters min_lines, max_lines. 379 380 if exactly_one or one_or_more: 381 read_and_call(uhandle, event_fn, start=line_type) 382 if one_or_more or any_number: 383 while 1: 384 if not attempt_read_and_call(uhandle, event_fn, 385 start=line_type): 386 break 387 if up_to_one: 388 attempt_read_and_call(uhandle, event_fn, start=line_type)
389
390 - def _scan_id(self, uhandle, consumer):
391 self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)
392
393 - def _scan_ac(self, uhandle, consumer):
394 # Until release 38, this used to match exactly_one. 395 # However, in release 39, 1A02_HUMAN has 2 AC lines, and the 396 # definition needed to be expanded. 397 self._scan_line('AC', uhandle, consumer.accession, any_number=1)
398
399 - def _scan_dt(self, uhandle, consumer):
400 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 401 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 402 # IPI doesn't necessarily contain the third line about annotations 403 self._scan_line('DT', uhandle, consumer.date, up_to_one=1)
404
405 - def _scan_de(self, uhandle, consumer):
406 # IPI can be missing a DE line 407 self._scan_line('DE', uhandle, consumer.description, any_number=1)
408
409 - def _scan_gn(self, uhandle, consumer):
410 self._scan_line('GN', uhandle, consumer.gene_name, any_number=1)
411
412 - def _scan_os(self, uhandle, consumer):
413 self._scan_line('OS', uhandle, consumer.organism_species, 414 one_or_more=1)
415
416 - def _scan_og(self, uhandle, consumer):
417 self._scan_line('OG', uhandle, consumer.organelle, any_number=1)
418
419 - def _scan_oc(self, uhandle, consumer):
420 self._scan_line('OC', uhandle, consumer.organism_classification, 421 one_or_more=1)
422
423 - def _scan_ox(self, uhandle, consumer):
424 self._scan_line('OX', uhandle, consumer.taxonomy_id, 425 any_number=1)
426
427 - def _scan_oh(self, uhandle, consumer):
428 # viral host organism. introduced after SwissProt 39. 429 self._scan_line('OH', uhandle, consumer.organism_host, any_number=1)
430
431 - def _scan_reference(self, uhandle, consumer):
432 while 1: 433 if safe_peekline(uhandle)[:2] != 'RN': 434 break 435 self._scan_rn(uhandle, consumer) 436 self._scan_rp(uhandle, consumer) 437 self._scan_rc(uhandle, consumer) 438 self._scan_rx(uhandle, consumer) 439 # ws:2001-12-05 added, for record with RL before RA 440 self._scan_rl(uhandle, consumer) 441 self._scan_ra(uhandle, consumer) 442 #EBI copy of P72010 is missing the RT line, and has one 443 #of their ** lines in its place noting "** /NO TITLE." 444 #See also bug 2353 445 self._skip_starstar(uhandle) 446 self._scan_rt(uhandle, consumer) 447 self._scan_rl(uhandle, consumer)
448
449 - def _scan_rn(self, uhandle, consumer):
450 self._scan_line('RN', uhandle, consumer.reference_number, 451 exactly_one=1)
452
453 - def _scan_rp(self, uhandle, consumer):
454 self._scan_line('RP', uhandle, consumer.reference_position, 455 one_or_more=1)
456
457 - def _scan_rc(self, uhandle, consumer):
458 self._scan_line('RC', uhandle, consumer.reference_comment, 459 any_number=1)
460
461 - def _scan_rx(self, uhandle, consumer):
462 self._scan_line('RX', uhandle, consumer.reference_cross_reference, 463 any_number=1)
464
465 - def _scan_ra(self, uhandle, consumer):
466 # In UniProt release 1.12 of 6/21/04, there is a new RG 467 # (Reference Group) line, which references a group instead of 468 # an author. Each block must have at least 1 RA or RG line. 469 self._scan_line('RA', uhandle, consumer.reference_author, 470 any_number=1) 471 self._scan_line('RG', uhandle, consumer.reference_author, 472 any_number=1) 473 # PRKN_HUMAN has RG lines, then RA lines. The best solution 474 # is to write code that accepts either of the line types. 475 # This is the quick solution... 476 self._scan_line('RA', uhandle, consumer.reference_author, 477 any_number=1)
478
479 - def _scan_rt(self, uhandle, consumer):
480 self._scan_line('RT', uhandle, consumer.reference_title, 481 any_number=1)
482
483 - def _scan_rl(self, uhandle, consumer):
484 # This was one_or_more, but P82909 in TrEMBL 16.0 does not 485 # have one. 486 self._scan_line('RL', uhandle, consumer.reference_location, 487 any_number=1)
488
489 - def _scan_cc(self, uhandle, consumer):
490 self._scan_line('CC', uhandle, consumer.comment, any_number=1)
491
492 - def _scan_dr(self, uhandle, consumer):
493 self._scan_line('DR', uhandle, consumer.database_cross_reference, 494 any_number=1)
495
496 - def _scan_kw(self, uhandle, consumer):
497 self._scan_line('KW', uhandle, consumer.keyword, any_number=1)
498
499 - def _scan_ft(self, uhandle, consumer):
500 self._scan_line('FT', uhandle, consumer.feature_table, any_number=1)
501
502 - def _scan_pe(self, uhandle, consumer):
503 self._scan_line('PE', uhandle, consumer.protein_existence, any_number=1)
504
505 - def _scan_sq(self, uhandle, consumer):
506 self._scan_line('SQ', uhandle, consumer.sequence_header, exactly_one=1)
507
508 - def _scan_sequence_data(self, uhandle, consumer):
509 self._scan_line(' ', uhandle, consumer.sequence_data, one_or_more=1)
510
511 - def _scan_terminator(self, uhandle, consumer):
512 self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)
513 514 _scan_fns = [ 515 _scan_id, 516 _scan_ac, 517 _scan_dt, 518 _scan_de, 519 _scan_gn, 520 _scan_os, 521 _scan_og, 522 _scan_oc, 523 _scan_ox, 524 _scan_oh, 525 _scan_reference, 526 _scan_cc, 527 _scan_dr, 528 _scan_pe, 529 _scan_kw, 530 _scan_ft, 531 _scan_sq, 532 _scan_sequence_data, 533 _scan_terminator 534 ]
535
536 -class _RecordConsumer(AbstractConsumer):
537 """Consumer that converts a SwissProt record to a Record object. 538 539 Members: 540 data Record with SwissProt data. 541 542 """
543 - def __init__(self):
544 self.data = None
545
546 - def __repr__(self) :
547 return "Bio.SwissProt.SProt._RecordConsumer()"
548
549 - def start_record(self):
550 self.data = Record()
551
552 - def end_record(self):
553 self._clean_record(self.data)
554
555 - def identification(self, line):
556 cols = line.split() 557 #Prior to release 51, included with MoleculeType: 558 #ID EntryName DataClass; MoleculeType; SequenceLength. 559 # 560 #Newer files lack the MoleculeType: 561 #ID EntryName DataClass; SequenceLength. 562 # 563 #Note that cols is split on white space, so the length 564 #should become two fields (number and units) 565 if len(cols) == 6 : 566 self.data.entry_name = cols[1] 567 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 568 self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';' 569 self.data.sequence_length = int(cols[4]) 570 elif len(cols) == 5 : 571 self.data.entry_name = cols[1] 572 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 573 self.data.molecule_type = None 574 self.data.sequence_length = int(cols[3]) 575 else : 576 #Should we print a warning an continue? 577 raise SyntaxError("ID line has unrecognised format:\n"+line) 578 579 # data class can be 'STANDARD' or 'PRELIMINARY' 580 # ws:2001-12-05 added IPI 581 # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed' 582 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI', 583 'Reviewed', 'Unreviewed']: 584 raise SyntaxError, "Unrecognized data class %s in line\n%s" % \ 585 (self.data.data_class, line) 586 # molecule_type should be 'PRT' for PRoTein 587 # Note that has been removed in recent releases (set to None) 588 if self.data.molecule_type is not None \ 589 and self.data.molecule_type != 'PRT': 590 raise SyntaxError, "Unrecognized molecule type %s in line\n%s" % \ 591 (self.data.molecule_type, line)
592
593 - def accession(self, line):
594 cols = line[5:].rstrip(_CHOMP).split(';') 595 for ac in cols: 596 self.data.accessions.append(ac.lstrip())
597
598 - def date(self, line):
599 uprline = string.upper(line) 600 cols = line.rstrip().split() 601 602 if uprline.find('CREATED') >= 0 \ 603 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \ 604 or uprline.find('LAST ANNOTATION UPDATE') >= 0: 605 # Old style DT line 606 # ================= 607 # e.g. 608 # DT 01-FEB-1995 (Rel. 31, Created) 609 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 610 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 611 # 612 # or: 613 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 614 # ... 615 616 # find where the version information will be located 617 # This is needed for when you have cases like IPI where 618 # the release verison is in a different spot: 619 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 620 uprcols = uprline.split() 621 rel_index = -1 622 for index in range(len(uprcols)): 623 if uprcols[index].find("REL.") >= 0: 624 rel_index = index 625 assert rel_index >= 0, \ 626 "Could not find Rel. in DT line: %s" % (line) 627 version_index = rel_index + 1 628 # get the version information 629 str_version = cols[version_index].rstrip(_CHOMP) 630 # no version number 631 if str_version == '': 632 version = 0 633 # dot versioned 634 elif str_version.find(".") >= 0: 635 version = str_version 636 # integer versioned 637 else: 638 version = int(str_version) 639 640 if uprline.find('CREATED') >= 0: 641 self.data.created = cols[1], version 642 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 643 self.data.sequence_update = cols[1], version 644 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0: 645 self.data.annotation_update = cols[1], version 646 else: 647 assert False, "Shouldn't reach this line!" 648 elif uprline.find('INTEGRATED INTO') >= 0 \ 649 or uprline.find('SEQUENCE VERSION') >= 0 \ 650 or uprline.find('ENTRY VERSION') >= 0: 651 # New style DT line 652 # ================= 653 # As of UniProt Knowledgebase release 7.0 (including 654 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 655 # format of the DT lines and the version information 656 # in them was changed - the release number was dropped. 657 # 658 # For more information see bug 1948 and 659 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 660 # 661 # e.g. 662 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 663 # DT 15-OCT-2001, sequence version 3. 664 # DT 01-APR-2004, entry version 14. 665 # 666 #This is a new style DT line... 667 668 # The date should be in string cols[1] 669 # Get the version number if there is one. 670 # For the three DT lines above: 0, 3, 14 671 try: 672 version = int(cols[-1]) 673 except ValueError : 674 version = 0 675 676 # Re-use the historical property names, even though 677 # the meaning has changed slighty: 678 if uprline.find("INTEGRATED") >= 0: 679 self.data.created = cols[1], version 680 elif uprline.find('SEQUENCE VERSION') >= 0: 681 self.data.sequence_update = cols[1], version 682 elif uprline.find( 'ENTRY VERSION') >= 0: 683 self.data.annotation_update = cols[1], version 684 else: 685 assert False, "Shouldn't reach this line!" 686 else: 687 raise SyntaxError, "I don't understand the date line %s" % line
688
689 - def description(self, line):
690 self.data.description += line[5:]
691
692 - def gene_name(self, line):
693 self.data.gene_name += line[5:]
694
695 - def organism_species(self, line):
696 self.data.organism += line[5:]
697
698 - def organelle(self, line):
699 self.data.organelle += line[5:]
700
701 - def organism_classification(self, line):
702 line = line[5:].rstrip(_CHOMP) 703 cols = line.split(';') 704 for col in cols: 705 self.data.organism_classification.append(col.lstrip())
706
707 - def taxonomy_id(self, line):
708 # The OX line is in the format: 709 # OX DESCRIPTION=ID[, ID]...; 710 # If there are too many id's to fit onto a line, then the ID's 711 # continue directly onto the next line, e.g. 712 # OX DESCRIPTION=ID[, ID]... 713 # OX ID[, ID]...; 714 # Currently, the description is always "NCBI_TaxID". 715 716 # To parse this, I need to check to see whether I'm at the 717 # first line. If I am, grab the description and make sure 718 # it's an NCBI ID. Then, grab all the id's. 719 line = line[5:].rstrip(_CHOMP) 720 index = line.find('=') 721 if index >= 0: 722 descr = line[:index] 723 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 724 ids = line[index+1:].split(',') 725 else: 726 ids = line.split(',') 727 self.data.taxonomy_id.extend([id.strip() for id in ids])
728
729 - def organism_host(self, line):
730 # Line type OH (Organism Host) for viral hosts 731 # same code as in taxonomy_id() 732 line = line[5:].rstrip(_CHOMP) 733 index = line.find('=') 734 if index >= 0: 735 descr = line[:index] 736 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 737 ids = line[index+1:].split(',') 738 else: 739 ids = line.split(',') 740 self.data.host_organism.extend([id.strip() for id in ids])
741
742 - def reference_number(self, line):
743 rn = line[5:].rstrip() 744 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 745 ref = Reference() 746 ref.number = int(rn[1:-1]) 747 self.data.references.append(ref)
748
749 - def reference_position(self, line):
750 assert self.data.references, "RP: missing RN" 751 self.data.references[-1].positions.append(line[5:].rstrip())
752
753 - def reference_comment(self, line):
754 assert self.data.references, "RC: missing RN" 755 cols = line[5:].rstrip().split( ';') 756 ref = self.data.references[-1] 757 for col in cols: 758 if not col: # last column will be the empty string 759 continue 760 # The token is everything before the first '=' character. 761 index = col.find('=') 762 token, text = col[:index], col[index+1:] 763 # According to the spec, there should only be 1 '=' 764 # character. However, there are too many exceptions to 765 # handle, so we'll ease up and allow anything after the 766 # first '='. 767 #if col == ' STRAIN=TISSUE=BRAIN': 768 # # from CSP_MOUSE, release 38 769 # token, text = "TISSUE", "BRAIN" 770 #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485': 771 # # from NDOA_PSEPU, release 38 772 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485" 773 #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \ 774 # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987': 775 # # from NU3M_BALPH, release 38, release 39 776 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987" 777 #else: 778 # token, text = string.split(col, '=') 779 ref.comments.append((token.lstrip(), text))
780
781 - def reference_cross_reference(self, line):
782 assert self.data.references, "RX: missing RN" 783 # The basic (older?) RX line is of the form: 784 # RX MEDLINE; 85132727. 785 # but there are variants of this that need to be dealt with (see below) 786 787 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 788 # have extraneous information in the RX line. Check for 789 # this and chop it out of the line. 790 # (noticed by katel@worldpath.net) 791 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 792 if ind >= 0: 793 line = line[:ind] 794 795 # RX lines can also be used of the form 796 # RX PubMed=9603189; 797 # reported by edvard@farmasi.uit.no 798 # and these can be more complicated like: 799 # RX MEDLINE=95385798; PubMed=7656980; 800 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 801 # We look for these cases first and deal with them 802 if line.find("=") != -1: 803 cols = line[2:].split("; ") 804 cols = [x.strip() for x in cols] 805 cols = [x for x in cols if x] 806 for col in cols: 807 x = col.split("=") 808 assert len(x) == 2, "I don't understand RX line %s" % line 809 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 810 ref = self.data.references[-1].references 811 ref.append((key, value)) 812 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 813 else: 814 cols = line.split() 815 # normally we split into the three parts 816 assert len(cols) == 3, "I don't understand RX line %s" % line 817 self.data.references[-1].references.append( 818 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
819
820 - def reference_author(self, line):
821 assert self.data.references, "RA: missing RN" 822 ref = self.data.references[-1] 823 ref.authors = ref.authors + line[5:]
824
825 - def reference_title(self, line):
826 assert self.data.references, "RT: missing RN" 827 ref = self.data.references[-1] 828 ref.title = ref.title + line[5:]
829
830 - def reference_location(self, line):
831 assert self.data.references, "RL: missing RN" 832 ref = self.data.references[-1] 833 ref.location = ref.location + line[5:]
834
835 - def comment(self, line):
836 if line[5:8] == '-!-': # Make a new comment 837 self.data.comments.append(line[9:]) 838 elif line[5:8] == ' ': # add to the previous comment 839 if not self.data.comments: 840 # TCMO_STRGA in Release 37 has comment with no topic 841 self.data.comments.append(line[9:]) 842 else: 843 self.data.comments[-1] = self.data.comments[-1] + line[9:] 844 elif line[5:8] == '---': 845 # If there are no comments, and it's not the closing line, 846 # make a new comment. 847 if not self.data.comments or self.data.comments[-1][:3] != '---': 848 self.data.comments.append(line[5:]) 849 else: 850 self.data.comments[-1] = self.data.comments[-1] + line[5:] 851 else: # copyright notice 852 self.data.comments[-1] = self.data.comments[-1] + line[5:]
853
854 - def database_cross_reference(self, line):
855 # From CLD1_HUMAN, Release 39: 856 # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence] 857 # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai 858 # DR SWISS-2DPAGE; GET REGION ON 2D PAGE. 859 line = line[5:] 860 # Remove the comments at the end of the line 861 i = line.find('[') 862 if i >= 0: 863 line = line[:i] 864 cols = line.rstrip(_CHOMP).split(';') 865 cols = [col.lstrip() for col in cols] 866 self.data.cross_references.append(tuple(cols))
867
868 - def keyword(self, line):
869 cols = line[5:].rstrip(_CHOMP).split(';') 870 self.data.keywords.extend([c.lstrip() for c in cols])
871
872 - def feature_table(self, line):
873 line = line[5:] # get rid of junk in front 874 name = line[0:8].rstrip() 875 try: 876 from_res = int(line[9:15]) 877 except ValueError: 878 from_res = line[9:15].lstrip() 879 try: 880 to_res = int(line[16:22]) 881 except ValueError: 882 to_res = line[16:22].lstrip() 883 description = line[29:70].rstrip() 884 #if there is a feature_id (FTId), store it away 885 if line[29:35]==r"/FTId=": 886 ft_id = line[35:70].rstrip()[:-1] 887 else: 888 ft_id ="" 889 if not name: # is continuation of last one 890 assert not from_res and not to_res 891 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1] 892 del self.data.features[-1] 893 description = "%s %s" % (old_description, description) 894 895 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 896 if name == "VARSPLIC": 897 description = self._fix_varsplic_sequences(description) 898 self.data.features.append((name, from_res, to_res, description,ft_id))
899
900 - def _fix_varsplic_sequences(self, description):
901 """Remove unwanted spaces in sequences. 902 903 During line carryover, the sequences in VARSPLIC can get mangled 904 with unwanted spaces like: 905 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 906 We want to check for this case and correct it as it happens. 907 """ 908 descr_cols = description.split(" -> ") 909 if len(descr_cols) == 2: 910 first_seq = descr_cols[0] 911 second_seq = descr_cols[1] 912 extra_info = '' 913 # we might have more information at the end of the 914 # second sequence, which should be in parenthesis 915 extra_info_pos = second_seq.find(" (") 916 if extra_info_pos != -1: 917 extra_info = second_seq[extra_info_pos:] 918 second_seq = second_seq[:extra_info_pos] 919 920 # now clean spaces out of the first and second string 921 first_seq = first_seq.replace(" ", "") 922 second_seq = second_seq.replace(" ", "") 923 924 # reassemble the description 925 description = first_seq + " -> " + second_seq + extra_info 926 927 return description
928
929 - def protein_existence(self, line):
930 #TODO - Record this information? 931 pass
932
933 - def sequence_header(self, line):
934 cols = line.split() 935 assert len(cols) == 8, "I don't understand SQ line %s" % line 936 # Do more checking here? 937 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
938
939 - def sequence_data(self, line):
940 seq = line.replace(" ", "").rstrip() 941 self.data.sequence = self.data.sequence + seq
942
943 - def terminator(self, line):
944 pass
945 946 #def _clean(self, line, rstrip=1): 947 # if rstrip: 948 # return string.rstrip(line[5:]) 949 # return line[5:] 950
951 - def _clean_record(self, rec):
952 # Remove trailing newlines 953 members = ['description', 'gene_name', 'organism', 'organelle'] 954 for m in members: 955 attr = getattr(rec, m) 956 setattr(rec, m, attr.rstrip()) 957 for ref in rec.references: 958 self._clean_references(ref)
959
960 - def _clean_references(self, ref):
961 # Remove trailing newlines 962 members = ['authors', 'title', 'location'] 963 for m in members: 964 attr = getattr(ref, m) 965 setattr(ref, m, attr.rstrip())
966
967 -class _SequenceConsumer(AbstractConsumer):
968 """Consumer that converts a SwissProt record to a SeqRecord object. 969 970 Members: 971 data Record with SwissProt data. 972 alphabet The alphabet the generated Seq objects will have. 973 """ 974 #TODO - Cope with references as done for GenBank
975 - def __init__(self, alphabet = Alphabet.generic_protein):
976 """Initialize a Sequence Consumer 977 978 Arguments: 979 o alphabet - The alphabet to use for the generated Seq objects. If 980 not supplied this will default to the generic protein alphabet. 981 """ 982 self.data = None 983 self.alphabet = alphabet
984
985 - def start_record(self):
986 seq = Seq.Seq("", self.alphabet) 987 self.data = SeqRecord.SeqRecord(seq) 988 self.data.description = "" 989 self.data.name = ""
990
991 - def end_record(self):
992 self.data.description = self.data.description.rstrip()
993
994 - def identification(self, line):
995 cols = line.split() 996 self.data.name = cols[1]
997
998 - def accession(self, line):
999 #Note that files can and often do contain multiple AC lines. 1000 ids = line[5:].rstrip().split(';') 1001 1002 #Use the first as the ID, but record them ALL in the annotations 1003 try : 1004 self.data.annotations['accessions'].extend(ids) 1005 except KeyError : 1006 self.data.annotations['accessions'] = ids 1007 1008 #Use the FIRST accession as the ID, not the first on this line! 1009 self.data.id = self.data.annotations['accessions'][0]
1010 #self.data.id = ids[0] 1011
1012 - def description(self, line):
1013 self.data.description = self.data.description + \ 1014 line[5:].strip() + "\n"
1015
1016 - def sequence_data(self, line):
1017 seq = Seq.Seq(line.replace(" ", "").rstrip(), 1018 self.alphabet) 1019 self.data.seq = self.data.seq + seq
1020
1021 - def gene_name(self, line):
1022 #We already store the identification/accession as the records name/id 1023 try : 1024 self.data.annotations['gene_name'] += line[5:] 1025 except KeyError : 1026 self.data.annotations['gene_name'] = line[5:]
1027
1028 - def comment(self, line):
1029 #Try and agree with SeqRecord convention from the GenBank parser, 1030 #which stores the comments as a long string with newlines 1031 #with key 'comment' 1032 try : 1033 self.data.annotations['comment'] += "\n" + line[5:] 1034 except KeyError : 1035 self.data.annotations['comment'] = line[5:]
1036 #TODO - Follow SwissProt conventions more closely? 1037
1038 - def date(self, line):
1039 date_str = line.split()[0] 1040 uprline = string.upper(line) 1041 if uprline.find('CREATED') >= 0 : 1042 #Try and agree with SeqRecord convention from the GenBank parser, 1043 #which stores the submitted date as 'date' 1044 self.data.annotations['date'] = date_str 1045 elif uprline.find('LAST SEQUENCE UPDATE') >= 0 : 1046 #There is no existing convention from the GenBank SeqRecord parser 1047 self.data.annotations['date_last_sequence_update'] = date_str 1048 elif uprline.find('LAST ANNOTATION UPDATE') >= 0: 1049 #There is no existing convention from the GenBank SeqRecord parser 1050 self.data.annotations['date_last_annotation_update'] = date_str
1051
1052 - def keyword(self, line):
1053 #Try and agree with SeqRecord convention from the GenBank parser, 1054 #which stores a list as 'keywords' 1055 cols = line[5:].rstrip(_CHOMP).split(';') 1056 cols = [c.strip() for c in cols] 1057 cols = filter(None, cols) 1058 try : 1059 #Extend any existing list of keywords 1060 self.data.annotations['keywords'].extend(cols) 1061 except KeyError : 1062 #Create the list of keywords 1063 self.data.annotations['keywords'] = cols
1064
1065 - def organism_species(self, line):
1066 #Try and agree with SeqRecord convention from the GenBank parser, 1067 #which stores the organism as a string with key 'organism' 1068 data = line[5:].rstrip(_CHOMP) 1069 try : 1070 #Append to any existing data split over multiple lines 1071 self.data.annotations['organism'] += " " + data 1072 except KeyError: 1073 self.data.annotations['organism'] = data
1074
1075 - def organism_host(self, line):
1076 #There is no SeqRecord convention from the GenBank parser, 1077 #based on how it deals with taxonomy ids (list of strings) 1078 data = line[5:].rstrip(_CHOMP) 1079 index = data.find('=') 1080 if index >= 0: 1081 descr = data[:index] 1082 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1083 ids = data[index+1:].split(',') 1084 else: 1085 ids = data.split(',') 1086 1087 try : 1088 #Append to any existing data 1089 self.data.annotations['organism_host'].extend(ids) 1090 except KeyError: 1091 self.data.annotations['organism_host'] = ids
1092
1093 - def taxonomy_id(self, line):
1094 #Try and agree with SeqRecord convention from the GenBank parser, 1095 #which stores these as a list of strings with key 'taxonomy' 1096 1097 line = line[5:].rstrip(_CHOMP) 1098 index = line.find('=') 1099 if index >= 0: 1100 descr = line[:index] 1101 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1102 ids = line[index+1:].split(',') 1103 else: 1104 ids = line.split(',') 1105 1106 try : 1107 #Append to any existing data 1108 self.data.annotations['taxonomy'].extend(ids) 1109 except KeyError: 1110 self.data.annotations['taxonomy'] = ids
1111
1112 -def index_file(filename, indexname, rec2key=None):
1113 """index_file(filename, indexname, rec2key=None) 1114 1115 Index a SwissProt file. filename is the name of the file. 1116 indexname is the name of the dictionary. rec2key is an 1117 optional callback that takes a Record and generates a unique key 1118 (e.g. the accession number) for the record. If not specified, 1119 the entry name will be used. 1120 1121 """ 1122 if not os.path.exists(filename): 1123 raise ValueError, "%s does not exist" % filename 1124 1125 index = Index.Index(indexname, truncate=1) 1126 index[Dictionary._Dictionary__filename_key] = filename 1127 1128 iter = Iterator(open(filename), parser=RecordParser()) 1129 while 1: 1130 start = iter._uhandle.tell() 1131 rec = iter.next() 1132 length = iter._uhandle.tell() - start 1133 1134 if rec is None: 1135 break 1136 if rec2key is not None: 1137 key = rec2key(rec) 1138 else: 1139 key = rec.entry_name 1140 1141 if not key: 1142 raise KeyError, "empty sequence key was produced" 1143 elif index.has_key(key): 1144 raise KeyError, "duplicate key %s found" % key 1145 1146 index[key] = start, length
1147 1148 if __name__ == "__main__" : 1149 print "Quick self test..." 1150 1151 example_filename = "../../Tests/SwissProt/sp008" 1152 1153 import os 1154 if not os.path.isfile(example_filename) : 1155 print "Missing test file %s" % example_filename 1156 else : 1157 #Try parsing it! 1158 1159 handle = open(example_filename) 1160 for record in Iterator(handle, RecordParser()) : 1161 print record.entry_name 1162 print ",".join(record.accessions) 1163 print record.keywords 1164 print repr(record.organism) 1165 print record.sequence[:20] + "..." 1166 handle.close() 1167 1168 handle = open(example_filename) 1169 for record in Iterator(handle, SequenceParser()) : 1170 print record.name 1171 print record.id 1172 print record.annotations['keywords'] 1173 print repr(record.annotations['organism']) 1174 print record.seq.tostring()[:20] + "..." 1175 handle.close() 1176