Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """This module provides code to work with the BLAST XML output 
  8  following the DTD available on the NCBI FTP 
  9  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 10   
 11  Classes: 
 12  BlastParser         Parses XML output from BLAST. 
 13                      This (now) returns a list of Blast records. 
 14                      Historically it returned a single Blast record. 
 15   
 16  _XMLParser          Generic SAX parser. 
 17   
 18  Functions: 
 19  parse               Incremental parser, this is an iterator that returns 
 20                      Blast records. 
 21  """ 
 22  from Bio.Blast import Record 
 23  import xml.sax 
 24  from xml.sax.handler import ContentHandler 
 25   
26 -class _XMLparser(ContentHandler):
27 """Generic SAX Parser 28 29 Just a very basic SAX parser. 30 31 Redefine the methods startElement, characters and endElement. 32 """
33 - def __init__(self, debug=0):
34 """Constructor 35 36 debug - integer, amount of debug information to print 37 """ 38 self._tag = [] 39 self._value = '' 40 self._debug = debug 41 self._debug_ignore_list = []
42
43 - def _secure_name(self, name):
44 """Removes 'dangerous' from tag names 45 46 name -- name to be 'secured' 47 """ 48 # Replace '-' with '_' in XML tag names 49 return name.replace('-', '_')
50
51 - def startElement(self, name, attr):
52 """Found XML start tag 53 54 No real need of attr, BLAST DTD doesn't use them 55 56 name -- name of the tag 57 58 attr -- tag attributes 59 """ 60 self._tag.append(name) 61 62 # Try to call a method (defined in subclasses) 63 method = self._secure_name('_start_' + name) 64 65 #Note could use try / except AttributeError 66 #BUT I found often triggered by nested errors... 67 if hasattr(self, method) : 68 eval("self.%s()" % method) 69 if self._debug > 4 : 70 print "NCBIXML: Parsed: " + method 71 else : 72 # Doesn't exist (yet) 73 if method not in self._debug_ignore_list : 74 if self._debug > 3 : 75 print "NCBIXML: Ignored: " + method 76 self._debug_ignore_list.append(method)
77
78 - def characters(self, ch):
79 """Found some text 80 81 ch -- characters read 82 """ 83 self._value += ch # You don't ever get the whole string
84
85 - def endElement(self, name):
86 """Found XML end tag 87 88 name -- tag name 89 """ 90 # Strip character buffer 91 self._value = self._value.strip() 92 93 # Try to call a method (defined in subclasses) 94 method = self._secure_name('_end_' + name) 95 #Note could use try / except AttributeError 96 #BUT I found often triggered by nested errors... 97 if hasattr(self, method) : 98 eval("self.%s()" % method) 99 if self._debug > 2 : 100 print "NCBIXML: Parsed: " + method, self._value 101 else : 102 # Doesn't exist (yet) 103 if method not in self._debug_ignore_list : 104 if self._debug > 1 : 105 print "NCBIXML: Ignored: " + method, self._value 106 self._debug_ignore_list.append(method) 107 108 # Reset character buffer 109 self._value = ''
110
111 -class BlastParser(_XMLparser):
112 """Parse XML BLAST data into a Record.Blast object 113 114 Methods: 115 parse Parses BLAST XML data. 116 Returns a list of Blast records 117 118 All XML 'action' methods are private methods and may be: 119 _start_TAG called when the start tag is found 120 _end_TAG called when the end tag is found 121 """ 122
123 - def __init__(self, debug=0):
124 """Constructor 125 126 debug - integer, amount of debug information to print 127 """ 128 # Calling superclass method 129 _XMLparser.__init__(self, debug) 130 131 self._parser = xml.sax.make_parser() 132 self._parser.setContentHandler(self) 133 134 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 135 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 136 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 137 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 138 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 139 140 self.reset()
141
142 - def reset(self) :
143 """Reset all the data allowing reuse of the BlastParser() object""" 144 self._records = [] 145 self._header = Record.Header() 146 self._parameters = Record.Parameters() 147 self._parameters.filter = None #Maybe I should update the class?
148
149 - def parse(self, handler):
150 """Parses the XML data 151 152 handler -- file handler or StringIO 153 154 This method returns a list of Blast record objects. 155 """ 156 self.reset() 157 self._parser.parse(handler) 158 return self._records
159
160 - def _start_Iteration(self):
161 self._blast = Record.Blast() 162 pass
163
164 - def _end_Iteration(self):
165 # We stored a lot of generic "top level" information 166 # in self._header (an object of type Record.Header) 167 self._blast.reference = self._header.reference 168 self._blast.date = self._header.date 169 self._blast.version = self._header.version 170 self._blast.database = self._header.database 171 self._blast.application = self._header.application 172 173 # These are required for "old" pre 2.2.14 files 174 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 175 # and <BlastOutput_query-len> were used. Now they 176 # are suplemented/replaced by <Iteration_query-ID>, 177 # <Iteration_query-def> and <Iteration_query-len> 178 if not hasattr(self._blast, "query") \ 179 or not self._blast.query : 180 self._blast.query = self._header.query 181 if not hasattr(self._blast, "query_id") \ 182 or not self._blast.query_id : 183 self._blast.query_id = self._header.query_id 184 if not hasattr(self._blast, "query_letters") \ 185 or not self._blast.query_letters : 186 self._blast.query_letters = self._header.query_letters 187 188 # Apply the "top level" parameter information 189 self._blast.matrix = self._parameters.matrix 190 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 191 self._blast.gap_penalties = self._parameters.gap_penalties 192 self._blast.filter = self._parameters.filter 193 self._blast.expect = self._parameters.expect 194 195 #Add to the list 196 self._records.append(self._blast) 197 #Clear the object (a new empty one is create in _start_Iteration) 198 self._blast = None 199 200 if self._debug : "NCBIXML: Added Blast record to results"
201 202 # Header
203 - def _end_BlastOutput_program(self):
204 """BLAST program, e.g., blastp, blastn, etc. 205 206 Save this to put on each blast record object 207 """ 208 self._header.application = self._value.upper()
209
210 - def _end_BlastOutput_version(self):
211 """version number of the BLAST engine (e.g., 2.1.2) 212 213 Save this to put on each blast record object 214 """ 215 self._header.version = self._value.split()[1] 216 self._header.date = self._value.split()[2][1:-1]
217
219 """a reference to the article describing the algorithm 220 221 Save this to put on each blast record object 222 """ 223 self._header.reference = self._value
224
225 - def _end_BlastOutput_db(self):
226 """the database(s) searched 227 228 Save this to put on each blast record object 229 """ 230 self._header.database = self._value
231
233 """the identifier of the query 234 235 Important in old pre 2.2.14 BLAST, for recent versions 236 <Iteration_query-ID> is enough 237 """ 238 self._header.query_id = self._value
239
241 """the definition line of the query 242 243 Important in old pre 2.2.14 BLAST, for recent versions 244 <Iteration_query-def> is enough 245 """ 246 self._header.query = self._value
247
249 """the length of the query 250 251 Important in old pre 2.2.14 BLAST, for recent versions 252 <Iteration_query-len> is enough 253 """ 254 self._header.query_letters = int(self._value)
255
256 - def _end_Iteration_query_ID(self):
257 """the identifier of the query 258 """ 259 self._blast.query_id = self._value
260
261 - def _end_Iteration_query_def(self):
262 """the definition line of the query 263 """ 264 self._blast.query = self._value
265
266 - def _end_Iteration_query_len(self):
267 """the length of the query 268 """ 269 self._blast.query_letters = int(self._value)
270 271 ## def _end_BlastOutput_query_seq(self): 272 ## """the query sequence 273 ## """ 274 ## pass # XXX Missing in Record.Blast ? 275 276 ## def _end_BlastOutput_iter_num(self): 277 ## """the psi-blast iteration number 278 ## """ 279 ## pass # XXX TODO PSI 280
281 - def _end_BlastOutput_hits(self):
282 """hits to the database sequences, one for every sequence 283 """ 284 self._blast.num_hits = int(self._value)
285 286 ## def _end_BlastOutput_message(self): 287 ## """error messages 288 ## """ 289 ## pass # XXX What to do ? 290 291 # Parameters
292 - def _end_Parameters_matrix(self):
293 """matrix used (-M) 294 """ 295 self._parameters.matrix = self._value
296
297 - def _end_Parameters_expect(self):
298 """expect values cutoff (-e) 299 """ 300 # NOTE: In old text output there was a line: 301 # Number of sequences better than 1.0e-004: 1 302 # As far as I can see, parameters.num_seqs_better_e 303 # would take the value of 1, and the expectation 304 # value was not recorded. 305 # 306 # Anyway we should NOT record this against num_seqs_better_e 307 self._parameters.expect = self._value
308 309 ## def _end_Parameters_include(self): 310 ## """inclusion threshold for a psi-blast iteration (-h) 311 ## """ 312 ## pass # XXX TODO PSI 313
314 - def _end_Parameters_sc_match(self):
315 """match score for nucleotide-nucleotide comparaison (-r) 316 """ 317 self._parameters.sc_match = int(self._value)
318
320 """mismatch penalty for nucleotide-nucleotide comparaison (-r) 321 """ 322 self._parameters.sc_mismatch = int(self._value)
323
324 - def _end_Parameters_gap_open(self):
325 """gap existence cost (-G) 326 """ 327 self._parameters.gap_penalties = int(self._value)
328
330 """gap extension cose (-E) 331 """ 332 self._parameters.gap_penalties = (self._parameters.gap_penalties, 333 int(self._value))
334
335 - def _end_Parameters_filter(self):
336 """filtering options (-F) 337 """ 338 self._parameters.filter = self._value
339 340 ## def _end_Parameters_pattern(self): 341 ## """pattern used for phi-blast search 342 ## """ 343 ## pass # XXX TODO PSI 344 345 ## def _end_Parameters_entrez_query(self): 346 ## """entrez query used to limit search 347 ## """ 348 ## pass # XXX TODO PSI 349 350 # Hits
351 - def _start_Hit(self):
352 self._blast.alignments.append(Record.Alignment()) 353 self._blast.descriptions.append(Record.Description()) 354 self._blast.multiple_alignment = [] 355 self._hit = self._blast.alignments[-1] 356 self._descr = self._blast.descriptions[-1] 357 self._descr.num_alignments = 0
358
359 - def _end_Hit(self):
360 #Cleanup 361 self._blast.multiple_alignment = None 362 self._hit = None 363 self._descr = None
364
365 - def _end_Hit_id(self):
366 """identifier of the database sequence 367 """ 368 self._hit.hit_id = self._value 369 self._hit.title = self._value + ' '
370
371 - def _end_Hit_def(self):
372 """definition line of the database sequence 373 """ 374 self._hit.hit_def = self._value 375 self._hit.title += self._value 376 self._descr.title = self._hit.title
377
378 - def _end_Hit_accession(self):
379 """accession of the database sequence 380 """ 381 self._hit.accession = self._value 382 self._descr.accession = self._value
383
384 - def _end_Hit_len(self):
385 self._hit.length = int(self._value)
386 387 # HSPs
388 - def _start_Hsp(self):
389 #Note that self._start_Hit() should have been called 390 #to setup things like self._blast.multiple_alignment 391 self._hit.hsps.append(Record.HSP()) 392 self._hsp = self._hit.hsps[-1] 393 self._descr.num_alignments += 1 394 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 395 self._mult_al = self._blast.multiple_alignment[-1]
396 397 # Hsp_num is useless
398 - def _end_Hsp_score(self):
399 """raw score of HSP 400 """ 401 self._hsp.score = float(self._value) 402 if self._descr.score == None: 403 self._descr.score = float(self._value)
404
405 - def _end_Hsp_bit_score(self):
406 """bit score of HSP 407 """ 408 self._hsp.bits = float(self._value)
409 #if self._descr.bits == None: 410 # self._descr.bits = float(self._value) 411
412 - def _end_Hsp_evalue(self):
413 """expect value value of the HSP 414 """ 415 self._hsp.expect = float(self._value) 416 if self._descr.e == None: 417 self._descr.e = float(self._value)
418
419 - def _end_Hsp_query_from(self):
420 """offset of query at the start of the alignment (one-offset) 421 """ 422 self._hsp.query_start = int(self._value)
423
424 - def _end_Hsp_query_to(self):
425 """offset of query at the end of the alignment (one-offset) 426 """ 427 self._hsp.query_end = int(self._value)
428
429 - def _end_Hsp_hit_from(self):
430 """offset of the database at the start of the alignment (one-offset) 431 """ 432 self._hsp.sbjct_start = int(self._value)
433
434 - def _end_Hsp_hit_to(self):
435 """offset of the database at the end of the alignment (one-offset) 436 """ 437 self._hsp.sbjct_end = int(self._value)
438 439 ## def _end_Hsp_pattern_from(self): 440 ## """start of phi-blast pattern on the query (one-offset) 441 ## """ 442 ## pass # XXX TODO PSI 443 444 ## def _end_Hsp_pattern_to(self): 445 ## """end of phi-blast pattern on the query (one-offset) 446 ## """ 447 ## pass # XXX TODO PSI 448
449 - def _end_Hsp_query_frame(self):
450 """frame of the query if applicable 451 """ 452 self._hsp.frame = (int(self._value),)
453
454 - def _end_Hsp_hit_frame(self):
455 """frame of the database sequence if applicable 456 """ 457 self._hsp.frame += (int(self._value),)
458
459 - def _end_Hsp_identity(self):
460 """number of identities in the alignment 461 """ 462 self._hsp.identities = int(self._value)
463
464 - def _end_Hsp_positive(self):
465 """number of positive (conservative) substitutions in the alignment 466 """ 467 self._hsp.positives = int(self._value)
468
469 - def _end_Hsp_gaps(self):
470 """number of gaps in the alignment 471 """ 472 self._hsp.gaps = int(self._value)
473
474 - def _end_Hsp_align_len(self):
475 """length of the alignment 476 """ 477 self._hsp.align_length = int(self._value)
478 479 ## def _en_Hsp_density(self): 480 ## """score density 481 ## """ 482 ## pass # XXX ??? 483
484 - def _end_Hsp_qseq(self):
485 """alignment string for the query 486 """ 487 self._hsp.query = self._value
488
489 - def _end_Hsp_hseq(self):
490 """alignment string for the database 491 """ 492 self._hsp.sbjct = self._value
493
494 - def _end_Hsp_midline(self):
495 """Formatting middle line as normally seen in BLAST report 496 """ 497 self._hsp.match = self._value
498 499 # Statistics
500 - def _end_Statistics_db_num(self):
501 """number of sequences in the database 502 """ 503 self._blast.num_sequences_in_database = int(self._value)
504
505 - def _end_Statistics_db_len(self):
506 """number of letters in the database 507 """ 508 self._blast._num_letters_in_database = int(self._value)
509
510 - def _end_Statistics_hsp_len(self):
511 """the effective HSP length 512 """ 513 self._blast.effective_hsp_length = int(self._value)
514
516 """the effective search space 517 """ 518 self._blast.effective_search_space = float(self._value)
519
520 - def _end_Statistics_kappa(self):
521 """Karlin-Altschul parameter K 522 """ 523 self._blast.ka_params = float(self._value)
524
525 - def _end_Statistics_lambda(self):
526 """Karlin-Altschul parameter Lambda 527 """ 528 self._blast.ka_params = (float(self._value), 529 self._blast.ka_params)
530
531 - def _end_Statistics_entropy(self):
532 """Karlin-Altschul parameter H 533 """ 534 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
535
536 -def parse(handle, debug=0):
537 """Returns an iterator a Blast record for each query. 538 539 handle - file handle to and XML file to parse 540 debug - integer, amount of debug information to print 541 542 This is a generator function that returns multiple Blast records 543 objects - one for each query sequence given to blast. The file 544 is read incrementally, returning complete records as they are read 545 in. 546 547 Should cope with new BLAST 2.2.14+ which gives a single XML file 548 for mutliple query records. 549 550 Should also cope with XML output from older versions BLAST which 551 gave multiple XML files concatenated together (giving a single file 552 which strictly speaking wasn't valid XML).""" 553 from xml.parsers import expat 554 BLOCK = 1024 555 MARGIN = 10 # must be at least length of newline + XML start 556 XML_START = "<?xml" 557 558 text = handle.read(BLOCK) 559 pending = "" 560 561 if not text : 562 #NO DATA FOUND! 563 raise ValueError("Your XML file was empty") 564 565 while text : 566 #We are now starting a new XML file 567 if not text.startswith(XML_START) : 568 raise SyntaxError("Your XML file did not start <?xml...") 569 570 expat_parser = expat.ParserCreate() 571 blast_parser = BlastParser(debug) 572 expat_parser.StartElementHandler = blast_parser.startElement 573 expat_parser.EndElementHandler = blast_parser.endElement 574 expat_parser.CharacterDataHandler = blast_parser.characters 575 576 expat_parser.Parse(text, False) 577 while blast_parser._records: 578 record = blast_parser._records[0] 579 blast_parser._records = blast_parser._records[1:] 580 yield record 581 582 while True : 583 #Read in another block of the file... 584 text, pending = pending + handle.read(BLOCK), "" 585 if not text: 586 #End of the file! 587 expat_parser.Parse("", True) # End of XML record 588 break 589 590 #Now read a little bit more so we can check for the 591 #start of another XML file... 592 pending = handle.read(MARGIN) 593 594 if (text+pending).find("\n" + XML_START) == -1 : 595 # Good - still dealing with the same XML file 596 expat_parser.Parse(text, False) 597 while blast_parser._records: 598 record = blast_parser._records[0] 599 blast_parser._records = blast_parser._records[1:] 600 yield record 601 else : 602 # This is output from pre 2.2.14 BLAST, 603 # one XML file for each query! 604 605 # Finish the old file: 606 text, pending = (text+pending).split("\n" + XML_START,1) 607 pending = XML_START + pending 608 609 expat_parser.Parse(text, True) # End of XML record 610 while blast_parser._records: 611 record = blast_parser._records[0] 612 blast_parser._records = blast_parser._records[1:] 613 yield record 614 615 #Now we are going to re-loop, reset the 616 #parsers and start reading the next XML file 617 text, pending = pending, "" 618 break 619 620 #At this point we have finished the first XML record. 621 #If the file is from an old version of blast, it may 622 #contain more XML records (check if text==""). 623 assert pending=="" 624 assert len(blast_parser._records) == 0 625 626 #We should have finished the file! 627 assert text=="" 628 assert pending=="" 629 assert len(blast_parser._records) == 0
630 631 if __name__ == '__main__': 632 import sys 633 import os 634 p = BlastParser() 635 r_list = p.parse(sys.argv[1]) 636 637 for r in r_list : 638 # Small test 639 print 'Blast of', r.query 640 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments), 641 reduce(lambda a,b: a+b, 642 [len(a.hsps) for a in r.alignments])) 643 644 for al in r.alignments: 645 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs' 646 647 # Cookbook example 648 E_VALUE_THRESH = 0.04 649 for alignment in r.alignments: 650 for hsp in alignment.hsps: 651 if hsp.expect < E_VALUE_THRESH: 652 print '*****' 653 print 'sequence', alignment.title 654 print 'length', alignment.length 655 print 'e value', hsp.expect 656 print hsp.query[:75] + '...' 657 print hsp.match[:75] + '...' 658 print hsp.sbjct[:75] + '...' 659