1
2
3
4
5
6
7 """This module provides code to work with the BLAST XML output
8 following the DTD available on the NCBI FTP
9 ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
10
11 Classes:
12 BlastParser Parses XML output from BLAST.
13 This (now) returns a list of Blast records.
14 Historically it returned a single Blast record.
15
16 _XMLParser Generic SAX parser.
17
18 Functions:
19 parse Incremental parser, this is an iterator that returns
20 Blast records.
21 """
22 from Bio.Blast import Record
23 import xml.sax
24 from xml.sax.handler import ContentHandler
25
27 """Generic SAX Parser
28
29 Just a very basic SAX parser.
30
31 Redefine the methods startElement, characters and endElement.
32 """
34 """Constructor
35
36 debug - integer, amount of debug information to print
37 """
38 self._tag = []
39 self._value = ''
40 self._debug = debug
41 self._debug_ignore_list = []
42
44 """Removes 'dangerous' from tag names
45
46 name -- name to be 'secured'
47 """
48
49 return name.replace('-', '_')
50
52 """Found XML start tag
53
54 No real need of attr, BLAST DTD doesn't use them
55
56 name -- name of the tag
57
58 attr -- tag attributes
59 """
60 self._tag.append(name)
61
62
63 method = self._secure_name('_start_' + name)
64
65
66
67 if hasattr(self, method) :
68 eval("self.%s()" % method)
69 if self._debug > 4 :
70 print "NCBIXML: Parsed: " + method
71 else :
72
73 if method not in self._debug_ignore_list :
74 if self._debug > 3 :
75 print "NCBIXML: Ignored: " + method
76 self._debug_ignore_list.append(method)
77
79 """Found some text
80
81 ch -- characters read
82 """
83 self._value += ch
84
86 """Found XML end tag
87
88 name -- tag name
89 """
90
91 self._value = self._value.strip()
92
93
94 method = self._secure_name('_end_' + name)
95
96
97 if hasattr(self, method) :
98 eval("self.%s()" % method)
99 if self._debug > 2 :
100 print "NCBIXML: Parsed: " + method, self._value
101 else :
102
103 if method not in self._debug_ignore_list :
104 if self._debug > 1 :
105 print "NCBIXML: Ignored: " + method, self._value
106 self._debug_ignore_list.append(method)
107
108
109 self._value = ''
110
112 """Parse XML BLAST data into a Record.Blast object
113
114 Methods:
115 parse Parses BLAST XML data.
116 Returns a list of Blast records
117
118 All XML 'action' methods are private methods and may be:
119 _start_TAG called when the start tag is found
120 _end_TAG called when the end tag is found
121 """
122
124 """Constructor
125
126 debug - integer, amount of debug information to print
127 """
128
129 _XMLparser.__init__(self, debug)
130
131 self._parser = xml.sax.make_parser()
132 self._parser.setContentHandler(self)
133
134
135 self._parser.setFeature(xml.sax.handler.feature_validation, 0)
136 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0)
137 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0)
138 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0)
139
140 self.reset()
141
143 """Reset all the data allowing reuse of the BlastParser() object"""
144 self._records = []
145 self._header = Record.Header()
146 self._parameters = Record.Parameters()
147 self._parameters.filter = None
148
149 - def parse(self, handler):
150 """Parses the XML data
151
152 handler -- file handler or StringIO
153
154 This method returns a list of Blast record objects.
155 """
156 self.reset()
157 self._parser.parse(handler)
158 return self._records
159
163
165
166
167 self._blast.reference = self._header.reference
168 self._blast.date = self._header.date
169 self._blast.version = self._header.version
170 self._blast.database = self._header.database
171 self._blast.application = self._header.application
172
173
174
175
176
177
178 if not hasattr(self._blast, "query") \
179 or not self._blast.query :
180 self._blast.query = self._header.query
181 if not hasattr(self._blast, "query_id") \
182 or not self._blast.query_id :
183 self._blast.query_id = self._header.query_id
184 if not hasattr(self._blast, "query_letters") \
185 or not self._blast.query_letters :
186 self._blast.query_letters = self._header.query_letters
187
188
189 self._blast.matrix = self._parameters.matrix
190 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e
191 self._blast.gap_penalties = self._parameters.gap_penalties
192 self._blast.filter = self._parameters.filter
193 self._blast.expect = self._parameters.expect
194
195
196 self._records.append(self._blast)
197
198 self._blast = None
199
200 if self._debug : "NCBIXML: Added Blast record to results"
201
202
204 """BLAST program, e.g., blastp, blastn, etc.
205
206 Save this to put on each blast record object
207 """
208 self._header.application = self._value.upper()
209
211 """version number of the BLAST engine (e.g., 2.1.2)
212
213 Save this to put on each blast record object
214 """
215 self._header.version = self._value.split()[1]
216 self._header.date = self._value.split()[2][1:-1]
217
219 """a reference to the article describing the algorithm
220
221 Save this to put on each blast record object
222 """
223 self._header.reference = self._value
224
226 """the database(s) searched
227
228 Save this to put on each blast record object
229 """
230 self._header.database = self._value
231
233 """the identifier of the query
234
235 Important in old pre 2.2.14 BLAST, for recent versions
236 <Iteration_query-ID> is enough
237 """
238 self._header.query_id = self._value
239
241 """the definition line of the query
242
243 Important in old pre 2.2.14 BLAST, for recent versions
244 <Iteration_query-def> is enough
245 """
246 self._header.query = self._value
247
249 """the length of the query
250
251 Important in old pre 2.2.14 BLAST, for recent versions
252 <Iteration_query-len> is enough
253 """
254 self._header.query_letters = int(self._value)
255
257 """the identifier of the query
258 """
259 self._blast.query_id = self._value
260
262 """the definition line of the query
263 """
264 self._blast.query = self._value
265
267 """the length of the query
268 """
269 self._blast.query_letters = int(self._value)
270
271
272
273
274
275
276
277
278
279
280
282 """hits to the database sequences, one for every sequence
283 """
284 self._blast.num_hits = int(self._value)
285
286
287
288
289
290
291
293 """matrix used (-M)
294 """
295 self._parameters.matrix = self._value
296
298 """expect values cutoff (-e)
299 """
300
301
302
303
304
305
306
307 self._parameters.expect = self._value
308
309
310
311
312
313
315 """match score for nucleotide-nucleotide comparaison (-r)
316 """
317 self._parameters.sc_match = int(self._value)
318
320 """mismatch penalty for nucleotide-nucleotide comparaison (-r)
321 """
322 self._parameters.sc_mismatch = int(self._value)
323
325 """gap existence cost (-G)
326 """
327 self._parameters.gap_penalties = int(self._value)
328
330 """gap extension cose (-E)
331 """
332 self._parameters.gap_penalties = (self._parameters.gap_penalties,
333 int(self._value))
334
336 """filtering options (-F)
337 """
338 self._parameters.filter = self._value
339
340
341
342
343
344
345
346
347
348
349
350
352 self._blast.alignments.append(Record.Alignment())
353 self._blast.descriptions.append(Record.Description())
354 self._blast.multiple_alignment = []
355 self._hit = self._blast.alignments[-1]
356 self._descr = self._blast.descriptions[-1]
357 self._descr.num_alignments = 0
358
360
361 self._blast.multiple_alignment = None
362 self._hit = None
363 self._descr = None
364
366 """identifier of the database sequence
367 """
368 self._hit.hit_id = self._value
369 self._hit.title = self._value + ' '
370
372 """definition line of the database sequence
373 """
374 self._hit.hit_def = self._value
375 self._hit.title += self._value
376 self._descr.title = self._hit.title
377
379 """accession of the database sequence
380 """
381 self._hit.accession = self._value
382 self._descr.accession = self._value
383
385 self._hit.length = int(self._value)
386
387
389
390
391 self._hit.hsps.append(Record.HSP())
392 self._hsp = self._hit.hsps[-1]
393 self._descr.num_alignments += 1
394 self._blast.multiple_alignment.append(Record.MultipleAlignment())
395 self._mult_al = self._blast.multiple_alignment[-1]
396
397
399 """raw score of HSP
400 """
401 self._hsp.score = float(self._value)
402 if self._descr.score == None:
403 self._descr.score = float(self._value)
404
406 """bit score of HSP
407 """
408 self._hsp.bits = float(self._value)
409
410
411
413 """expect value value of the HSP
414 """
415 self._hsp.expect = float(self._value)
416 if self._descr.e == None:
417 self._descr.e = float(self._value)
418
420 """offset of query at the start of the alignment (one-offset)
421 """
422 self._hsp.query_start = int(self._value)
423
425 """offset of query at the end of the alignment (one-offset)
426 """
427 self._hsp.query_end = int(self._value)
428
430 """offset of the database at the start of the alignment (one-offset)
431 """
432 self._hsp.sbjct_start = int(self._value)
433
435 """offset of the database at the end of the alignment (one-offset)
436 """
437 self._hsp.sbjct_end = int(self._value)
438
439
440
441
442
443
444
445
446
447
448
450 """frame of the query if applicable
451 """
452 self._hsp.frame = (int(self._value),)
453
455 """frame of the database sequence if applicable
456 """
457 self._hsp.frame += (int(self._value),)
458
460 """number of identities in the alignment
461 """
462 self._hsp.identities = int(self._value)
463
465 """number of positive (conservative) substitutions in the alignment
466 """
467 self._hsp.positives = int(self._value)
468
470 """number of gaps in the alignment
471 """
472 self._hsp.gaps = int(self._value)
473
475 """length of the alignment
476 """
477 self._hsp.align_length = int(self._value)
478
479
480
481
482
483
485 """alignment string for the query
486 """
487 self._hsp.query = self._value
488
490 """alignment string for the database
491 """
492 self._hsp.sbjct = self._value
493
495 """Formatting middle line as normally seen in BLAST report
496 """
497 self._hsp.match = self._value
498
499
504
506 """number of letters in the database
507 """
508 self._blast._num_letters_in_database = int(self._value)
509
514
519
521 """Karlin-Altschul parameter K
522 """
523 self._blast.ka_params = float(self._value)
524
526 """Karlin-Altschul parameter Lambda
527 """
528 self._blast.ka_params = (float(self._value),
529 self._blast.ka_params)
530
532 """Karlin-Altschul parameter H
533 """
534 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
535
536 -def parse(handle, debug=0):
537 """Returns an iterator a Blast record for each query.
538
539 handle - file handle to and XML file to parse
540 debug - integer, amount of debug information to print
541
542 This is a generator function that returns multiple Blast records
543 objects - one for each query sequence given to blast. The file
544 is read incrementally, returning complete records as they are read
545 in.
546
547 Should cope with new BLAST 2.2.14+ which gives a single XML file
548 for mutliple query records.
549
550 Should also cope with XML output from older versions BLAST which
551 gave multiple XML files concatenated together (giving a single file
552 which strictly speaking wasn't valid XML)."""
553 from xml.parsers import expat
554 BLOCK = 1024
555 MARGIN = 10
556 XML_START = "<?xml"
557
558 text = handle.read(BLOCK)
559 pending = ""
560
561 if not text :
562
563 raise ValueError("Your XML file was empty")
564
565 while text :
566
567 if not text.startswith(XML_START) :
568 raise SyntaxError("Your XML file did not start <?xml...")
569
570 expat_parser = expat.ParserCreate()
571 blast_parser = BlastParser(debug)
572 expat_parser.StartElementHandler = blast_parser.startElement
573 expat_parser.EndElementHandler = blast_parser.endElement
574 expat_parser.CharacterDataHandler = blast_parser.characters
575
576 expat_parser.Parse(text, False)
577 while blast_parser._records:
578 record = blast_parser._records[0]
579 blast_parser._records = blast_parser._records[1:]
580 yield record
581
582 while True :
583
584 text, pending = pending + handle.read(BLOCK), ""
585 if not text:
586
587 expat_parser.Parse("", True)
588 break
589
590
591
592 pending = handle.read(MARGIN)
593
594 if (text+pending).find("\n" + XML_START) == -1 :
595
596 expat_parser.Parse(text, False)
597 while blast_parser._records:
598 record = blast_parser._records[0]
599 blast_parser._records = blast_parser._records[1:]
600 yield record
601 else :
602
603
604
605
606 text, pending = (text+pending).split("\n" + XML_START,1)
607 pending = XML_START + pending
608
609 expat_parser.Parse(text, True)
610 while blast_parser._records:
611 record = blast_parser._records[0]
612 blast_parser._records = blast_parser._records[1:]
613 yield record
614
615
616
617 text, pending = pending, ""
618 break
619
620
621
622
623 assert pending==""
624 assert len(blast_parser._records) == 0
625
626
627 assert text==""
628 assert pending==""
629 assert len(blast_parser._records) == 0
630
631 if __name__ == '__main__':
632 import sys
633 import os
634 p = BlastParser()
635 r_list = p.parse(sys.argv[1])
636
637 for r in r_list :
638
639 print 'Blast of', r.query
640 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments),
641 reduce(lambda a,b: a+b,
642 [len(a.hsps) for a in r.alignments]))
643
644 for al in r.alignments:
645 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs'
646
647
648 E_VALUE_THRESH = 0.04
649 for alignment in r.alignments:
650 for hsp in alignment.hsps:
651 if hsp.expect < E_VALUE_THRESH:
652 print '*****'
653 print 'sequence', alignment.title
654 print 'length', alignment.length
655 print 'e value', hsp.expect
656 print hsp.query[:75] + '...'
657 print hsp.match[:75] + '...'
658 print hsp.sbjct[:75] + '...'
659