1
2
3
4
5
6
7 """This module provides code to work with the BLAST XML output
8 following the DTD available on the NCBI FTP
9 ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
10
11 Classes:
12 BlastParser Parses XML output from BLAST.
13 This (now) returns a list of Blast records.
14 Historically it returned a single Blast record.
15
16 _XMLParser Generic SAX parser.
17
18 Functions:
19 parse Incremental parser, this is an iterator that returns
20 Blast records.
21 """
22 from Bio.Blast import Record
23 import xml.sax
24 from xml.sax.handler import ContentHandler
25
27 """Generic SAX Parser
28
29 Just a very basic SAX parser.
30
31 Redefine the methods startElement, characters and endElement.
32 """
34 """Constructor
35
36 debug - integer, amount of debug information to print
37 """
38 self._tag = []
39 self._value = ''
40 self._debug = debug
41 self._debug_ignore_list = []
42
44 """Removes 'dangerous' from tag names
45
46 name -- name to be 'secured'
47 """
48
49 return name.replace('-', '_')
50
52 """Found XML start tag
53
54 No real need of attr, BLAST DTD doesn't use them
55
56 name -- name of the tag
57
58 attr -- tag attributes
59 """
60 self._tag.append(name)
61
62
63 method = self._secure_name('_start_' + name)
64
65
66
67 if hasattr(self, method) :
68 eval("self.%s()" % method)
69 if self._debug > 4 :
70 print "NCBIXML: Parsed: " + method
71 else :
72
73 if method not in self._debug_ignore_list :
74 if self._debug > 3 :
75 print "NCBIXML: Ignored: " + method
76 self._debug_ignore_list.append(method)
77
79 """Found some text
80
81 ch -- characters read
82 """
83 self._value += ch
84
86 """Found XML end tag
87
88 name -- tag name
89 """
90
91 self._value = self._value.strip()
92
93
94 method = self._secure_name('_end_' + name)
95
96
97 if hasattr(self, method) :
98 eval("self.%s()" % method)
99 if self._debug > 2 :
100 print "NCBIXML: Parsed: " + method, self._value
101 else :
102
103 if method not in self._debug_ignore_list :
104 if self._debug > 1 :
105 print "NCBIXML: Ignored: " + method, self._value
106 self._debug_ignore_list.append(method)
107
108
109 self._value = ''
110
112 """Parse XML BLAST data into a Record.Blast object
113
114 All XML 'action' methods are private methods and may be:
115 _start_TAG called when the start tag is found
116 _end_TAG called when the end tag is found
117 """
118
120 """Constructor
121
122 debug - integer, amount of debug information to print
123 """
124
125 _XMLparser.__init__(self, debug)
126
127 self._parser = xml.sax.make_parser()
128 self._parser.setContentHandler(self)
129
130
131 self._parser.setFeature(xml.sax.handler.feature_validation, 0)
132 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0)
133 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0)
134 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0)
135
136 self.reset()
137
139 """Reset all the data allowing reuse of the BlastParser() object"""
140 self._records = []
141 self._header = Record.Header()
142 self._parameters = Record.Parameters()
143 self._parameters.filter = None
144
145 - def parse(self, handler):
146 """Parses the XML data
147
148 handler -- file handler or StringIO
149
150 This method returns a list of Blast record objects.
151 """
152 import warnings
153 warnings.warn("Bio.Blast.NCBIXML.BlastParser.parse has been deprecated; please use Bio.Blast.NCBIXML.parse instead", DeprecationWarning)
154 self.reset()
155 self._parser.parse(handler)
156 return self._records
157
161
163
164
165 self._blast.reference = self._header.reference
166 self._blast.date = self._header.date
167 self._blast.version = self._header.version
168 self._blast.database = self._header.database
169 self._blast.application = self._header.application
170
171
172
173
174
175
176 if not hasattr(self._blast, "query") \
177 or not self._blast.query :
178 self._blast.query = self._header.query
179 if not hasattr(self._blast, "query_id") \
180 or not self._blast.query_id :
181 self._blast.query_id = self._header.query_id
182 if not hasattr(self._blast, "query_letters") \
183 or not self._blast.query_letters :
184 self._blast.query_letters = self._header.query_letters
185
186
187 self._blast.matrix = self._parameters.matrix
188 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e
189 self._blast.gap_penalties = self._parameters.gap_penalties
190 self._blast.filter = self._parameters.filter
191 self._blast.expect = self._parameters.expect
192 self._blast.sc_match = self._parameters.sc_match
193 self._blast.sc_mismatch = self._parameters.sc_mismatch
194
195
196 self._records.append(self._blast)
197
198 self._blast = None
199
200 if self._debug : "NCBIXML: Added Blast record to results"
201
202
204 """BLAST program, e.g., blastp, blastn, etc.
205
206 Save this to put on each blast record object
207 """
208 self._header.application = self._value.upper()
209
211 """version number and date of the BLAST engine.
212
213 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be
214 variants like "BLASTP 2.2.18+" without the date.
215
216 Save this to put on each blast record object
217 """
218 parts = self._value.split()
219
220
221
222 self._header.version = parts[1]
223
224
225 if len(parts) >= 3 :
226 if parts[2][0] == "[" and parts[2][-1] == "]" :
227 self._header.date = parts[2][1:-1]
228 else :
229
230
231 self._header.date = parts[2]
232
234 """a reference to the article describing the algorithm
235
236 Save this to put on each blast record object
237 """
238 self._header.reference = self._value
239
241 """the database(s) searched
242
243 Save this to put on each blast record object
244 """
245 self._header.database = self._value
246
248 """the identifier of the query
249
250 Important in old pre 2.2.14 BLAST, for recent versions
251 <Iteration_query-ID> is enough
252 """
253 self._header.query_id = self._value
254
256 """the definition line of the query
257
258 Important in old pre 2.2.14 BLAST, for recent versions
259 <Iteration_query-def> is enough
260 """
261 self._header.query = self._value
262
264 """the length of the query
265
266 Important in old pre 2.2.14 BLAST, for recent versions
267 <Iteration_query-len> is enough
268 """
269 self._header.query_letters = int(self._value)
270
272 """the identifier of the query
273 """
274 self._blast.query_id = self._value
275
277 """the definition line of the query
278 """
279 self._blast.query = self._value
280
282 """the length of the query
283 """
284 self._blast.query_letters = int(self._value)
285
286
287
288
289
290
291
292
293
294
295
297 """hits to the database sequences, one for every sequence
298 """
299 self._blast.num_hits = int(self._value)
300
301
302
303
304
305
306
308 """matrix used (-M)
309 """
310 self._parameters.matrix = self._value
311
313 """expect values cutoff (-e)
314 """
315
316
317
318
319
320
321
322 self._parameters.expect = self._value
323
324
325
326
327
328
330 """match score for nucleotide-nucleotide comparaison (-r)
331 """
332 self._parameters.sc_match = int(self._value)
333
335 """mismatch penalty for nucleotide-nucleotide comparaison (-r)
336 """
337 self._parameters.sc_mismatch = int(self._value)
338
340 """gap existence cost (-G)
341 """
342 self._parameters.gap_penalties = int(self._value)
343
345 """gap extension cose (-E)
346 """
347 self._parameters.gap_penalties = (self._parameters.gap_penalties,
348 int(self._value))
349
351 """filtering options (-F)
352 """
353 self._parameters.filter = self._value
354
355
356
357
358
359
360
361
362
363
364
365
367 self._blast.alignments.append(Record.Alignment())
368 self._blast.descriptions.append(Record.Description())
369 self._blast.multiple_alignment = []
370 self._hit = self._blast.alignments[-1]
371 self._descr = self._blast.descriptions[-1]
372 self._descr.num_alignments = 0
373
375
376 self._blast.multiple_alignment = None
377 self._hit = None
378 self._descr = None
379
381 """identifier of the database sequence
382 """
383 self._hit.hit_id = self._value
384 self._hit.title = self._value + ' '
385
387 """definition line of the database sequence
388 """
389 self._hit.hit_def = self._value
390 self._hit.title += self._value
391 self._descr.title = self._hit.title
392
394 """accession of the database sequence
395 """
396 self._hit.accession = self._value
397 self._descr.accession = self._value
398
400 self._hit.length = int(self._value)
401
402
404
405
406 self._hit.hsps.append(Record.HSP())
407 self._hsp = self._hit.hsps[-1]
408 self._descr.num_alignments += 1
409 self._blast.multiple_alignment.append(Record.MultipleAlignment())
410 self._mult_al = self._blast.multiple_alignment[-1]
411
412
414 """raw score of HSP
415 """
416 self._hsp.score = float(self._value)
417 if self._descr.score == None:
418 self._descr.score = float(self._value)
419
421 """bit score of HSP
422 """
423 self._hsp.bits = float(self._value)
424 if self._descr.bits == None:
425 self._descr.bits = float(self._value)
426
428 """expect value value of the HSP
429 """
430 self._hsp.expect = float(self._value)
431 if self._descr.e == None:
432 self._descr.e = float(self._value)
433
435 """offset of query at the start of the alignment (one-offset)
436 """
437 self._hsp.query_start = int(self._value)
438
440 """offset of query at the end of the alignment (one-offset)
441 """
442 self._hsp.query_end = int(self._value)
443
445 """offset of the database at the start of the alignment (one-offset)
446 """
447 self._hsp.sbjct_start = int(self._value)
448
450 """offset of the database at the end of the alignment (one-offset)
451 """
452 self._hsp.sbjct_end = int(self._value)
453
454
455
456
457
458
459
460
461
462
463
465 """frame of the query if applicable
466 """
467 self._hsp.frame = (int(self._value),)
468
470 """frame of the database sequence if applicable
471 """
472 self._hsp.frame += (int(self._value),)
473
475 """number of identities in the alignment
476 """
477 self._hsp.identities = int(self._value)
478
480 """number of positive (conservative) substitutions in the alignment
481 """
482 self._hsp.positives = int(self._value)
483
485 """number of gaps in the alignment
486 """
487 self._hsp.gaps = int(self._value)
488
490 """length of the alignment
491 """
492 self._hsp.align_length = int(self._value)
493
494
495
496
497
498
500 """alignment string for the query
501 """
502 self._hsp.query = self._value
503
505 """alignment string for the database
506 """
507 self._hsp.sbjct = self._value
508
510 """Formatting middle line as normally seen in BLAST report
511 """
512 self._hsp.match = self._value
513
514
519
524
529
534
536 """Karlin-Altschul parameter K
537 """
538 self._blast.ka_params = float(self._value)
539
541 """Karlin-Altschul parameter Lambda
542 """
543 self._blast.ka_params = (float(self._value),
544 self._blast.ka_params)
545
547 """Karlin-Altschul parameter H
548 """
549 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
550
551 -def parse(handle, debug=0):
552 """Returns an iterator a Blast record for each query.
553
554 handle - file handle to and XML file to parse
555 debug - integer, amount of debug information to print
556
557 This is a generator function that returns multiple Blast records
558 objects - one for each query sequence given to blast. The file
559 is read incrementally, returning complete records as they are read
560 in.
561
562 Should cope with new BLAST 2.2.14+ which gives a single XML file
563 for mutliple query records.
564
565 Should also cope with XML output from older versions BLAST which
566 gave multiple XML files concatenated together (giving a single file
567 which strictly speaking wasn't valid XML)."""
568 from xml.parsers import expat
569 BLOCK = 1024
570 MARGIN = 10
571 XML_START = "<?xml"
572
573 text = handle.read(BLOCK)
574 pending = ""
575
576 if not text :
577
578 raise ValueError("Your XML file was empty")
579
580 while text :
581
582 if not text.startswith(XML_START) :
583 raise ValueError("Your XML file did not start with %s..." \
584 % XML_START)
585
586 expat_parser = expat.ParserCreate()
587 blast_parser = BlastParser(debug)
588 expat_parser.StartElementHandler = blast_parser.startElement
589 expat_parser.EndElementHandler = blast_parser.endElement
590 expat_parser.CharacterDataHandler = blast_parser.characters
591
592 expat_parser.Parse(text, False)
593 while blast_parser._records:
594 record = blast_parser._records[0]
595 blast_parser._records = blast_parser._records[1:]
596 yield record
597
598 while True :
599
600 text, pending = pending + handle.read(BLOCK), ""
601 if not text:
602
603 expat_parser.Parse("", True)
604 break
605
606
607
608 pending = handle.read(MARGIN)
609
610 if (text+pending).find("\n" + XML_START) == -1 :
611
612 expat_parser.Parse(text, False)
613 while blast_parser._records:
614 record = blast_parser._records[0]
615 blast_parser._records = blast_parser._records[1:]
616 yield record
617 else :
618
619
620
621
622 text, pending = (text+pending).split("\n" + XML_START,1)
623 pending = XML_START + pending
624
625 expat_parser.Parse(text, True)
626 while blast_parser._records:
627 record = blast_parser._records[0]
628 blast_parser._records = blast_parser._records[1:]
629 yield record
630
631
632
633 text, pending = pending, ""
634 break
635
636
637
638
639 assert pending==""
640 assert len(blast_parser._records) == 0
641
642
643 assert text==""
644 assert pending==""
645 assert len(blast_parser._records) == 0
646
647 if __name__ == '__main__':
648 import sys
649 import os
650 handle = open(sys.argv[1])
651 r_list = parse(handle)
652
653 for r in r_list :
654
655 print 'Blast of', r.query
656 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments),
657 reduce(lambda a,b: a+b,
658 [len(a.hsps) for a in r.alignments]))
659
660 for al in r.alignments:
661 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs'
662
663
664 E_VALUE_THRESH = 0.04
665 for alignment in r.alignments:
666 for hsp in alignment.hsps:
667 if hsp.expect < E_VALUE_THRESH:
668 print '*****'
669 print 'sequence', alignment.title
670 print 'length', alignment.length
671 print 'e value', hsp.expect
672 print hsp.query[:75] + '...'
673 print hsp.match[:75] + '...'
674 print hsp.sbjct[:75] + '...'
675