Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """ 
 10  This module provides code to work with the WWW version of BLAST 
 11  provided by the NCBI. 
 12  http://www.ncbi.nlm.nih.gov/BLAST/ 
 13   
 14  Classes: 
 15  BlastParser   Parses output from WWW blast. 
 16  _Scanner      Scans output from NCBI's BLAST WWW server. 
 17   
 18  Functions: 
 19  qblast        Do a BLAST search using the QBLAST API. 
 20   
 21  """ 
 22  import re 
 23   
 24  try: 
 25      import cStringIO as StringIO 
 26  except ImportError: 
 27      import StringIO 
 28   
 29  from Bio.ParserSupport import * 
 30   
31 -class BlastParser(AbstractParser):
32 """Parses WWW BLAST data into a Record.Blast object. 33 34 """
35 - def __init__(self):
36 """__init__(self)""" 37 import NCBIStandalone 38 self._scanner = _Scanner() 39 self._consumer = SGMLStrippingConsumer(NCBIStandalone._BlastConsumer())
40
41 - def parse(self, handle):
42 """parse(self, handle)""" 43 self._scanner.feed(handle, self._consumer) 44 return self._consumer.data
45
46 -class _Scanner:
47 """Scan BLAST output from NCBI's web server at: 48 http://www.ncbi.nlm.nih.gov/BLAST/ 49 50 Tested with BLAST v2.0.10 51 52 Methods: 53 feed Feed data into the scanner. 54 55 """
56 - def feed(self, handle, consumer):
57 """S.feed(handle, consumer) 58 59 Feed in a BLAST report for scanning. handle is a file-like 60 object that contains the BLAST report. consumer is a Consumer 61 object that will receive events as the report is scanned. 62 63 """ 64 from Bio import File 65 66 # This stuff appears in 2.0.12. 67 # <p><!-- 68 # QBlastInfoBegin 69 # Status=READY 70 # QBlastInfoEnd 71 # --><p> 72 73 # <HTML> 74 # <HEAD> 75 # <TITLE>BLAST Search Results </TITLE> 76 # </HEAD> 77 # <BODY BGCOLOR="#FFFFFF" LINK="#0000FF" VLINK="#660099" ALINK="#660099 78 # <A HREF="http://www.ncbi.nlm.nih.gov/BLAST/blast_form.map"> <IMG SRC= 79 # <BR><BR><PRE> 80 81 # BLAST Formatted information 82 83 # 84 # </BODY> 85 # </HTML> 86 # </BODY> 87 # </HTML> 88 if isinstance(handle, File.UndoHandle): 89 uhandle = handle 90 else: 91 uhandle = File.UndoHandle(handle) 92 # Read HTML formatting up to the "BLAST" version line. 93 read_and_call_until(uhandle, consumer.noevent, 94 has_re=re.compile(r'<b>.?BLAST')) 95 96 self._scan_header(uhandle, consumer) 97 self._scan_rounds(uhandle, consumer) 98 self._scan_database_report(uhandle, consumer) 99 self._scan_parameters(uhandle, consumer) 100 101 # Read HTML footer information. 102 while uhandle.peekline(): 103 read_and_call(uhandle, consumer.noevent)
104
105 - def _scan_header(self, uhandle, consumer):
106 # <b>BLASTP 2.0.10 [Aug-26-1999]</b> 107 # 108 # 109 # <b><a href="http://www.ncbi.nlm.nih.gov/htbin- 110 # post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r">Reference</a>:</b> 111 # Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch&auml;ffer, 112 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 113 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 114 # programs", Nucleic Acids Res. 25:3389-3402. 115 # <p> 116 # <b>Query=</b> gi|120291|sp|P21297|FLBT_CAUCR FLBT PROTEIN. 117 # (141 letters) 118 # 119 # <b>Database:</b> Non-redundant SwissProt sequences 120 # 82,258 sequences; 29,652,561 total letters 121 # 122 # <p> <p>If you have any problems or questions with the results of this 123 124 # If there are hits, and Graphical Overview was selected: 125 # <FORM NAME="BLASTFORM"> 126 # </PRE> 127 # <CENTER> 128 # <H3><a href="/BLAST/newoptions.html#graphical-overview"> Distribution 129 # <input name=defline size=80 value="Mouse-over to show defline and sco 130 # </CENTER> 131 # <map name=img_map> 132 # <area shape=rect coords=69,101,476,106 href="#120291" ONMOUSEOVER='do 133 # <area shape=rect coords=156,108,305,113 href="#3024946" ONMOUSEOVER=' 134 # </map> 135 # <CENTER> 136 # <IMG WIDTH=529 HEIGHT=115 USEMAP=#img_map BORDER=1 SRC="nph-getgif.cg 137 # <HR> 138 # <PRE> XXX 139 consumer.start_header() 140 141 # Read the "BLAST" version line and the following blanks. 142 read_and_call(uhandle, consumer.version, contains='BLAST') 143 read_and_call_while(uhandle, consumer.noevent, blank=1) 144 145 # Read the reference lines and the '<p>' line. 146 # TBLASTN 2.2.6 has a blank line instead of a "<p>". 147 while 1: 148 line = uhandle.readline() 149 if line[:3] == '<p>' or not line.strip(): 150 consumer.noevent(line) 151 break 152 consumer.reference(line) 153 154 # Read the RID line, for version 2.0.12 (2.0.11?) and above. 155 attempt_read_and_call(uhandle, consumer.noevent, start='RID') 156 # Brad Chapman noticed a '<p>' line in BLASTN 2.1.1; this line 157 # seems to have disappeared again. 158 # attempt_read_and_call(uhandle, consumer.noevent, start='<p>') 159 attempt_read_and_call(uhandle, consumer.noevent) 160 161 # Apparently, there's some discrepancy between whether the 162 # Query or database comes first. Usually the Query does, but 163 # Brad noticed a case where the database came first. 164 if uhandle.peekline().find("Query=") >= 0: 165 self._scan_query_info(uhandle, consumer) 166 self._scan_database_info(uhandle, consumer) 167 else: 168 self._scan_database_info(uhandle, consumer) 169 self._scan_query_info(uhandle, consumer) 170 read_and_call_while(uhandle, consumer.noevent, blank=1) 171 consumer.end_header()
172
173 - def _scan_blastform(self, uhandle, consumer):
174 if attempt_read_and_call(uhandle, consumer.noevent, 175 contains="BLASTFORM"): 176 while 1: 177 line = uhandle.peekline() 178 if is_blank_line(line): 179 break 180 elif "Query=" in line: 181 break 182 consumer.noevent(uhandle.readline())
183
184 - def _scan_database_info(self, uhandle, consumer):
185 attempt_read_and_call(uhandle, consumer.noevent, start='<p>') 186 read_and_call(uhandle, consumer.database_info, contains='Database') 187 # Sagar Damle reported that databases can consist of multiple lines. 188 # But, trickily enough, sometimes the second line can also have the 189 # word sequences in it. Try to use 'sequences;' (with a semicolon) 190 read_and_call_until(uhandle, consumer.database_info, 191 contains='sequences;') 192 read_and_call(uhandle, consumer.database_info, contains='sequences;') 193 read_and_call(uhandle, consumer.noevent, blank=1) 194 attempt_read_and_call(uhandle, consumer.noevent, 195 contains='problems or questions') 196 self._scan_blastform(uhandle, consumer) 197 198 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 199 if attempt_read_and_call(uhandle, consumer.noevent, 200 start="<table border=0 width=600"): 201 read_and_call_until(uhandle, consumer.noevent, 202 contains="</table>") 203 consumer.noevent(uhandle.readline()) 204 read_and_call(uhandle, consumer.noevent, blank=1) 205 206 attempt_read_and_call(uhandle, consumer.noevent, start="<p>") 207 208 if attempt_read_and_call(uhandle, consumer.noevent, 209 contains="Taxonomy reports"): 210 read_and_call(uhandle, consumer.noevent, start="<BR>") 211 attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>") 212 213 # </PRE> 214 # <!-- Progress msg from the server 500 7--> 215 # <!-- Progress msg from the server 1000 15--> 216 # <!-- Progress msg from the server 1500 21--> 217 # ... 218 # <PRE><HR><BR><b>Query=</b> test 219 # (60 letters) 220 if attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>"): 221 read_and_call_until(uhandle, consumer.noevent, start="<PRE>") 222 while 1: 223 line = uhandle.peekline() 224 if not line[:5] == "<PRE>" or line.find("Query=") >= 0: 225 break 226 read_and_call(uhandle, consumer.noevent, start="<PRE>") 227 228 read_and_call_while(uhandle, consumer.noevent, blank=1)
229
230 - def _scan_query_info(self, uhandle, consumer):
231 # Read the Query lines and the following blank line. 232 read_and_call(uhandle, consumer.query_info, contains='Query=') 233 read_and_call_until(uhandle, consumer.query_info, blank=1) 234 read_and_call_while(uhandle, consumer.noevent, blank=1) 235 if attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>"): 236 read_and_call_while(uhandle, consumer.noevent, blank=1) 237 self._scan_blastform(uhandle, consumer)
238
239 - def _scan_rounds(self, uhandle, consumer):
240 self._scan_descriptions(uhandle, consumer) 241 self._scan_alignments(uhandle, consumer)
242
243 - def _scan_descriptions(self, uhandle, consumer):
244 consumer.start_descriptions() 245 246 # Three things can happen here: 247 # 1. line contains 'Score E' 248 # 2. line contains "No significant similarity" 249 # 3. no descriptions 250 if not attempt_read_and_call( 251 uhandle, consumer.description_header, 252 has_re=re.compile(r"Score {4,5}E")): 253 # Either case 2 or 3. Look for "No hits found". 254 attempt_read_and_call(uhandle, consumer.no_hits, 255 contains='No significant similarity') 256 read_and_call_while(uhandle, consumer.noevent, blank=1) 257 consumer.end_descriptions() 258 # Stop processing. 259 return 260 # Sequences producing significant alignments: 261 # 262 # <a href="http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Ret 263 # <a href="http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Ret 264 # 265 # Read the score header lines and a blank line. 266 read_and_call(uhandle, consumer.description_header, 267 start='Sequences producing') 268 read_and_call(uhandle, consumer.noevent, blank=1) 269 270 # Read the descriptions 271 # The description contains at least an <a href> into the alignments. 272 # What is no alignments are chosen? 273 read_and_call_while(uhandle, consumer.description, 274 blank=0, contains='<a') 275 276 # two choices here, either blank lines or a </PRE> 277 if not attempt_read_and_call(uhandle, consumer.noevent, 278 contains='</PRE>'): 279 read_and_call_while(uhandle, consumer.noevent, blank=1) 280 281 consumer.end_descriptions()
282
283 - def _scan_alignments(self, uhandle, consumer):
284 # Check to see whether I'm at an alignment or database report. 285 # Possibilities: 286 # 1) BLASTP 2.0.14, pairwise alignment 287 # <CENTER><b><FONT color="green">Alignments</FONT></b></CENTER> 288 # <PRE> 289 # ><a name = 121837></a><a href="http://www.ncbi.nlm.nih.gov:80/entre 290 # 2) BLASTP 2.0.10, pairwise alignment 291 # <PRE> 292 # <a name = 120291> </a><a href="http://www.ncbi.nlm.nih.gov:80/entre 293 # 3) BLASTP 2.0.10, master-slave 294 # <PRE> 295 # blast_tmp 1 MFQQIGAVQAKSGTDEPAHPCEKFPPERKCEAVFWKPLPRHEAREILLAARK 296 # 4) BLASTP 2.0.10, 2.0.14, database 297 # <PRE> 298 # Database: Non-redundant SwissProt sequences 299 # 5) BLASTX 2.2.4, pairwise alignment 300 # <CENTER><b><FONT color="green">Alignments</FONT></b></CENTER> 301 # </form> 302 # <script src="blastResult.js"></script><table border="0"><tr><td><FO 303 # <PRE> 304 # 6) Qblast 2.2.10, database (no 'Database' line) 305 # <PRE> 306 # Lambda K H 307 308 # Get the first two lines and examine them. 309 line1 = safe_readline(uhandle) 310 line2 = safe_readline(uhandle) 311 uhandle.saveline(line2) 312 uhandle.saveline(line1) 313 314 is_pairwise = is_masterslave = 0 315 if 'Alignments' in line2: 316 is_pairwise = 1 317 elif line2.startswith(' Database'): 318 pass 319 elif line2.startswith('Lambda K H'): 320 pass 321 elif line2.startswith('blast_tmp'): 322 is_masterslave = 1 323 elif line1.startswith('<PRE>'): 324 is_pairwise = 1 325 else: 326 raise SyntaxError, "Cannot resolve location at lines:\n%s\n%s" % (line1, line2) 327 328 if is_pairwise: 329 self._scan_pairwise_alignments(uhandle, consumer) 330 elif is_masterslave: 331 self._scan_masterslave_alignment(uhandle, consumer)
332
333 - def _scan_pairwise_alignments(self, uhandle, consumer):
334 while 1: 335 read_and_call_until(uhandle, consumer.noevent, start='<PRE>') 336 337 # The first line is <PRE>. Check the second line to see if 338 # I'm still at an alignment. 339 line1 = safe_readline(uhandle) 340 line2 = safe_readline(uhandle) 341 uhandle.saveline(line2) 342 uhandle.saveline(line1) 343 # Lambda is for Q-blast results, which do not have a Database line 344 if line1.find('Database') >= 0 or line2.find("Database") >= 0 \ 345 or line2.find('Lambda K H') >= 0: 346 break 347 348 # Occasionally, there's a bug where the alignment_header and 349 # hsp_header are skipped, leaving only the hsp_alignment. 350 # Detect this and handle it accordingly. 351 if line2[:6] == 'Query:': 352 self._scan_abbreviated_pairwise_alignment(uhandle, consumer) 353 else: 354 self._scan_one_pairwise_alignment(uhandle, consumer)
355
356 - def _scan_abbreviated_pairwise_alignment(self, uhandle, consumer):
357 # Sometimes all header information is skipped, leaving 358 # only the raw alignments. I believe this is a bug because 359 # without the header information, you lose vital information such 360 # as score, target sequence id, etc. 361 # Format: 362 # <PRE> 363 # hsp_alignment 364 365 consumer.start_alignment() 366 consumer.start_hsp() 367 read_and_call(uhandle, consumer.noevent, start='<PRE>') 368 self._scan_hsp_alignment(uhandle, consumer) 369 consumer.end_hsp() 370 consumer.end_alignment()
371
372 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
373 # Alignment format: 374 # <CENTER><b><FONT color="green">Alignments</FONT></b></CENTER> 375 # (BLAST 2.0.14) 376 # <PRE> 377 # alignment_header 378 # hsp_header 379 # hsp_alignment 380 # [...] 381 # The hsp_header and hsp_alignment blocks can be repeated. 382 383 consumer.start_alignment() 384 read_and_call(uhandle, consumer.noevent, start='<PRE>') 385 self._scan_alignment_header(uhandle, consumer) 386 387 # Scan a bunch of score/alignment's. 388 while 1: 389 # An HSP header starts with ' Score'. 390 # However, if the HSP header is not the first one in the 391 # alignment, there will be a '<PRE>' line first. Therefore, 392 # I will need to check either of the first two lines to 393 # see if I'm at an HSP header. 394 line1 = safe_readline(uhandle) 395 line2 = safe_readline(uhandle) 396 line3 = safe_readline(uhandle) 397 uhandle.saveline(line3) 398 uhandle.saveline(line2) 399 uhandle.saveline(line1) 400 # There can be <a> links in front of 'Score' 401 rea = re.compile(r"</?a[^>]*>") 402 line1 = rea.sub("", line1) 403 line2 = rea.sub("", line2) 404 line3 = rea.sub("", line3) 405 if line1[:6] != ' Score' and line2[:6] != ' Score' and \ 406 line3[:6] != ' Score': 407 break 408 self._scan_hsp(uhandle, consumer) 409 410 consumer.end_alignment()
411
412 - def _scan_alignment_header(self, uhandle, consumer):
413 # <a name = 120291> </a><a href="http://www.ncbi.nlm.nih.gov:80/entrez/ 414 # Length = 141 415 # 416 while 1: 417 line = safe_readline(uhandle) 418 if line.lstrip().startswith('Length ='): 419 consumer.length(line) 420 break 421 elif is_blank_line(line): 422 # Check to make sure I haven't missed the Length line 423 raise SyntaxError, "I missed the Length in an alignment header" 424 consumer.title(line) 425 426 if not attempt_read_and_call(uhandle, consumer.noevent, 427 start=' '): 428 read_and_call(uhandle, consumer.noevent, blank=1)
429
430 - def _scan_hsp(self, uhandle, consumer):
431 consumer.start_hsp() 432 self._scan_hsp_header(uhandle, consumer) 433 self._scan_hsp_alignment(uhandle, consumer) 434 consumer.end_hsp()
435
436 - def _scan_hsp_header(self, uhandle, consumer):
437 # If the HSP is not the first one within an alignment, includes: 438 # <PRE> 439 440 # Score = 22.7 bits (47), Expect = 2.5 441 # Identities = 10/36 (27%), Positives = 18/36 (49%) 442 # Strand = Plus / Plus 443 # Frame = +3 444 # 445 446 attempt_read_and_call(uhandle, consumer.noevent, start='<PRE>') 447 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 448 read_and_call(uhandle, consumer.score, 449 has_re=re.compile(r'^ (<a[^>]*></a>)*Score')) 450 read_and_call(uhandle, consumer.identities, start=' Identities') 451 # BLASTN 452 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 453 # BLASTX, TBLASTN, TBLASTX 454 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 455 read_and_call(uhandle, consumer.noevent, blank=1)
456
457 - def _scan_hsp_alignment(self, uhandle, consumer):
458 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 459 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 460 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 461 # 462 # Query: 64 AEKILIKR 71 463 # I +K 464 # Sbjct: 70 PNIIQLKD 77 465 # </PRE> 466 # 467 # 468 469 while 1: 470 # Blastn adds an extra line filled with spaces before Query 471 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 472 read_and_call(uhandle, consumer.query, start='Query') 473 read_and_call(uhandle, consumer.align, start=' ') 474 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 475 if not attempt_read_and_call(uhandle, consumer.noevent, blank=1): 476 break 477 read_and_call(uhandle, consumer.noevent, start='</PRE>') 478 read_and_call_while(uhandle, consumer.noevent, blank=1)
479
480 - def _scan_masterslave_alignment(self, uhandle, consumer):
481 consumer.start_alignment() 482 read_and_call(uhandle, consumer.noevent, start='<PRE>') 483 while 1: 484 line = safe_readline(uhandle) 485 if is_blank_line(line): 486 consumer.noevent(line) 487 elif line[:6] == '</PRE>': 488 consumer.noevent(line) 489 break 490 else: 491 consumer.multalign(line) 492 read_and_call_while(uhandle, consumer.noevent, blank=1) 493 consumer.end_alignment()
494
495 - def _scan_database_report(self, uhandle, consumer):
496 # <PRE> 497 # Database: Non-redundant SwissProt sequences 498 # Posted date: Dec 18, 1999 8:26 PM 499 # Number of letters in database: 29,652,561 500 # Number of sequences in database: 82,258 501 # 502 # Lambda K H 503 # 0.317 0.133 0.395 504 # 505 # Gapped 506 # Lambda K H 507 # 0.270 0.0470 0.230 508 # 509 510 # qblast (BLASTN 2.2.10) does not give the Database: bits before the Lambda 511 # information, so that needs to be skipped 512 513 consumer.start_database_report() 514 515 # TBALSTN 2.2.6 516 # <PRE> Database: /tmp/affyA.fasta 517 line = uhandle.peekline() 518 # only look for database information if we aren't already at the 519 # Lambda bits 520 if line.find("Database") < 0: 521 read_and_call(uhandle, consumer.noevent, start='<PRE>') 522 line2 = uhandle.peekline() 523 if line2.find("Lambda K H") < 0: 524 read_and_call(uhandle, consumer.database, contains=' Database') 525 read_and_call_until(uhandle, consumer.database, contains="Posted") 526 read_and_call(uhandle, consumer.posted_date, start=' Posted') 527 read_and_call(uhandle, consumer.num_letters_in_database, 528 start=' Number of letters') 529 read_and_call(uhandle, consumer.num_sequences_in_database, 530 start=' Number of sequences') 531 read_and_call(uhandle, consumer.noevent, start=' ') 532 533 read_and_call(uhandle, consumer.noevent, start='Lambda') 534 read_and_call(uhandle, consumer.ka_params) 535 read_and_call(uhandle, consumer.noevent, blank=1) 536 537 # not BLASTP 538 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 539 # not TBLASTX 540 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 541 read_and_call(uhandle, consumer.ka_params_gap) 542 read_and_call_while(uhandle, consumer.noevent, blank=1) 543 544 consumer.end_database_report()
545
546 - def _scan_parameters(self, uhandle, consumer):
547 # Matrix: BLOSUM62 548 # Number of Hits to DB: 1st pass: 41542626, 2nd pass: 9765 549 # Number of Sequences: 1st pass: 89405, 2nd pass: 84 550 # Number of extensions: 1st pass: 500847, 2nd pass: 6747 551 # Number of successful extensions: 1st pass: 14, 2nd pass: 49 552 # Number of sequences better than 10.0: 20 553 # length of query: 205 554 # length of database: 10,955,950 555 # effective HSP length: 46 556 # effective length of query: 158 557 # effective length of database: 6,843,320 558 # effective search space: 1081244560 559 # effective search space used: 1081244560 560 # frameshift window, decay const: 50, 0.5 561 # T: 13 562 # A: 40 563 # X1: 16 ( 7.3 bits) 564 # X2: 0 ( 0.0 bits) 565 # S1: 41 (21.7 bits) 566 # S2: 52 (26.7 bits) 567 # 568 # </PRE> 569 570 # 6/3/2001, </PRE> is gone, replaced by </form> 571 572 573 consumer.start_parameters() 574 575 # qblast doesn't have Matrix line 576 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 577 # not TBLASTX 578 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 579 580 # in qblast the Number of Hits and Number of Sequences lines are 581 # reversed 582 if attempt_read_and_call(uhandle, consumer.num_hits, 583 start='Number of Hits'): 584 read_and_call(uhandle, consumer.num_sequences, 585 start='Number of Sequences') 586 else: 587 read_and_call(uhandle, consumer.num_sequences, 588 start='Number of Sequences') 589 read_and_call(uhandle, consumer.num_hits, 590 start='Number of Hits') 591 592 read_and_call(uhandle, consumer.num_extends, 593 start='Number of extensions') 594 read_and_call(uhandle, consumer.num_good_extends, 595 start='Number of successful') 596 597 read_and_call(uhandle, consumer.num_seqs_better_e, 598 start='Number of sequences') 599 600 # not BLASTN, TBLASTX 601 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 602 start="Number of HSP's better"): 603 # for qblast order of HSP info is changed 604 if attempt_read_and_call(uhandle, consumer.hsps_prelim_gapped, 605 start="Number of HSP's successfully"): 606 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 607 start="Number of HSP's that") 608 read_and_call(uhandle, consumer.hsps_gapped, 609 start="Number of HSP's gapped") 610 else: 611 read_and_call(uhandle, consumer.no_event, 612 start="Number of HSP's gapped") 613 read_and_call(uhandle, consumer.no_event, 614 start="Number of HSP's successfully") 615 read_and_call(uhandle, consumer.no_event, 616 start="Number of extra gapped") 617 618 # QBlast has different capitalization on the Length info: 619 if attempt_read_and_call(uhandle, consumer.query_length, 620 start='Length of query'): 621 read_and_call(uhandle, consumer.database_length, 622 start='Length of database') 623 read_and_call(uhandle, consumer.no_event, 624 start='Length adjustment') 625 attempt_read_and_call(uhandle, consumer.effective_query_length, 626 start='Effective length of query') 627 read_and_call(uhandle, consumer.effective_database_length, 628 start='Effective length of database') 629 attempt_read_and_call(uhandle, consumer.effective_search_space, 630 start='Effective search space:') 631 attempt_read_and_call(uhandle, consumer.effective_search_space_used, 632 start='Effective search space used') 633 634 else: 635 attempt_read_and_call(uhandle, consumer.query_length, 636 start='length of query') 637 read_and_call(uhandle, consumer.database_length, 638 start='length of database') 639 read_and_call(uhandle, consumer.effective_hsp_length, 640 start='effective HSP') 641 attempt_read_and_call(uhandle, consumer.effective_query_length, 642 start='effective length of query') 643 read_and_call(uhandle, consumer.effective_database_length, 644 start='effective length of database') 645 attempt_read_and_call(uhandle, consumer.effective_search_space, 646 start='effective search space:') 647 attempt_read_and_call(uhandle, consumer.effective_search_space_used, 648 start='effective search space used') 649 650 # BLASTX, TBLASTN, TBLASTX 651 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 652 attempt_read_and_call(uhandle, consumer.threshold, start='T') 653 read_and_call(uhandle, consumer.window_size, start='A') 654 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 655 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 656 # not BLASTN, TBLASTX 657 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 658 start='X3') 659 read_and_call(uhandle, consumer.gap_trigger, start='S1') 660 attempt_read_and_call(uhandle, consumer.blast_cutoff, start='S2') 661 662 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 663 attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>") 664 attempt_read_and_call(uhandle, consumer.noevent, start="</form>") 665 666 consumer.end_parameters()
667
668 -def qblast(program, database, sequence, 669 auto_format=None,composition_based_statistics=None, 670 db_genetic_code=None,endpoints=None,entrez_query='(none)', 671 expect=10.0,filter=None,gapcosts=None,genetic_code=None, 672 hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None, 673 matrix_name=None,nucl_penalty=None,nucl_reward=None, 674 other_advanced=None,perc_ident=None,phi_pattern=None, 675 query_file=None,query_believe_defline=None,query_from=None, 676 query_to=None,searchsp_eff=None,service=None,threshold=None, 677 ungapped_alignment=None,word_size=None, 678 alignments=500,alignment_view=None,descriptions=500, 679 entrez_links_new_window=None,expect_low=None,expect_high=None, 680 format_entrez_query=None,format_object=None,format_type='XML', 681 ncbi_gi=None,results_file=None,show_overview=None 682 ):
683 """Do a BLAST search using the QBLAST server at NCBI. 684 685 Supports all parameters of the qblast API for Put and Get. 686 Some useful parameters: 687 program BLASTP or BLASTN 688 database Which database to search against. 689 sequence The sequence to search. 690 ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 691 descriptions Number of descriptions to show. Def 500. 692 alignments Number of alignments to show. Def 500. 693 expect An expect value cutoff. Def 10.0. 694 matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 695 filter "none" turns off filtering. Default no filtering 696 format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 697 entrez_query Entrez query to limit Blast search 698 hitlist_size Number of hits to return. Default 50 699 700 This function does no checking of the validity of the parameters 701 and passes the values to the server as is. More help is available at: 702 http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html 703 704 """ 705 import urllib, urllib2 706 from Bio.WWW import RequestLimiter 707 708 assert program == 'blastn' or program == 'blastp' 709 710 # Format the "Put" command, which sends search requests to qblast. 711 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 712 parameters = [ 713 ('AUTO_FORMAT',auto_format), 714 ('COMPOSITION_BASED_STATISTICS',composition_based_statistics), 715 ('DATABASE',database), 716 ('DB_GENETIC_CODE',db_genetic_code), 717 ('ENDPOINTS',endpoints), 718 ('ENTREZ_QUERY',entrez_query), 719 ('EXPECT',expect), 720 ('FILTER',filter), 721 ('GAPCOSTS',gapcosts), 722 ('GENETIC_CODE',genetic_code), 723 ('HITLIST_SIZE',hitlist_size), 724 ('I_THRESH',i_thresh), 725 ('LAYOUT',layout), 726 ('LCASE_MASK',lcase_mask), 727 ('MATRIX_NAME',matrix_name), 728 ('NUCL_PENALTY',nucl_penalty), 729 ('NUCL_REWARD',nucl_reward), 730 ('OTHER_ADVANCED',other_advanced), 731 ('PERC_IDENT',perc_ident), 732 ('PHI_PATTERN',phi_pattern), 733 ('PROGRAM',program), 734 ('QUERY',sequence), 735 ('QUERY_FILE',query_file), 736 ('QUERY_BELIEVE_DEFLINE',query_believe_defline), 737 ('QUERY_FROM',query_from), 738 ('QUERY_TO',query_to), 739 ('SEARCHSP_EFF',searchsp_eff), 740 ('SERVICE',service), 741 ('THRESHOLD',threshold), 742 ('UNGAPPED_ALIGNMENT',ungapped_alignment), 743 ('WORD_SIZE',word_size), 744 ('CMD', 'Put'), 745 ] 746 query = [x for x in parameters if x[1] is not None] 747 message = urllib.urlencode(query) 748 749 # Send off the initial query to qblast. 750 request = urllib2.Request("http://www.ncbi.nlm.nih.gov/blast/Blast.cgi", 751 message, 752 {"User-Agent":"BiopythonClient"}) 753 handle = urllib2.urlopen(request) 754 755 # Format the "Get" command, which gets the formatted results from qblast 756 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 757 rid, rtoe = _parse_qblast_ref_page(handle) 758 parameters = [ 759 ('ALIGNMENTS',alignments), 760 ('ALIGNMENT_VIEW',alignment_view), 761 ('DESCRIPTIONS',descriptions), 762 ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window), 763 ('EXPECT_LOW',expect_low), 764 ('EXPECT_HIGH',expect_high), 765 ('FORMAT_ENTREZ_QUERY',format_entrez_query), 766 ('FORMAT_OBJECT',format_object), 767 ('FORMAT_TYPE',format_type), 768 ('NCBI_GI',ncbi_gi), 769 ('RID',rid), 770 ('RESULTS_FILE',results_file), 771 ('SERVICE',service), 772 ('SHOW_OVERVIEW',show_overview), 773 ('CMD', 'Get'), 774 ] 775 query = [x for x in parameters if x[1] is not None] 776 message = urllib.urlencode(query) 777 778 # Poll NCBI until the results are ready. 779 limiter = RequestLimiter(3) 780 while 1: 781 limiter.wait() 782 request = urllib2.Request("http://www.ncbi.nlm.nih.gov/blast/Blast.cgi", 783 message, 784 {"User-Agent":"BiopythonClient"}) 785 handle = urllib2.urlopen(request) 786 results = handle.read() 787 # XML results don't have the Status tag when finished 788 if results.find("Status=") < 0: 789 break 790 i = results.index("Status=") 791 j = results.index("\n", i) 792 status = results[i+len("Status="):j].strip() 793 if status.upper() == "READY": 794 break 795 796 return StringIO.StringIO(results)
797
798 -def _parse_qblast_ref_page(handle):
799 """Return tuple of RID, RTOE.""" 800 s = handle.read() 801 i = s.find("RID =") 802 j = s.find("\n", i) 803 rid = s[i+len("RID ="):j].strip() 804 805 i = s.find("RTOE =") 806 j = s.find("\n", i) 807 rtoe = s[i+len("RTOE ="):j].strip() 808 return rid, int(rtoe)
809