Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8   
   9  """ 
  10  This module provides code to work with the standalone version of 
  11  BLAST, either blastall or blastpgp, provided by the NCBI. 
  12  http://www.ncbi.nlm.nih.gov/BLAST/ 
  13   
  14  Classes: 
  15  LowQualityBlastError     Except that indicates low quality query sequences. 
  16  BlastParser              Parses output from blast. 
  17  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  18  PSIBlastParser           Parses output from psi-blast. 
  19  Iterator                 Iterates over a file of blast results. 
  20   
  21  _Scanner                 Scans output from standalone BLAST. 
  22  _BlastConsumer           Consumes output from blast. 
  23  _PSIBlastConsumer        Consumes output from psi-blast. 
  24  _HeaderConsumer          Consumes header information. 
  25  _DescriptionConsumer     Consumes description information. 
  26  _AlignmentConsumer       Consumes alignment information. 
  27  _HSPConsumer             Consumes hsp information. 
  28  _DatabaseReportConsumer  Consumes database report information. 
  29  _ParametersConsumer      Consumes parameters information. 
  30   
  31  Functions: 
  32  blastall        Execute blastall. 
  33  blastpgp        Execute blastpgp. 
  34  rpsblast        Execute rpsblast. 
  35   
  36  """ 
  37   
  38  from __future__ import generators 
  39  import os 
  40  import re 
  41   
  42  from Bio import File 
  43  from Bio.ParserSupport import * 
  44  from Bio.Blast import Record 
  45   
  46   
47 -class LowQualityBlastError(Exception):
48 """Error caused by running a low quality sequence through BLAST. 49 50 When low quality sequences (like GenBank entries containing only 51 stretches of a single nucleotide) are BLASTed, they will result in 52 BLAST generating an error and not being able to perform the BLAST. 53 search. This error should be raised for the BLAST reports produced 54 in this case. 55 """ 56 pass
57
58 -class ShortQueryBlastError(Exception):
59 """Error caused by running a short query sequence through BLAST. 60 61 If the query sequence is too short, BLAST outputs warnings and errors: 62 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 63 [blastall] ERROR: [000.000] AT1G08320: Blast: 64 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 65 done 66 67 This exception is raised when that condition is detected. 68 69 """ 70 pass
71 72
73 -class _Scanner:
74 """Scan BLAST output from blastall or blastpgp. 75 76 Tested with blastall and blastpgp v2.0.10, v2.0.11 77 78 Methods: 79 feed Feed data into the scanner. 80 81 """
82 - def feed(self, handle, consumer):
83 """S.feed(handle, consumer) 84 85 Feed in a BLAST report for scanning. handle is a file-like 86 object that contains the BLAST report. consumer is a Consumer 87 object that will receive events as the report is scanned. 88 89 """ 90 if isinstance(handle, File.UndoHandle): 91 uhandle = handle 92 else: 93 uhandle = File.UndoHandle(handle) 94 95 # Try to fast-forward to the beginning of the blast report. 96 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 97 # Now scan the BLAST report. 98 self._scan_header(uhandle, consumer) 99 self._scan_rounds(uhandle, consumer) 100 self._scan_database_report(uhandle, consumer) 101 self._scan_parameters(uhandle, consumer)
102
103 - def _scan_header(self, uhandle, consumer):
104 # BLASTP 2.0.10 [Aug-26-1999] 105 # 106 # 107 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 108 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 109 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 110 # programs", Nucleic Acids Res. 25:3389-3402. 111 # 112 # Query= test 113 # (140 letters) 114 # 115 # Database: sdqib40-1.35.seg.fa 116 # 1323 sequences; 223,339 total letters 117 # 118 # ======================================================== 119 # This next example is from the online version of Blast, 120 # note there are TWO references, an RID line, and also 121 # the database is BEFORE the query line. 122 # Note there possibleuse of non-ASCII in the author names. 123 # ======================================================== 124 # 125 # BLASTP 2.2.15 [Oct-15-2006] 126 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 127 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 128 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 129 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 130 # 131 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 132 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 133 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 134 # protein database searches with composition-based statistics 135 # and other refinements", Nucleic Acids Res. 29:2994-3005. 136 # 137 # RID: 1166022616-19998-65316425856.BLASTQ1 138 # 139 # 140 # Database: All non-redundant GenBank CDS 141 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 142 # 4,254,166 sequences; 1,462,033,012 total letters 143 # Query= gi:16127998 144 # Length=428 145 # 146 147 consumer.start_header() 148 149 read_and_call(uhandle, consumer.version, contains='BLAST') 150 read_and_call_while(uhandle, consumer.noevent, blank=1) 151 152 # There might be a <pre> line, for qblast output. 153 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 154 155 # Read the reference(s) 156 while attempt_read_and_call(uhandle, 157 consumer.reference, start='Reference') : 158 # References are normally multiline terminated by a blank line 159 # (or, based on the old code, the RID line) 160 while 1: 161 line = uhandle.readline() 162 if is_blank_line(line) : 163 consumer.noevent(line) 164 break 165 elif line.startswith("RID"): 166 break 167 else : 168 #More of the reference 169 consumer.reference(line) 170 171 #Deal with the optional RID: ... 172 read_and_call_while(uhandle, consumer.noevent, blank=1) 173 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 174 read_and_call_while(uhandle, consumer.noevent, blank=1) 175 176 # blastpgp has a Reference for composition-based statistics. 177 if attempt_read_and_call( 178 uhandle, consumer.reference, start="Reference"): 179 read_and_call_until(uhandle, consumer.reference, blank=1) 180 read_and_call_while(uhandle, consumer.noevent, blank=1) 181 182 line = uhandle.peekline() 183 assert line.strip() <> "" 184 assert not line.startswith("RID:") 185 if line.startswith("Query=") : 186 #This is an old style query then database... 187 188 # Read the Query lines and the following blank line. 189 read_and_call(uhandle, consumer.query_info, start='Query=') 190 read_and_call_until(uhandle, consumer.query_info, blank=1) 191 read_and_call_while(uhandle, consumer.noevent, blank=1) 192 193 # Read the database lines and the following blank line. 194 read_and_call_until(uhandle, consumer.database_info, end='total letters') 195 read_and_call(uhandle, consumer.database_info, contains='sequences') 196 read_and_call_while(uhandle, consumer.noevent, blank=1) 197 elif line.startswith("Database:") : 198 #This is a new style database then query... 199 read_and_call_until(uhandle, consumer.database_info, end='total letters') 200 read_and_call(uhandle, consumer.database_info, contains='sequences') 201 read_and_call_while(uhandle, consumer.noevent, blank=1) 202 203 # Read the Query lines and the following blank line. 204 read_and_call(uhandle, consumer.query_info, start='Query=') 205 read_and_call_until(uhandle, consumer.query_info, blank=1) 206 read_and_call_while(uhandle, consumer.noevent, blank=1) 207 else : 208 raise SyntaxError("Invalid header?") 209 210 consumer.end_header()
211
212 - def _scan_rounds(self, uhandle, consumer):
213 # Scan a bunch of rounds. 214 # Each round begins with either a "Searching......" line 215 # or a 'Score E' line followed by descriptions and alignments. 216 # The email server doesn't give the "Searching....." line. 217 # If there is no 'Searching.....' line then you'll first see a 218 # 'Results from round' line 219 220 while 1: 221 line = safe_peekline(uhandle) 222 if (not line.startswith('Searching') and 223 not line.startswith('Results from round') and 224 re.search(r"Score +E", line) is None and 225 line.find('No hits found') == -1): 226 break 227 228 self._scan_descriptions(uhandle, consumer) 229 self._scan_alignments(uhandle, consumer)
230
231 - def _scan_descriptions(self, uhandle, consumer):
232 # Searching..................................................done 233 # Results from round 2 234 # 235 # 236 # Sc 237 # Sequences producing significant alignments: (b 238 # Sequences used in model and found again: 239 # 240 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 241 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 242 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 243 # 244 # Sequences not found previously or not previously below threshold: 245 # 246 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 247 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 248 # 249 250 # If PSI-BLAST, may also have: 251 # 252 # CONVERGED! 253 254 consumer.start_descriptions() 255 256 # Read 'Searching' 257 # This line seems to be missing in BLASTN 2.1.2 (others?) 258 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 259 260 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 261 # If this happens, the handle will yield no more information. 262 if not uhandle.peekline(): 263 raise SyntaxError, "Unexpected end of blast report. " + \ 264 "Looks suspiciously like a PSI-BLAST crash." 265 266 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 267 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 268 # [blastall] ERROR: [000.000] AT1G08320: Blast: 269 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 270 # done 271 # Reported by David Weisman. 272 # Check for these error lines and ignore them for now. Let 273 # the BlastErrorParser deal with them. 274 line = uhandle.peekline() 275 if line.find("ERROR:") != -1 or line.startswith("done"): 276 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 277 read_and_call(uhandle, consumer.noevent, start="done") 278 279 # Check to see if this is PSI-BLAST. 280 # If it is, the 'Searching' line will be followed by: 281 # (version 2.0.10) 282 # Searching............................. 283 # Results from round 2 284 # or (version 2.0.11) 285 # Searching............................. 286 # 287 # 288 # Results from round 2 289 290 # Skip a bunch of blank lines. 291 read_and_call_while(uhandle, consumer.noevent, blank=1) 292 # Check for the results line if it's there. 293 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 294 read_and_call_while(uhandle, consumer.noevent, blank=1) 295 296 # Three things can happen here: 297 # 1. line contains 'Score E' 298 # 2. line contains "No hits found" 299 # 3. no descriptions 300 # The first one begins a bunch of descriptions. The last two 301 # indicates that no descriptions follow, and we should go straight 302 # to the alignments. 303 if not attempt_read_and_call( 304 uhandle, consumer.description_header, 305 has_re=re.compile(r'Score +E')): 306 # Either case 2 or 3. Look for "No hits found". 307 attempt_read_and_call(uhandle, consumer.no_hits, 308 contains='No hits found') 309 read_and_call_while(uhandle, consumer.noevent, blank=1) 310 311 consumer.end_descriptions() 312 # Stop processing. 313 return 314 315 # Read the score header lines 316 read_and_call(uhandle, consumer.description_header, 317 start='Sequences producing') 318 319 # If PSI-BLAST, read the 'Sequences used in model' line. 320 attempt_read_and_call(uhandle, consumer.model_sequences, 321 start='Sequences used in model') 322 read_and_call_while(uhandle, consumer.noevent, blank=1) 323 324 # Read the descriptions and the following blank lines, making 325 # sure that there are descriptions. 326 if not uhandle.peekline().startswith('Sequences not found'): 327 read_and_call_until(uhandle, consumer.description, blank=1) 328 read_and_call_while(uhandle, consumer.noevent, blank=1) 329 330 # If PSI-BLAST, read the 'Sequences not found' line followed 331 # by more descriptions. However, I need to watch out for the 332 # case where there were no sequences not found previously, in 333 # which case there will be no more descriptions. 334 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 335 start='Sequences not found'): 336 # Read the descriptions and the following blank lines. 337 read_and_call_while(uhandle, consumer.noevent, blank=1) 338 l = safe_peekline(uhandle) 339 # Brad -- added check for QUERY. On some PSI-BLAST outputs 340 # there will be a 'Sequences not found' line followed by no 341 # descriptions. Check for this case since the first thing you'll 342 # get is a blank line and then 'QUERY' 343 if not l.startswith('CONVERGED') and l[0] != '>' \ 344 and not l.startswith('QUERY'): 345 read_and_call_until(uhandle, consumer.description, blank=1) 346 read_and_call_while(uhandle, consumer.noevent, blank=1) 347 348 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 349 read_and_call_while(uhandle, consumer.noevent, blank=1) 350 351 consumer.end_descriptions()
352
353 - def _scan_alignments(self, uhandle, consumer):
354 # qblast inserts a helpful line here. 355 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 356 357 # First, check to see if I'm at the database report. 358 line = safe_peekline(uhandle) 359 if line.startswith(' Database'): 360 return 361 elif line[0] == '>': 362 # XXX make a better check here between pairwise and masterslave 363 self._scan_pairwise_alignments(uhandle, consumer) 364 else: 365 # XXX put in a check to make sure I'm in a masterslave alignment 366 self._scan_masterslave_alignment(uhandle, consumer)
367
368 - def _scan_pairwise_alignments(self, uhandle, consumer):
369 while 1: 370 line = safe_peekline(uhandle) 371 if line[0] != '>': 372 break 373 self._scan_one_pairwise_alignment(uhandle, consumer)
374
375 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
376 consumer.start_alignment() 377 378 self._scan_alignment_header(uhandle, consumer) 379 380 # Scan a bunch of score/alignment pairs. 381 while 1: 382 line = safe_peekline(uhandle) 383 if not line.startswith(' Score'): 384 break 385 self._scan_hsp(uhandle, consumer) 386 consumer.end_alignment()
387
388 - def _scan_alignment_header(self, uhandle, consumer):
389 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 390 # stearothermophilus] 391 # Length = 81 392 # 393 # Or, more recently with different white space: 394 # 395 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 396 # gi|15829258|ref|NP_308031.1| threonine synthase 397 # ... 398 # Length=428 399 read_and_call(uhandle, consumer.title, start='>') 400 while 1: 401 line = safe_readline(uhandle) 402 if line.lstrip().startswith('Length =') \ 403 or line.lstrip().startswith('Length='): 404 consumer.length(line) 405 break 406 elif is_blank_line(line): 407 # Check to make sure I haven't missed the Length line 408 raise SyntaxError, "I missed the Length in an alignment header" 409 consumer.title(line) 410 411 # Older versions of BLAST will have a line with some spaces. 412 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 413 if not attempt_read_and_call(uhandle, consumer.noevent, 414 start=' '): 415 read_and_call(uhandle, consumer.noevent, blank=1)
416
417 - def _scan_hsp(self, uhandle, consumer):
418 consumer.start_hsp() 419 self._scan_hsp_header(uhandle, consumer) 420 self._scan_hsp_alignment(uhandle, consumer) 421 consumer.end_hsp()
422
423 - def _scan_hsp_header(self, uhandle, consumer):
424 # Score = 22.7 bits (47), Expect = 2.5 425 # Identities = 10/36 (27%), Positives = 18/36 (49%) 426 # Strand = Plus / Plus 427 # Frame = +3 428 # 429 430 read_and_call(uhandle, consumer.score, start=' Score') 431 read_and_call(uhandle, consumer.identities, start=' Identities') 432 # BLASTN 433 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 434 # BLASTX, TBLASTN, TBLASTX 435 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 436 read_and_call(uhandle, consumer.noevent, blank=1)
437
438 - def _scan_hsp_alignment(self, uhandle, consumer):
439 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 440 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 441 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 442 # 443 # Query: 64 AEKILIKR 71 444 # I +K 445 # Sbjct: 70 PNIIQLKD 77 446 # 447 448 while 1: 449 # Blastn adds an extra line filled with spaces before Query 450 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 451 read_and_call(uhandle, consumer.query, start='Query') 452 read_and_call(uhandle, consumer.align, start=' ') 453 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 454 read_and_call_while(uhandle, consumer.noevent, blank=1) 455 line = safe_peekline(uhandle) 456 # Alignment continues if I see a 'Query' or the spaces for Blastn. 457 if not (line.startswith('Query') or line.startswith(' ')): 458 break
459
460 - def _scan_masterslave_alignment(self, uhandle, consumer):
461 consumer.start_alignment() 462 while 1: 463 line = safe_readline(uhandle) 464 # Check to see whether I'm finished reading the alignment. 465 # This is indicated by 1) database section, 2) next psi-blast 466 # round, which can also be a 'Results from round' if no 467 # searching line is present 468 # patch by chapmanb 469 if line.startswith('Searching') or \ 470 line.startswith('Results from round'): 471 uhandle.saveline(line) 472 break 473 elif line.startswith(' Database'): 474 uhandle.saveline(line) 475 break 476 elif is_blank_line(line): 477 consumer.noevent(line) 478 else: 479 consumer.multalign(line) 480 read_and_call_while(uhandle, consumer.noevent, blank=1) 481 consumer.end_alignment()
482
483 - def _scan_database_report(self, uhandle, consumer):
484 # Database: sdqib40-1.35.seg.fa 485 # Posted date: Nov 1, 1999 4:25 PM 486 # Number of letters in database: 223,339 487 # Number of sequences in database: 1323 488 # 489 # Lambda K H 490 # 0.322 0.133 0.369 491 # 492 # Gapped 493 # Lambda K H 494 # 0.270 0.0470 0.230 495 # 496 ########################################## 497 # Or, more recently Blast 2.2.15 gives less blank lines 498 ########################################## 499 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 500 # environmental samples 501 # Posted date: Dec 12, 2006 5:51 PM 502 # Number of letters in database: 667,088,753 503 # Number of sequences in database: 2,094,974 504 # Lambda K H 505 # 0.319 0.136 0.395 506 # Gapped 507 # Lambda K H 508 # 0.267 0.0410 0.140 509 510 511 consumer.start_database_report() 512 513 # Subset of the database(s) listed below 514 # Number of letters searched: 562,618,960 515 # Number of sequences searched: 228,924 516 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 517 read_and_call(uhandle, consumer.noevent, contains="letters") 518 read_and_call(uhandle, consumer.noevent, contains="sequences") 519 read_and_call(uhandle, consumer.noevent, start=" ") 520 521 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 522 # was missing the "Database" stanza completely. 523 while attempt_read_and_call(uhandle, consumer.database, 524 start=' Database'): 525 # BLAT output ends abruptly here, without any of the other 526 # information. Check to see if this is the case. If so, 527 # then end the database report here gracefully. 528 if not uhandle.peekline(): 529 consumer.end_database_report() 530 return 531 532 # Database can span multiple lines. 533 read_and_call_until(uhandle, consumer.database, start=' Posted') 534 read_and_call(uhandle, consumer.posted_date, start=' Posted') 535 read_and_call(uhandle, consumer.num_letters_in_database, 536 start=' Number of letters') 537 read_and_call(uhandle, consumer.num_sequences_in_database, 538 start=' Number of sequences') 539 #There may not be a line starting with spaces... 540 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 541 542 line = safe_readline(uhandle) 543 uhandle.saveline(line) 544 if line.find('Lambda') != -1: 545 break 546 547 read_and_call(uhandle, consumer.noevent, start='Lambda') 548 read_and_call(uhandle, consumer.ka_params) 549 550 #This blank line is optional: 551 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 552 553 # not BLASTP 554 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 555 # not TBLASTX 556 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 557 read_and_call(uhandle, consumer.ka_params_gap) 558 559 # Blast 2.2.4 can sometimes skip the whole parameter section. 560 # Thus, I need to be careful not to read past the end of the 561 # file. 562 try: 563 read_and_call_while(uhandle, consumer.noevent, blank=1) 564 except SyntaxError, x: 565 if str(x) != "Unexpected end of stream.": 566 raise 567 consumer.end_database_report()
568
569 - def _scan_parameters(self, uhandle, consumer):
570 # Matrix: BLOSUM62 571 # Gap Penalties: Existence: 11, Extension: 1 572 # Number of Hits to DB: 50604 573 # Number of Sequences: 1323 574 # Number of extensions: 1526 575 # Number of successful extensions: 6 576 # Number of sequences better than 10.0: 5 577 # Number of HSP's better than 10.0 without gapping: 5 578 # Number of HSP's successfully gapped in prelim test: 0 579 # Number of HSP's that attempted gapping in prelim test: 1 580 # Number of HSP's gapped (non-prelim): 5 581 # length of query: 140 582 # length of database: 223,339 583 # effective HSP length: 39 584 # effective length of query: 101 585 # effective length of database: 171,742 586 # effective search space: 17345942 587 # effective search space used: 17345942 588 # T: 11 589 # A: 40 590 # X1: 16 ( 7.4 bits) 591 # X2: 38 (14.8 bits) 592 # X3: 64 (24.9 bits) 593 # S1: 41 (21.9 bits) 594 # S2: 42 (20.8 bits) 595 ########################################## 596 # Or, more recently Blast(x) 2.2.15 gives 597 ########################################## 598 # Matrix: BLOSUM62 599 # Gap Penalties: Existence: 11, Extension: 1 600 # Number of Sequences: 4535438 601 # Number of Hits to DB: 2,588,844,100 602 # Number of extensions: 60427286 603 # Number of successful extensions: 126433 604 # Number of sequences better than 2.0: 30 605 # Number of HSP's gapped: 126387 606 # Number of HSP's successfully gapped: 35 607 # Length of query: 291 608 # Length of database: 1,573,298,872 609 # Length adjustment: 130 610 # Effective length of query: 161 611 # Effective length of database: 983,691,932 612 # Effective search space: 158374401052 613 # Effective search space used: 158374401052 614 # Neighboring words threshold: 12 615 # Window for multiple hits: 40 616 # X1: 16 ( 7.3 bits) 617 # X2: 38 (14.6 bits) 618 # X3: 64 (24.7 bits) 619 # S1: 41 (21.7 bits) 620 # S2: 32 (16.9 bits) 621 622 623 # Blast 2.2.4 can sometimes skip the whole parameter section. 624 # Thus, check to make sure that the parameter section really 625 # exists. 626 if not uhandle.peekline(): 627 return 628 629 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 630 # "Number of Sequences" lines. 631 consumer.start_parameters() 632 633 # Matrix line may be missing in BLASTN 2.2.9 634 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 635 # not TBLASTX 636 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 637 638 attempt_read_and_call(uhandle, consumer.num_sequences, 639 start='Number of Sequences') 640 read_and_call(uhandle, consumer.num_hits, 641 start='Number of Hits') 642 attempt_read_and_call(uhandle, consumer.num_sequences, 643 start='Number of Sequences') 644 read_and_call(uhandle, consumer.num_extends, 645 start='Number of extensions') 646 read_and_call(uhandle, consumer.num_good_extends, 647 start='Number of successful') 648 649 read_and_call(uhandle, consumer.num_seqs_better_e, 650 start='Number of sequences') 651 652 # not BLASTN, TBLASTX 653 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 654 start="Number of HSP's better"): 655 # BLASTN 2.2.9 656 if attempt_read_and_call(uhandle, consumer.noevent, 657 start="Number of HSP's gapped:"): 658 read_and_call(uhandle, consumer.noevent, 659 start="Number of HSP's successfully") 660 #This is ommitted in 2.2.15 661 attempt_read_and_call(uhandle, consumer.noevent, 662 start="Number of extra gapped extensions") 663 else: 664 read_and_call(uhandle, consumer.hsps_prelim_gapped, 665 start="Number of HSP's successfully") 666 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 667 start="Number of HSP's that") 668 read_and_call(uhandle, consumer.hsps_gapped, 669 start="Number of HSP's gapped") 670 #e.g. BLASTX 2.2.15 where the "better" line is missing 671 elif attempt_read_and_call(uhandle, consumer.noevent, 672 start="Number of HSP's gapped"): 673 read_and_call(uhandle, consumer.noevent, 674 start="Number of HSP's successfully") 675 676 # not in blastx 2.2.1 677 attempt_read_and_call(uhandle, consumer.query_length, 678 has_re=re.compile(r"[Ll]ength of query")) 679 read_and_call(uhandle, consumer.database_length, 680 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 681 682 # BLASTN 2.2.9 683 attempt_read_and_call(uhandle, consumer.noevent, 684 start="Length adjustment") 685 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 686 start='effective HSP') 687 # Not in blastx 2.2.1 688 attempt_read_and_call( 689 uhandle, consumer.effective_query_length, 690 has_re=re.compile(r'[Ee]ffective length of query')) 691 692 # This is not in BLASTP 2.2.15 693 attempt_read_and_call( 694 uhandle, consumer.effective_database_length, 695 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 696 # Not in blastx 2.2.1, added a ':' to distinguish between 697 # this and the 'effective search space used' line 698 attempt_read_and_call( 699 uhandle, consumer.effective_search_space, 700 has_re=re.compile(r'[Ee]ffective search space:')) 701 # Does not appear in BLASTP 2.0.5 702 attempt_read_and_call( 703 uhandle, consumer.effective_search_space_used, 704 has_re=re.compile(r'[Ee]ffective search space used')) 705 706 # BLASTX, TBLASTN, TBLASTX 707 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 708 709 # not in BLASTN 2.2.9 710 attempt_read_and_call(uhandle, consumer.threshold, start='T') 711 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 712 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 713 714 # not in BLASTX 2.2.15 715 attempt_read_and_call(uhandle, consumer.window_size, start='A') 716 # get this instead: "Window for multiple hits: 40" 717 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 718 719 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 720 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 721 722 # not BLASTN, TBLASTX 723 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 724 start='X3') 725 726 read_and_call(uhandle, consumer.gap_trigger, start='S1') 727 # not in blastx 2.2.1 728 # first we make sure we have additional lines to work with, if 729 # not then the file is done and we don't have a final S2 730 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 731 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 732 733 consumer.end_parameters()
734
735 -class BlastParser(AbstractParser):
736 """Parses BLAST data into a Record.Blast object. 737 738 """
739 - def __init__(self):
740 """__init__(self)""" 741 self._scanner = _Scanner() 742 self._consumer = _BlastConsumer()
743
744 - def parse(self, handle):
745 """parse(self, handle)""" 746 self._scanner.feed(handle, self._consumer) 747 return self._consumer.data
748
749 -class PSIBlastParser(AbstractParser):
750 """Parses BLAST data into a Record.PSIBlast object. 751 752 """
753 - def __init__(self):
754 """__init__(self)""" 755 self._scanner = _Scanner() 756 self._consumer = _PSIBlastConsumer()
757
758 - def parse(self, handle):
759 """parse(self, handle)""" 760 self._scanner.feed(handle, self._consumer) 761 return self._consumer.data
762
763 -class _HeaderConsumer:
764 - def start_header(self):
765 self._header = Record.Header()
766
767 - def version(self, line):
768 c = line.split() 769 self._header.application = c[0] 770 self._header.version = c[1] 771 self._header.date = c[2][1:-1]
772
773 - def reference(self, line):
774 if line.startswith('Reference: '): 775 self._header.reference = line[11:] 776 else: 777 self._header.reference = self._header.reference + line
778
779 - def query_info(self, line):
780 if line.startswith('Query= '): 781 self._header.query = line[7:] 782 elif not line.startswith(' '): # continuation of query_info 783 self._header.query = "%s%s" % (self._header.query, line) 784 else: 785 letters, = _re_search( 786 r"([0-9,]+) letters", line, 787 "I could not find the number of letters in line\n%s" % line) 788 self._header.query_letters = _safe_int(letters)
789
790 - def database_info(self, line):
791 line = line.rstrip() 792 if line.startswith('Database: '): 793 self._header.database = line[10:] 794 elif not line.endswith('total letters'): 795 self._header.database = self._header.database + line.strip() 796 else: 797 sequences, letters =_re_search( 798 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 799 "I could not find the sequences and letters in line\n%s" %line) 800 self._header.database_sequences = _safe_int(sequences) 801 self._header.database_letters = _safe_int(letters)
802
803 - def end_header(self):
804 # Get rid of the trailing newlines 805 self._header.reference = self._header.reference.rstrip() 806 self._header.query = self._header.query.rstrip()
807
808 -class _DescriptionConsumer:
809 - def start_descriptions(self):
810 self._descriptions = [] 811 self._model_sequences = [] 812 self._nonmodel_sequences = [] 813 self._converged = 0 814 self._type = None 815 self._roundnum = None 816 817 self.__has_n = 0 # Does the description line contain an N value?
818
819 - def description_header(self, line):
820 if line.startswith('Sequences producing'): 821 cols = line.split() 822 if cols[-1] == 'N': 823 self.__has_n = 1
824
825 - def description(self, line):
826 dh = self._parse(line) 827 if self._type == 'model': 828 self._model_sequences.append(dh) 829 elif self._type == 'nonmodel': 830 self._nonmodel_sequences.append(dh) 831 else: 832 self._descriptions.append(dh)
833
834 - def model_sequences(self, line):
835 self._type = 'model'
836
837 - def nonmodel_sequences(self, line):
838 self._type = 'nonmodel'
839
840 - def converged(self, line):
841 self._converged = 1
842
843 - def no_hits(self, line):
844 pass
845
846 - def round(self, line):
847 if not line.startswith('Results from round'): 848 raise SyntaxError, "I didn't understand the round line\n%s" % line 849 self._roundnum = _safe_int(line[18:].strip())
850
851 - def end_descriptions(self):
852 pass
853
854 - def _parse(self, description_line):
855 line = description_line # for convenience 856 dh = Record.Description() 857 858 # I need to separate the score and p-value from the title. 859 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 860 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 861 # special cases to handle: 862 # - title must be preserved exactly (including whitespaces) 863 # - score could be equal to e-value (not likely, but what if??) 864 # - sometimes there's an "N" score of '1'. 865 cols = line.split() 866 if len(cols) < 3: 867 raise SyntaxError, \ 868 "Line does not appear to contain description:\n%s" % line 869 if self.__has_n: 870 i = line.rfind(cols[-1]) # find start of N 871 i = line.rfind(cols[-2], 0, i) # find start of p-value 872 i = line.rfind(cols[-3], 0, i) # find start of score 873 else: 874 i = line.rfind(cols[-1]) # find start of p-value 875 i = line.rfind(cols[-2], 0, i) # find start of score 876 if self.__has_n: 877 dh.title, dh.score, dh.e, dh.num_alignments = \ 878 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 879 else: 880 dh.title, dh.score, dh.e, dh.num_alignments = \ 881 line[:i].rstrip(), cols[-2], cols[-1], 1 882 dh.num_alignments = _safe_int(dh.num_alignments) 883 dh.score = _safe_int(dh.score) 884 dh.e = _safe_float(dh.e) 885 return dh
886
887 -class _AlignmentConsumer:
888 # This is a little bit tricky. An alignment can either be a 889 # pairwise alignment or a multiple alignment. Since it's difficult 890 # to know a-priori which one the blast record will contain, I'm going 891 # to make one class that can parse both of them.
892 - def start_alignment(self):
893 self._alignment = Record.Alignment() 894 self._multiple_alignment = Record.MultipleAlignment()
895
896 - def title(self, line):
897 self._alignment.title = "%s%s" % (self._alignment.title, 898 line.lstrip())
899
900 - def length(self, line):
901 #e.g. "Length = 81" or more recently, "Length=428" 902 parts = line.replace(" ","").split("=") 903 assert len(parts)==2, "Unrecognised format length line" 904 self._alignment.length = parts[1] 905 self._alignment.length = _safe_int(self._alignment.length)
906
907 - def multalign(self, line):
908 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 909 if line.startswith('QUERY') or line.startswith('blast_tmp'): 910 # If this is the first line of the multiple alignment, 911 # then I need to figure out how the line is formatted. 912 913 # Format of line is: 914 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 915 try: 916 name, start, seq, end = line.split() 917 except ValueError: 918 raise SyntaxError, "I do not understand the line\n%s" \ 919 % line 920 self._start_index = line.index(start, len(name)) 921 self._seq_index = line.index(seq, 922 self._start_index+len(start)) 923 # subtract 1 for the space 924 self._name_length = self._start_index - 1 925 self._start_length = self._seq_index - self._start_index - 1 926 self._seq_length = line.rfind(end) - self._seq_index - 1 927 928 #self._seq_index = line.index(seq) 929 ## subtract 1 for the space 930 #self._seq_length = line.rfind(end) - self._seq_index - 1 931 #self._start_index = line.index(start) 932 #self._start_length = self._seq_index - self._start_index - 1 933 #self._name_length = self._start_index 934 935 # Extract the information from the line 936 name = line[:self._name_length] 937 name = name.rstrip() 938 start = line[self._start_index:self._start_index+self._start_length] 939 start = start.rstrip() 940 if start: 941 start = _safe_int(start) 942 end = line[self._seq_index+self._seq_length:].rstrip() 943 if end: 944 end = _safe_int(end) 945 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 946 # right pad the sequence with spaces if necessary 947 if len(seq) < self._seq_length: 948 seq = seq + ' '*(self._seq_length-len(seq)) 949 950 # I need to make sure the sequence is aligned correctly with the query. 951 # First, I will find the length of the query. Then, if necessary, 952 # I will pad my current sequence with spaces so that they will line 953 # up correctly. 954 955 # Two possible things can happen: 956 # QUERY 957 # 504 958 # 959 # QUERY 960 # 403 961 # 962 # Sequence 504 will need padding at the end. Since I won't know 963 # this until the end of the alignment, this will be handled in 964 # end_alignment. 965 # Sequence 403 will need padding before being added to the alignment. 966 967 align = self._multiple_alignment.alignment # for convenience 968 align.append((name, start, seq, end))
969 970 # This is old code that tried to line up all the sequences 971 # in a multiple alignment by using the sequence title's as 972 # identifiers. The problem with this is that BLAST assigns 973 # different HSP's from the same sequence the same id. Thus, 974 # in one alignment block, there may be multiple sequences with 975 # the same id. I'm not sure how to handle this, so I'm not 976 # going to. 977 978 # # If the sequence is the query, then just add it. 979 # if name == 'QUERY': 980 # if len(align) == 0: 981 # align.append((name, start, seq)) 982 # else: 983 # aname, astart, aseq = align[0] 984 # if name != aname: 985 # raise SyntaxError, "Query is not the first sequence" 986 # aseq = aseq + seq 987 # align[0] = aname, astart, aseq 988 # else: 989 # if len(align) == 0: 990 # raise SyntaxError, "I could not find the query sequence" 991 # qname, qstart, qseq = align[0] 992 # 993 # # Now find my sequence in the multiple alignment. 994 # for i in range(1, len(align)): 995 # aname, astart, aseq = align[i] 996 # if name == aname: 997 # index = i 998 # break 999 # else: 1000 # # If I couldn't find it, then add a new one. 1001 # align.append((None, None, None)) 1002 # index = len(align)-1 1003 # # Make sure to left-pad it. 1004 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1005 # 1006 # if len(qseq) != len(aseq) + len(seq): 1007 # # If my sequences are shorter than the query sequence, 1008 # # then I will need to pad some spaces to make them line up. 1009 # # Since I've already right padded seq, that means aseq 1010 # # must be too short. 1011 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1012 # aseq = aseq + seq 1013 # if astart is None: 1014 # astart = start 1015 # align[index] = aname, astart, aseq 1016
1017 - def end_alignment(self):
1018 # Remove trailing newlines 1019 if self._alignment: 1020 self._alignment.title = self._alignment.title.rstrip() 1021 1022 # This code is also obsolete. See note above. 1023 # If there's a multiple alignment, I will need to make sure 1024 # all the sequences are aligned. That is, I may need to 1025 # right-pad the sequences. 1026 # if self._multiple_alignment is not None: 1027 # align = self._multiple_alignment.alignment 1028 # seqlen = None 1029 # for i in range(len(align)): 1030 # name, start, seq = align[i] 1031 # if seqlen is None: 1032 # seqlen = len(seq) 1033 # else: 1034 # if len(seq) < seqlen: 1035 # seq = seq + ' '*(seqlen - len(seq)) 1036 # align[i] = name, start, seq 1037 # elif len(seq) > seqlen: 1038 # raise SyntaxError, \ 1039 # "Sequence %s is longer than the query" % name 1040 1041 # Clean up some variables, if they exist. 1042 try: 1043 del self._seq_index 1044 del self._seq_length 1045 del self._start_index 1046 del self._start_length 1047 del self._name_length 1048 except AttributeError: 1049 pass
1050
1051 -class _HSPConsumer:
1052 - def start_hsp(self):
1053 self._hsp = Record.HSP()
1054
1055 - def score(self, line):
1056 self._hsp.bits, self._hsp.score = _re_search( 1057 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1058 "I could not find the score in line\n%s" % line) 1059 self._hsp.score = _safe_float(self._hsp.score) 1060 self._hsp.bits = _safe_float(self._hsp.bits) 1061 1062 x, y = _re_search( 1063 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1064 "I could not find the expect in line\n%s" % line) 1065 if x: 1066 self._hsp.num_alignments = _safe_int(x) 1067 else: 1068 self._hsp.num_alignments = 1 1069 self._hsp.expect = _safe_float(y)
1070
1071 - def identities(self, line):
1072 x, y = _re_search( 1073 r"Identities = (\d+)\/(\d+)", line, 1074 "I could not find the identities in line\n%s" % line) 1075 self._hsp.identities = _safe_int(x), _safe_int(y) 1076 1077 if line.find('Positives') != -1: 1078 x, y = _re_search( 1079 r"Positives = (\d+)\/(\d+)", line, 1080 "I could not find the positives in line\n%s" % line) 1081 self._hsp.positives = _safe_int(x), _safe_int(y) 1082 1083 if line.find('Gaps') != -1: 1084 x, y = _re_search( 1085 r"Gaps = (\d+)\/(\d+)", line, 1086 "I could not find the gaps in line\n%s" % line) 1087 self._hsp.gaps = _safe_int(x), _safe_int(y)
1088 1089
1090 - def strand(self, line):
1091 self._hsp.strand = _re_search( 1092 r"Strand = (\w+) / (\w+)", line, 1093 "I could not find the strand in line\n%s" % line)
1094
1095 - def frame(self, line):
1096 # Frame can be in formats: 1097 # Frame = +1 1098 # Frame = +2 / +2 1099 if line.find('/') != -1: 1100 self._hsp.frame = _re_search( 1101 r"Frame = ([-+][123]) / ([-+][123])", line, 1102 "I could not find the frame in line\n%s" % line) 1103 else: 1104 self._hsp.frame = _re_search( 1105 r"Frame = ([-+][123])", line, 1106 "I could not find the frame in line\n%s" % line)
1107 1108 # Match a space, if one is available. Masahir Ishikawa found a 1109 # case where there's no space between the start and the sequence: 1110 # Query: 100tt 101 1111 # line below modified by Yair Benita, Sep 2004 1112 # Note that the colon is not always present. 2006 1113 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1114 - def query(self, line):
1115 m = self._query_re.search(line) 1116 if m is None: 1117 raise SyntaxError, "I could not find the query in line\n%s" % line 1118 1119 # line below modified by Yair Benita, Sep 2004. 1120 # added the end attribute for the query 1121 colon, start, seq, end = m.groups() 1122 self._hsp.query = self._hsp.query + seq 1123 if self._hsp.query_start is None: 1124 self._hsp.query_start = _safe_int(start) 1125 1126 # line below added by Yair Benita, Sep 2004. 1127 # added the end attribute for the query 1128 self._hsp.query_end = _safe_int(end) 1129 1130 #Get index for sequence start (regular expression element 3) 1131 self._query_start_index = m.start(3) 1132 self._query_len = len(seq)
1133
1134 - def align(self, line):
1135 seq = line[self._query_start_index:].rstrip() 1136 if len(seq) < self._query_len: 1137 # Make sure the alignment is the same length as the query 1138 seq = seq + ' ' * (self._query_len-len(seq)) 1139 elif len(seq) < self._query_len: 1140 raise SyntaxError, "Match is longer than the query in line\n%s" % \ 1141 line 1142 self._hsp.match = self._hsp.match + seq
1143 1144 # To match how we do the query, cache the regular expression. 1145 # Note that the colon is not always present. 1146 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1147 - def sbjct(self, line):
1148 m = self._sbjct_re.search(line) 1149 if m is None: 1150 raise SyntaxError, "I could not find the sbjct in line\n%s" % line 1151 colon, start, seq, end = m.groups() 1152 #mikep 26/9/00 1153 #On occasion, there is a blast hit with no subject match 1154 #so far, it only occurs with 1-line short "matches" 1155 #I have decided to let these pass as they appear 1156 if not seq.strip(): 1157 seq = ' ' * self._query_len 1158 self._hsp.sbjct = self._hsp.sbjct + seq 1159 if self._hsp.sbjct_start is None: 1160 self._hsp.sbjct_start = _safe_int(start) 1161 1162 self._hsp.sbjct_end = _safe_int(end) 1163 if len(seq) != self._query_len: 1164 raise SyntaxError, \ 1165 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1166 % line 1167 1168 del self._query_start_index # clean up unused variables 1169 del self._query_len
1170
1171 - def end_hsp(self):
1172 pass
1173
1174 -class _DatabaseReportConsumer:
1175
1176 - def start_database_report(self):
1177 self._dr = Record.DatabaseReport()
1178
1179 - def database(self, line):
1180 m = re.search(r"Database: (.+)$", line) 1181 if m: 1182 self._dr.database_name.append(m.group(1)) 1183 elif self._dr.database_name: 1184 # This must be a continuation of the previous name. 1185 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1186 line.strip())
1187
1188 - def posted_date(self, line):
1189 self._dr.posted_date.append(_re_search( 1190 r"Posted date:\s*(.+)$", line, 1191 "I could not find the posted date in line\n%s" % line))
1192
1193 - def num_letters_in_database(self, line):
1194 letters, = _get_cols( 1195 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1196 self._dr.num_letters_in_database.append(_safe_int(letters))
1197
1198 - def num_sequences_in_database(self, line):
1199 sequences, = _get_cols( 1200 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1201 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1202
1203 - def ka_params(self, line):
1204 x = line.split() 1205 self._dr.ka_params = map(_safe_float, x)
1206
1207 - def gapped(self, line):
1208 self._dr.gapped = 1
1209
1210 - def ka_params_gap(self, line):
1211 x = line.split() 1212 self._dr.ka_params_gap = map(_safe_float, x)
1213
1214 - def end_database_report(self):
1215 pass
1216
1217 -class _ParametersConsumer:
1218 - def start_parameters(self):
1219 self._params = Record.Parameters()
1220
1221 - def matrix(self, line):
1222 self._params.matrix = line[8:].rstrip()
1223
1224 - def gap_penalties(self, line):
1225 x = _get_cols( 1226 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1227 self._params.gap_penalties = map(_safe_float, x)
1228
1229 - def num_hits(self, line):
1230 if line.find('1st pass') != -1: 1231 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1232 self._params.num_hits = _safe_int(x) 1233 else: 1234 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1235 self._params.num_hits = _safe_int(x)
1236
1237 - def num_sequences(self, line):
1238 if line.find('1st pass') != -1: 1239 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1240 self._params.num_sequences = _safe_int(x) 1241 else: 1242 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1243 self._params.num_sequences = _safe_int(x)
1244
1245 - def num_extends(self, line):
1246 if line.find('1st pass') != -1: 1247 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1248 self._params.num_extends = _safe_int(x) 1249 else: 1250 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1251 self._params.num_extends = _safe_int(x)
1252
1253 - def num_good_extends(self, line):
1254 if line.find('1st pass') != -1: 1255 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1256 self._params.num_good_extends = _safe_int(x) 1257 else: 1258 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1259 self._params.num_good_extends = _safe_int(x)
1260
1261 - def num_seqs_better_e(self, line):
1262 self._params.num_seqs_better_e, = _get_cols( 1263 line, (-1,), ncols=7, expected={2:"sequences"}) 1264 self._params.num_seqs_better_e = _safe_int( 1265 self._params.num_seqs_better_e)
1266
1267 - def hsps_no_gap(self, line):
1268 self._params.hsps_no_gap, = _get_cols( 1269 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1270 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1271
1272 - def hsps_prelim_gapped(self, line):
1273 self._params.hsps_prelim_gapped, = _get_cols( 1274 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1275 self._params.hsps_prelim_gapped = _safe_int( 1276 self._params.hsps_prelim_gapped)
1277
1278 - def hsps_prelim_gapped_attempted(self, line):
1279 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1280 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1281 self._params.hsps_prelim_gapped_attempted = _safe_int( 1282 self._params.hsps_prelim_gapped_attempted)
1283
1284 - def hsps_gapped(self, line):
1285 self._params.hsps_gapped, = _get_cols( 1286 line, (-1,), ncols=6, expected={3:"gapped"}) 1287 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1288
1289 - def query_length(self, line):
1290 self._params.query_length, = _get_cols( 1291 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1292 self._params.query_length = _safe_int(self._params.query_length)
1293
1294 - def database_length(self, line):
1295 self._params.database_length, = _get_cols( 1296 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1297 self._params.database_length = _safe_int(self._params.database_length)
1298
1299 - def effective_hsp_length(self, line):
1300 self._params.effective_hsp_length, = _get_cols( 1301 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1302 self._params.effective_hsp_length = _safe_int( 1303 self._params.effective_hsp_length)
1304
1305 - def effective_query_length(self, line):
1306 self._params.effective_query_length, = _get_cols( 1307 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1308 self._params.effective_query_length = _safe_int( 1309 self._params.effective_query_length)
1310
1311 - def effective_database_length(self, line):
1312 self._params.effective_database_length, = _get_cols( 1313 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1314 self._params.effective_database_length = _safe_int( 1315 self._params.effective_database_length)
1316
1317 - def effective_search_space(self, line):
1318 self._params.effective_search_space, = _get_cols( 1319 line, (-1,), ncols=4, expected={1:"search"}) 1320 self._params.effective_search_space = _safe_int( 1321 self._params.effective_search_space)
1322
1323 - def effective_search_space_used(self, line):
1324 self._params.effective_search_space_used, = _get_cols( 1325 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1326 self._params.effective_search_space_used = _safe_int( 1327 self._params.effective_search_space_used)
1328
1329 - def frameshift(self, line):
1330 self._params.frameshift = _get_cols( 1331 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1332
1333 - def threshold(self, line):
1334 if line[:2] == "T:" : 1335 #Assume its an old stlye line like "T: 123" 1336 self._params.threshold, = _get_cols( 1337 line, (1,), ncols=2, expected={0:"T:"}) 1338 elif line[:28] == "Neighboring words threshold:" : 1339 self._params.threshold, = _get_cols( 1340 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1341 else : 1342 raise SyntaxError("Unrecognised threshold line:\n%s" % line) 1343 self._params.threshold = _safe_int(self._params.threshold)
1344
1345 - def window_size(self, line):
1346 if line[:2] == "A:" : 1347 self._params.window_size, = _get_cols( 1348 line, (1,), ncols=2, expected={0:"A:"}) 1349 elif line[:25] == "Window for multiple hits:" : 1350 self._params.window_size, = _get_cols( 1351 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1352 else : 1353 raise SyntaxError("Unrecognised window size line:\n%s" % line) 1354 self._params.window_size = _safe_int(self._params.window_size)
1355
1356 - def dropoff_1st_pass(self, line):
1357 score, bits = _re_search( 1358 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1359 "I could not find the dropoff in line\n%s" % line) 1360 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1361
1362 - def gap_x_dropoff(self, line):
1363 score, bits = _re_search( 1364 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1365 "I could not find the gap dropoff in line\n%s" % line) 1366 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1367
1368 - def gap_x_dropoff_final(self, line):
1369 score, bits = _re_search( 1370 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1371 "I could not find the gap dropoff final in line\n%s" % line) 1372 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1373
1374 - def gap_trigger(self, line):
1375 score, bits = _re_search( 1376 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1377 "I could not find the gap trigger in line\n%s" % line) 1378 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1379
1380 - def blast_cutoff(self, line):
1381 score, bits = _re_search( 1382 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1383 "I could not find the blast cutoff in line\n%s" % line) 1384 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1385
1386 - def end_parameters(self):
1387 pass
1388 1389
1390 -class _BlastConsumer(AbstractConsumer, 1391 _HeaderConsumer, 1392 _DescriptionConsumer, 1393 _AlignmentConsumer, 1394 _HSPConsumer, 1395 _DatabaseReportConsumer, 1396 _ParametersConsumer 1397 ):
1398 # This Consumer is inherits from many other consumer classes that handle 1399 # the actual dirty work. An alternate way to do it is to create objects 1400 # of those classes and then delegate the parsing tasks to them in a 1401 # decorator-type pattern. The disadvantage of that is that the method 1402 # names will need to be resolved in this classes. However, using 1403 # a decorator will retain more control in this class (which may or 1404 # may not be a bad thing). In addition, having each sub-consumer as 1405 # its own object prevents this object's dictionary from being cluttered 1406 # with members and reduces the chance of member collisions.
1407 - def __init__(self):
1408 self.data = None
1409
1410 - def round(self, line):
1411 # Make sure nobody's trying to pass me PSI-BLAST data! 1412 raise ValueError, \ 1413 "This consumer doesn't handle PSI-BLAST data"
1414
1415 - def start_header(self):
1416 self.data = Record.Blast() 1417 _HeaderConsumer.start_header(self)
1418
1419 - def end_header(self):
1420 _HeaderConsumer.end_header(self) 1421 self.data.__dict__.update(self._header.__dict__)
1422
1423 - def end_descriptions(self):
1424 self.data.descriptions = self._descriptions
1425
1426 - def end_alignment(self):
1427 _AlignmentConsumer.end_alignment(self) 1428 if self._alignment.hsps: 1429 self.data.alignments.append(self._alignment) 1430 if self._multiple_alignment.alignment: 1431 self.data.multiple_alignment = self._multiple_alignment
1432
1433 - def end_hsp(self):
1434 _HSPConsumer.end_hsp(self) 1435 try: 1436 self._alignment.hsps.append(self._hsp) 1437 except AttributeError: 1438 raise SyntaxError, "Found an HSP before an alignment"
1439
1440 - def end_database_report(self):
1441 _DatabaseReportConsumer.end_database_report(self) 1442 self.data.__dict__.update(self._dr.__dict__)
1443
1444 - def end_parameters(self):
1445 _ParametersConsumer.end_parameters(self) 1446 self.data.__dict__.update(self._params.__dict__)
1447
1448 -class _PSIBlastConsumer(AbstractConsumer, 1449 _HeaderConsumer, 1450 _DescriptionConsumer, 1451 _AlignmentConsumer, 1452 _HSPConsumer, 1453 _DatabaseReportConsumer, 1454 _ParametersConsumer 1455 ):
1456 - def __init__(self):
1457 self.data = None
1458
1459 - def start_header(self):
1460 self.data = Record.PSIBlast() 1461 _HeaderConsumer.start_header(self)
1462
1463 - def end_header(self):
1464 _HeaderConsumer.end_header(self) 1465 self.data.__dict__.update(self._header.__dict__)
1466
1467 - def start_descriptions(self):
1468 self._round = Record.Round() 1469 self.data.rounds.append(self._round) 1470 _DescriptionConsumer.start_descriptions(self)
1471
1472 - def end_descriptions(self):
1473 _DescriptionConsumer.end_descriptions(self) 1474 self._round.number = self._roundnum 1475 if self._descriptions: 1476 self._round.new_seqs.extend(self._descriptions) 1477 self._round.reused_seqs.extend(self._model_sequences) 1478 self._round.new_seqs.extend(self._nonmodel_sequences) 1479 if self._converged: 1480 self.data.converged = 1
1481
1482 - def end_alignment(self):
1483 _AlignmentConsumer.end_alignment(self) 1484 if self._alignment.hsps: 1485 self._round.alignments.append(self._alignment) 1486 if self._multiple_alignment: 1487 self._round.multiple_alignment = self._multiple_alignment
1488
1489 - def end_hsp(self):
1490 _HSPConsumer.end_hsp(self) 1491 try: 1492 self._alignment.hsps.append(self._hsp) 1493 except AttributeError: 1494 raise SyntaxError, "Found an HSP before an alignment"
1495
1496 - def end_database_report(self):
1497 _DatabaseReportConsumer.end_database_report(self) 1498 self.data.__dict__.update(self._dr.__dict__)
1499
1500 - def end_parameters(self):
1501 _ParametersConsumer.end_parameters(self) 1502 self.data.__dict__.update(self._params.__dict__)
1503
1504 -class Iterator:
1505 """Iterates over a file of multiple BLAST results. 1506 1507 Methods: 1508 next Return the next record from the stream, or None. 1509 1510 """
1511 - def __init__(self, handle, parser=None):
1512 """__init__(self, handle, parser=None) 1513 1514 Create a new iterator. handle is a file-like object. parser 1515 is an optional Parser object to change the results into another form. 1516 If set to None, then the raw contents of the file will be returned. 1517 1518 """ 1519 try: 1520 handle.readline 1521 except AttributeError: 1522 raise ValueError( 1523 "I expected a file handle or file-like object, got %s" 1524 % type(handle)) 1525 self._uhandle = File.UndoHandle(handle) 1526 self._parser = parser
1527
1528 - def next(self):
1529 """next(self) -> object 1530 1531 Return the next Blast record from the file. If no more records, 1532 return None. 1533 1534 """ 1535 lines = [] 1536 while 1: 1537 line = self._uhandle.readline() 1538 if not line: 1539 break 1540 # If I've reached the next one, then put the line back and stop. 1541 if lines and (line.startswith('BLAST') 1542 or line.startswith('BLAST', 1) 1543 or line.startswith('<?xml ')): 1544 self._uhandle.saveline(line) 1545 break 1546 lines.append(line) 1547 1548 if not lines: 1549 return None 1550 1551 data = ''.join(lines) 1552 if self._parser is not None: 1553 return self._parser.parse(File.StringHandle(data)) 1554 return data
1555
1556 - def __iter__(self):
1557 return iter(self.next, None)
1558
1559 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1560 """blastall(blastcmd, program, database, infile, align_view='7', **keywds) 1561 -> read, error Undohandles 1562 1563 Execute and retrieve data from blastall. blastcmd is the command 1564 used to launch the 'blastall' executable. program is the blast program 1565 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1566 to search against. infile is the path to the file containing 1567 the sequence to search with. 1568 1569 You may pass more parameters to **keywds to change the behavior of 1570 the search. Otherwise, optional values will be chosen by blastall. 1571 The Blast output is by default in XML format. Use the align_view keyword 1572 for output in a different format. 1573 1574 Scoring 1575 matrix Matrix to use. 1576 gap_open Gap open penalty. 1577 gap_extend Gap extension penalty. 1578 nuc_match Nucleotide match reward. (BLASTN) 1579 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1580 query_genetic_code Genetic code for Query. 1581 db_genetic_code Genetic code for database. (TBLAST[NX]) 1582 1583 Algorithm 1584 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1585 expectation Expectation value cutoff. 1586 wordsize Word size. 1587 strands Query strands to search against database.([T]BLAST[NX]) 1588 keep_hits Number of best hits from a region to keep. 1589 xdrop Dropoff value (bits) for gapped alignments. 1590 hit_extend Threshold for extending hits. 1591 region_length Length of region used to judge hits. 1592 db_length Effective database length. 1593 search_length Effective length of search space. 1594 1595 Processing 1596 filter Filter query sequence? T/F 1597 believe_query Believe the query defline. T/F 1598 restrict_gi Restrict search to these GI's. 1599 nprocessors Number of processors to use. 1600 oldengine Force use of old engine T/F 1601 1602 Formatting 1603 html Produce HTML output? T/F 1604 descriptions Number of one-line descriptions. 1605 alignments Number of alignments. 1606 align_view Alignment view. Integer 0-11, 1607 passed as a string or integer. 1608 show_gi Show GI's in deflines? T/F 1609 seqalign_file seqalign file to output. 1610 1611 """ 1612 att2param = { 1613 'matrix' : '-M', 1614 'gap_open' : '-G', 1615 'gap_extend' : '-E', 1616 'nuc_match' : '-r', 1617 'nuc_mismatch' : '-q', 1618 'query_genetic_code' : '-Q', 1619 'db_genetic_code' : '-D', 1620 1621 'gapped' : '-g', 1622 'expectation' : '-e', 1623 'wordsize' : '-W', 1624 'strands' : '-S', 1625 'keep_hits' : '-K', 1626 'xdrop' : '-X', 1627 'hit_extend' : '-f', 1628 'region_length' : '-L', 1629 'db_length' : '-z', 1630 'search_length' : '-Y', 1631 1632 'program' : '-p', 1633 'database' : '-d', 1634 'infile' : '-i', 1635 'filter' : '-F', 1636 'believe_query' : '-J', 1637 'restrict_gi' : '-l', 1638 'nprocessors' : '-a', 1639 'oldengine' : '-V', 1640 1641 'html' : '-T', 1642 'descriptions' : '-v', 1643 'alignments' : '-b', 1644 'align_view' : '-m', 1645 'show_gi' : '-I', 1646 'seqalign_file' : '-O' 1647 } 1648 1649 if not os.path.exists(blastcmd): 1650 raise ValueError, "blastall does not exist at %s" % blastcmd 1651 1652 params = [] 1653 1654 params.extend([att2param['program'], program]) 1655 params.extend([att2param['database'], database]) 1656 params.extend([att2param['infile'], infile]) 1657 params.extend([att2param['align_view'], str(align_view)]) 1658 1659 for attr in keywds.keys(): 1660 params.extend([att2param[attr], str(keywds[attr])]) 1661 1662 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1663 w.close() 1664 return File.UndoHandle(r), File.UndoHandle(e)
1665 1666
1667 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1668 """blastpgp(blastcmd, database, infile, align_view='7', **keywds) -> 1669 read, error Undohandles 1670 1671 Execute and retrieve data from blastpgp. blastcmd is the command 1672 used to launch the 'blastpgp' executable. database is the path to the 1673 database to search against. infile is the path to the file containing 1674 the sequence to search with. 1675 1676 You may pass more parameters to **keywds to change the behavior of 1677 the search. Otherwise, optional values will be chosen by blastpgp. 1678 The Blast output is by default in XML format. Use the align_view keyword 1679 for output in a different format. 1680 1681 Scoring 1682 matrix Matrix to use. 1683 gap_open Gap open penalty. 1684 gap_extend Gap extension penalty. 1685 window_size Multiple hits window size. 1686 npasses Number of passes. 1687 passes Hits/passes. Integer 0-2. 1688 1689 Algorithm 1690 gapped Whether to do a gapped alignment. T/F 1691 expectation Expectation value cutoff. 1692 wordsize Word size. 1693 keep_hits Number of beset hits from a region to keep. 1694 xdrop Dropoff value (bits) for gapped alignments. 1695 hit_extend Threshold for extending hits. 1696 region_length Length of region used to judge hits. 1697 db_length Effective database length. 1698 search_length Effective length of search space. 1699 nbits_gapping Number of bits to trigger gapping. 1700 pseudocounts Pseudocounts constants for multiple passes. 1701 xdrop_final X dropoff for final gapped alignment. 1702 xdrop_extension Dropoff for blast extensions. 1703 model_threshold E-value threshold to include in multipass model. 1704 required_start Start of required region in query. 1705 required_end End of required region in query. 1706 1707 Processing 1708 XXX should document default values 1709 program The blast program to use. (PHI-BLAST) 1710 filter Filter query sequence with SEG? T/F 1711 believe_query Believe the query defline? T/F 1712 nprocessors Number of processors to use. 1713 1714 Formatting 1715 html Produce HTML output? T/F 1716 descriptions Number of one-line descriptions. 1717 alignments Number of alignments. 1718 align_view Alignment view. Integer 0-11, 1719 passed as a string or integer. 1720 show_gi Show GI's in deflines? T/F 1721 seqalign_file seqalign file to output. 1722 align_outfile Output file for alignment. 1723 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1724 restart_infile Input file for PSI-BLAST restart. 1725 hit_infile Hit file for PHI-BLAST. 1726 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1727 align_infile Input alignment file for PSI-BLAST restart. 1728 1729 """ 1730 att2param = { 1731 'matrix' : '-M', 1732 'gap_open' : '-G', 1733 'gap_extend' : '-E', 1734 'window_size' : '-A', 1735 'npasses' : '-j', 1736 'passes' : '-P', 1737 1738 'gapped' : '-g', 1739 'expectation' : '-e', 1740 'wordsize' : '-W', 1741 'keep_hits' : '-K', 1742 'xdrop' : '-X', 1743 'hit_extend' : '-f', 1744 'region_length' : '-L', 1745 'db_length' : '-Z', 1746 'search_length' : '-Y', 1747 'nbits_gapping' : '-N', 1748 'pseudocounts' : '-c', 1749 'xdrop_final' : '-Z', 1750 'xdrop_extension' : '-y', 1751 'model_threshold' : '-h', 1752 'required_start' : '-S', 1753 'required_end' : '-H', 1754 1755 'program' : '-p', 1756 'database' : '-d', 1757 'infile' : '-i', 1758 'filter' : '-F', 1759 'believe_query' : '-J', 1760 'nprocessors' : '-a', 1761 1762 'html' : '-T', 1763 'descriptions' : '-v', 1764 'alignments' : '-b', 1765 'align_view' : '-m', 1766 'show_gi' : '-I', 1767 'seqalign_file' : '-O', 1768 'align_outfile' : '-o', 1769 'checkpoint_outfile' : '-C', 1770 'restart_infile' : '-R', 1771 'hit_infile' : '-k', 1772 'matrix_outfile' : '-Q', 1773 'align_infile' : '-B' 1774 } 1775 1776 if not os.path.exists(blastcmd): 1777 raise ValueError, "blastpgp does not exist at %s" % blastcmd 1778 1779 params = [] 1780 1781 params.extend([att2param['database'], database]) 1782 params.extend([att2param['infile'], infile]) 1783 params.extend([att2param['align_view'], str(align_view)]) 1784 1785 for attr in keywds.keys(): 1786 params.extend([att2param[attr], str(keywds[attr])]) 1787 1788 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1789 w.close() 1790 return File.UndoHandle(r), File.UndoHandle(e)
1791 1792
1793 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1794 """rpsblast(blastcmd, database, infile, **keywds) -> 1795 read, error Undohandles 1796 1797 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1798 command used to launch the 'rpsblast' executable. database is the path 1799 to the database to search against. infile is the path to the file 1800 containing the sequence to search with. 1801 1802 You may pass more parameters to **keywds to change the behavior of 1803 the search. Otherwise, optional values will be chosen by rpsblast. 1804 1805 Please note that this function will give XML output by default, by 1806 setting align_view to seven (i.e. command line option -m 7). 1807 You should use the NCBIXML.BlastParser() to read the resulting output. 1808 This is because NCBIStandalone.BlastParser() does not understand the 1809 plain text output format from rpsblast. 1810 1811 WARNING - The following text and associated parameter handling has not 1812 received extensive testing. Please report any errors we might have made... 1813 1814 Algorithm/Scoring 1815 gapped Whether to do a gapped alignment. T/F 1816 multihit 0 for multiple hit (default), 1 for single hit 1817 expectation Expectation value cutoff. 1818 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1819 0 in 'start' refers to the beginning of the sequence 1820 0 in 'stop' refers to the end of the sequence 1821 Default = 0,0 1822 xdrop Dropoff value (bits) for gapped alignments. 1823 xdrop_final X dropoff for final gapped alignment (in bits). 1824 xdrop_extension Dropoff for blast extensions (in bits). 1825 search_length Effective length of search space. 1826 nbits_gapping Number of bits to trigger gapping. 1827 protein Query sequence is protein. T/F 1828 db_length Effective database length. 1829 1830 Processing 1831 filter Filter query sequence with SEG? T/F 1832 case_filter Use lower case filtering of FASTA sequence T/F, default F 1833 believe_query Believe the query defline. T/F 1834 nprocessors Number of processors to use. 1835 logfile Name of log file to use, default rpsblast.log 1836 1837 Formatting 1838 html Produce HTML output? T/F 1839 descriptions Number of one-line descriptions. 1840 alignments Number of alignments. 1841 align_view Alignment view. Integer 0-11, 1842 passed as a string or integer. 1843 show_gi Show GI's in deflines? T/F 1844 seqalign_file seqalign file to output. 1845 align_outfile Output file for alignment. 1846 1847 """ 1848 att2param = { 1849 'multihit' : '-P', 1850 'gapped' : '-g', 1851 'expectation' : '-e', 1852 'range_restriction' : '-L', 1853 'xdrop' : '-X', 1854 'xdrop_final' : '-Z', 1855 'xdrop_extension' : '-y', 1856 'search_length' : '-Y', 1857 'nbits_gapping' : '-N', 1858 'protein' : '-p', 1859 'db_length' : '-z', 1860 1861 'database' : '-d', 1862 'infile' : '-i', 1863 'filter' : '-F', 1864 'case_filter' : '-U', 1865 'believe_query' : '-J', 1866 'nprocessors' : '-a', 1867 'logfile' : '-l', 1868 1869 'html' : '-T', 1870 'descriptions' : '-v', 1871 'alignments' : '-b', 1872 'align_view' : '-m', 1873 'show_gi' : '-I', 1874 'seqalign_file' : '-O', 1875 'align_outfile' : '-o' 1876 } 1877 1878 if not os.path.exists(blastcmd): 1879 raise ValueError, "rpsblast does not exist at %s" % blastcmd 1880 1881 params = [] 1882 1883 params.extend([att2param['database'], database]) 1884 params.extend([att2param['infile'], infile]) 1885 params.extend([att2param['align_view'], str(align_view)]) 1886 1887 for attr in keywds.keys(): 1888 params.extend([att2param[attr], str(keywds[attr])]) 1889 1890 w, r, e = os.popen3(" ".join([blastcmd] + params)) 1891 w.close() 1892 return File.UndoHandle(r), File.UndoHandle(e)
1893
1894 -def _re_search(regex, line, error_msg):
1895 m = re.search(regex, line) 1896 if not m: 1897 raise SyntaxError, error_msg 1898 return m.groups()
1899
1900 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1901 cols = line.split() 1902 1903 # Check to make sure number of columns is correct 1904 if ncols is not None and len(cols) != ncols: 1905 raise SyntaxError, "I expected %d columns (got %d) in line\n%s" % \ 1906 (ncols, len(cols), line) 1907 1908 # Check to make sure columns contain the correct data 1909 for k in expected.keys(): 1910 if cols[k] != expected[k]: 1911 raise SyntaxError, "I expected '%s' in column %d in line\n%s" % ( 1912 expected[k], k, line) 1913 1914 # Construct the answer tuple 1915 results = [] 1916 for c in cols_to_get: 1917 results.append(cols[c]) 1918 return tuple(results)
1919
1920 -def _safe_int(str):
1921 try: 1922 return int(str) 1923 except ValueError: 1924 # Something went wrong. Try to clean up the string. 1925 # Remove all commas from the string 1926 str = str.replace(',', '') 1927 try: 1928 # try again. 1929 return int(str) 1930 except ValueError: 1931 pass 1932 # If it fails again, maybe it's too long? 1933 # XXX why converting to float? 1934 return long(float(str))
1935
1936 -def _safe_float(str):
1937 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1938 # float('e-172') does not produce an error on his platform. Thus, 1939 # we need to check the string for this condition. 1940 1941 # Sometimes BLAST leaves of the '1' in front of an exponent. 1942 if str and str[0] in ['E', 'e']: 1943 str = '1' + str 1944 try: 1945 return float(str) 1946 except ValueError: 1947 # Remove all commas from the string 1948 str = str.replace(',', '') 1949 # try again. 1950 return float(str)
1951
1952 -class _BlastErrorConsumer(_BlastConsumer):
1953 - def __init__(self):
1955 - def noevent(self, line):
1956 if line.find("Query must be at least wordsize") != -1: 1957 raise ShortQueryBlastError, "Query must be at least wordsize" 1958 # Now pass the line back up to the superclass. 1959 method = getattr(_BlastConsumer, 'noevent', 1960 _BlastConsumer.__getattr__(self, 'noevent')) 1961 method(line)
1962
1963 -class BlastErrorParser(AbstractParser):
1964 """Attempt to catch and diagnose BLAST errors while parsing. 1965 1966 This utilizes the BlastParser module but adds an additional layer 1967 of complexity on top of it by attempting to diagnose SyntaxError's 1968 that may actually indicate problems during BLAST parsing. 1969 1970 Current BLAST problems this detects are: 1971 o LowQualityBlastError - When BLASTing really low quality sequences 1972 (ie. some GenBank entries which are just short streches of a single 1973 nucleotide), BLAST will report an error with the sequence and be 1974 unable to search with this. This will lead to a badly formatted 1975 BLAST report that the parsers choke on. The parser will convert the 1976 SyntaxError to a LowQualityBlastError and attempt to provide useful 1977 information. 1978 1979 """
1980 - def __init__(self, bad_report_handle = None):
1981 """Initialize a parser that tries to catch BlastErrors. 1982 1983 Arguments: 1984 o bad_report_handle - An optional argument specifying a handle 1985 where bad reports should be sent. This would allow you to save 1986 all of the bad reports to a file, for instance. If no handle 1987 is specified, the bad reports will not be saved. 1988 """ 1989 self._bad_report_handle = bad_report_handle 1990 1991 #self._b_parser = BlastParser() 1992 self._scanner = _Scanner() 1993 self._consumer = _BlastErrorConsumer()
1994
1995 - def parse(self, handle):
1996 """Parse a handle, attempting to diagnose errors. 1997 """ 1998 results = handle.read() 1999 2000 try: 2001 self._scanner.feed(File.StringHandle(results), self._consumer) 2002 except SyntaxError, msg: 2003 # if we have a bad_report_file, save the info to it first 2004 if self._bad_report_handle: 2005 # send the info to the error handle 2006 self._bad_report_handle.write(results) 2007 2008 # now we want to try and diagnose the error 2009 self._diagnose_error( 2010 File.StringHandle(results), self._consumer.data) 2011 2012 # if we got here we can't figure out the problem 2013 # so we should pass along the syntax error we got 2014 raise 2015 return self._consumer.data
2016
2017 - def _diagnose_error(self, handle, data_record):
2018 """Attempt to diagnose an error in the passed handle. 2019 2020 Arguments: 2021 o handle - The handle potentially containing the error 2022 o data_record - The data record partially created by the consumer. 2023 """ 2024 line = handle.readline() 2025 2026 while line: 2027 # 'Searchingdone' instead of 'Searching......done' seems 2028 # to indicate a failure to perform the BLAST due to 2029 # low quality sequence 2030 if line.startswith('Searchingdone'): 2031 raise LowQualityBlastError("Blast failure occured on query: ", 2032 data_record.query) 2033 line = handle.readline()
2034