1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the standalone version of
11 BLAST, either blastall or blastpgp, provided by the NCBI.
12 http://www.ncbi.nlm.nih.gov/BLAST/
13
14 Classes:
15 LowQualityBlastError Except that indicates low quality query sequences.
16 BlastParser Parses output from blast.
17 BlastErrorParser Parses output and tries to diagnose possible errors.
18 PSIBlastParser Parses output from psi-blast.
19 Iterator Iterates over a file of blast results.
20
21 _Scanner Scans output from standalone BLAST.
22 _BlastConsumer Consumes output from blast.
23 _PSIBlastConsumer Consumes output from psi-blast.
24 _HeaderConsumer Consumes header information.
25 _DescriptionConsumer Consumes description information.
26 _AlignmentConsumer Consumes alignment information.
27 _HSPConsumer Consumes hsp information.
28 _DatabaseReportConsumer Consumes database report information.
29 _ParametersConsumer Consumes parameters information.
30
31 Functions:
32 blastall Execute blastall.
33 blastpgp Execute blastpgp.
34 rpsblast Execute rpsblast.
35
36 """
37
38 from __future__ import generators
39 import os
40 import re
41
42 from Bio import File
43 from Bio.ParserSupport import *
44 from Bio.Blast import Record
45
46
48 """Error caused by running a low quality sequence through BLAST.
49
50 When low quality sequences (like GenBank entries containing only
51 stretches of a single nucleotide) are BLASTed, they will result in
52 BLAST generating an error and not being able to perform the BLAST.
53 search. This error should be raised for the BLAST reports produced
54 in this case.
55 """
56 pass
57
59 """Error caused by running a short query sequence through BLAST.
60
61 If the query sequence is too short, BLAST outputs warnings and errors:
62 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
63 [blastall] ERROR: [000.000] AT1G08320: Blast:
64 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
65 done
66
67 This exception is raised when that condition is detected.
68
69 """
70 pass
71
72
74 """Scan BLAST output from blastall or blastpgp.
75
76 Tested with blastall and blastpgp v2.0.10, v2.0.11
77
78 Methods:
79 feed Feed data into the scanner.
80
81 """
82 - def feed(self, handle, consumer):
102
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147 consumer.start_header()
148
149 read_and_call(uhandle, consumer.version, contains='BLAST')
150 read_and_call_while(uhandle, consumer.noevent, blank=1)
151
152
153 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
154
155
156 while attempt_read_and_call(uhandle,
157 consumer.reference, start='Reference') :
158
159
160 while 1:
161 line = uhandle.readline()
162 if is_blank_line(line) :
163 consumer.noevent(line)
164 break
165 elif line.startswith("RID"):
166 break
167 else :
168
169 consumer.reference(line)
170
171
172 read_and_call_while(uhandle, consumer.noevent, blank=1)
173 attempt_read_and_call(uhandle, consumer.reference, start="RID:")
174 read_and_call_while(uhandle, consumer.noevent, blank=1)
175
176
177 if attempt_read_and_call(
178 uhandle, consumer.reference, start="Reference"):
179 read_and_call_until(uhandle, consumer.reference, blank=1)
180 read_and_call_while(uhandle, consumer.noevent, blank=1)
181
182 line = uhandle.peekline()
183 assert line.strip() <> ""
184 assert not line.startswith("RID:")
185 if line.startswith("Query=") :
186
187
188
189 read_and_call(uhandle, consumer.query_info, start='Query=')
190 read_and_call_until(uhandle, consumer.query_info, blank=1)
191 read_and_call_while(uhandle, consumer.noevent, blank=1)
192
193
194 read_and_call_until(uhandle, consumer.database_info, end='total letters')
195 read_and_call(uhandle, consumer.database_info, contains='sequences')
196 read_and_call_while(uhandle, consumer.noevent, blank=1)
197 elif line.startswith("Database:") :
198
199 read_and_call_until(uhandle, consumer.database_info, end='total letters')
200 read_and_call(uhandle, consumer.database_info, contains='sequences')
201 read_and_call_while(uhandle, consumer.noevent, blank=1)
202
203
204 read_and_call(uhandle, consumer.query_info, start='Query=')
205 read_and_call_until(uhandle, consumer.query_info, blank=1)
206 read_and_call_while(uhandle, consumer.noevent, blank=1)
207 else :
208 raise SyntaxError("Invalid header?")
209
210 consumer.end_header()
211
213
214
215
216
217
218
219
220 while 1:
221 line = safe_peekline(uhandle)
222 if (not line.startswith('Searching') and
223 not line.startswith('Results from round') and
224 re.search(r"Score +E", line) is None and
225 line.find('No hits found') == -1):
226 break
227
228 self._scan_descriptions(uhandle, consumer)
229 self._scan_alignments(uhandle, consumer)
230
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254 consumer.start_descriptions()
255
256
257
258 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
259
260
261
262 if not uhandle.peekline():
263 raise SyntaxError, "Unexpected end of blast report. " + \
264 "Looks suspiciously like a PSI-BLAST crash."
265
266
267
268
269
270
271
272
273
274 line = uhandle.peekline()
275 if line.find("ERROR:") != -1 or line.startswith("done"):
276 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
277 read_and_call(uhandle, consumer.noevent, start="done")
278
279
280
281
282
283
284
285
286
287
288
289
290
291 read_and_call_while(uhandle, consumer.noevent, blank=1)
292
293 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
294 read_and_call_while(uhandle, consumer.noevent, blank=1)
295
296
297
298
299
300
301
302
303 if not attempt_read_and_call(
304 uhandle, consumer.description_header,
305 has_re=re.compile(r'Score +E')):
306
307 attempt_read_and_call(uhandle, consumer.no_hits,
308 contains='No hits found')
309 read_and_call_while(uhandle, consumer.noevent, blank=1)
310
311 consumer.end_descriptions()
312
313 return
314
315
316 read_and_call(uhandle, consumer.description_header,
317 start='Sequences producing')
318
319
320 attempt_read_and_call(uhandle, consumer.model_sequences,
321 start='Sequences used in model')
322 read_and_call_while(uhandle, consumer.noevent, blank=1)
323
324
325
326 if not uhandle.peekline().startswith('Sequences not found'):
327 read_and_call_until(uhandle, consumer.description, blank=1)
328 read_and_call_while(uhandle, consumer.noevent, blank=1)
329
330
331
332
333
334 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
335 start='Sequences not found'):
336
337 read_and_call_while(uhandle, consumer.noevent, blank=1)
338 l = safe_peekline(uhandle)
339
340
341
342
343 if not l.startswith('CONVERGED') and l[0] != '>' \
344 and not l.startswith('QUERY'):
345 read_and_call_until(uhandle, consumer.description, blank=1)
346 read_and_call_while(uhandle, consumer.noevent, blank=1)
347
348 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
349 read_and_call_while(uhandle, consumer.noevent, blank=1)
350
351 consumer.end_descriptions()
352
367
374
387
416
422
424
425
426
427
428
429
430 read_and_call(uhandle, consumer.score, start=' Score')
431 read_and_call(uhandle, consumer.identities, start=' Identities')
432
433 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
434
435 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
436 read_and_call(uhandle, consumer.noevent, blank=1)
437
439
440
441
442
443
444
445
446
447
448 while 1:
449
450 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
451 read_and_call(uhandle, consumer.query, start='Query')
452 read_and_call(uhandle, consumer.align, start=' ')
453 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
454 read_and_call_while(uhandle, consumer.noevent, blank=1)
455 line = safe_peekline(uhandle)
456
457 if not (line.startswith('Query') or line.startswith(' ')):
458 break
459
482
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511 consumer.start_database_report()
512
513
514
515
516 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
517 read_and_call(uhandle, consumer.noevent, contains="letters")
518 read_and_call(uhandle, consumer.noevent, contains="sequences")
519 read_and_call(uhandle, consumer.noevent, start=" ")
520
521
522
523 while attempt_read_and_call(uhandle, consumer.database,
524 start=' Database'):
525
526
527
528 if not uhandle.peekline():
529 consumer.end_database_report()
530 return
531
532
533 read_and_call_until(uhandle, consumer.database, start=' Posted')
534 read_and_call(uhandle, consumer.posted_date, start=' Posted')
535 read_and_call(uhandle, consumer.num_letters_in_database,
536 start=' Number of letters')
537 read_and_call(uhandle, consumer.num_sequences_in_database,
538 start=' Number of sequences')
539
540 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
541
542 line = safe_readline(uhandle)
543 uhandle.saveline(line)
544 if line.find('Lambda') != -1:
545 break
546
547 read_and_call(uhandle, consumer.noevent, start='Lambda')
548 read_and_call(uhandle, consumer.ka_params)
549
550
551 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
552
553
554 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
555
556 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
557 read_and_call(uhandle, consumer.ka_params_gap)
558
559
560
561
562 try:
563 read_and_call_while(uhandle, consumer.noevent, blank=1)
564 except SyntaxError, x:
565 if str(x) != "Unexpected end of stream.":
566 raise
567 consumer.end_database_report()
568
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626 if not uhandle.peekline():
627 return
628
629
630
631 consumer.start_parameters()
632
633
634 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
635
636 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
637
638 attempt_read_and_call(uhandle, consumer.num_sequences,
639 start='Number of Sequences')
640 read_and_call(uhandle, consumer.num_hits,
641 start='Number of Hits')
642 attempt_read_and_call(uhandle, consumer.num_sequences,
643 start='Number of Sequences')
644 read_and_call(uhandle, consumer.num_extends,
645 start='Number of extensions')
646 read_and_call(uhandle, consumer.num_good_extends,
647 start='Number of successful')
648
649 read_and_call(uhandle, consumer.num_seqs_better_e,
650 start='Number of sequences')
651
652
653 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
654 start="Number of HSP's better"):
655
656 if attempt_read_and_call(uhandle, consumer.noevent,
657 start="Number of HSP's gapped:"):
658 read_and_call(uhandle, consumer.noevent,
659 start="Number of HSP's successfully")
660
661 attempt_read_and_call(uhandle, consumer.noevent,
662 start="Number of extra gapped extensions")
663 else:
664 read_and_call(uhandle, consumer.hsps_prelim_gapped,
665 start="Number of HSP's successfully")
666 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
667 start="Number of HSP's that")
668 read_and_call(uhandle, consumer.hsps_gapped,
669 start="Number of HSP's gapped")
670
671 elif attempt_read_and_call(uhandle, consumer.noevent,
672 start="Number of HSP's gapped"):
673 read_and_call(uhandle, consumer.noevent,
674 start="Number of HSP's successfully")
675
676
677 attempt_read_and_call(uhandle, consumer.query_length,
678 has_re=re.compile(r"[Ll]ength of query"))
679 read_and_call(uhandle, consumer.database_length,
680 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
681
682
683 attempt_read_and_call(uhandle, consumer.noevent,
684 start="Length adjustment")
685 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
686 start='effective HSP')
687
688 attempt_read_and_call(
689 uhandle, consumer.effective_query_length,
690 has_re=re.compile(r'[Ee]ffective length of query'))
691
692
693 attempt_read_and_call(
694 uhandle, consumer.effective_database_length,
695 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
696
697
698 attempt_read_and_call(
699 uhandle, consumer.effective_search_space,
700 has_re=re.compile(r'[Ee]ffective search space:'))
701
702 attempt_read_and_call(
703 uhandle, consumer.effective_search_space_used,
704 has_re=re.compile(r'[Ee]ffective search space used'))
705
706
707 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
708
709
710 attempt_read_and_call(uhandle, consumer.threshold, start='T')
711
712 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
713
714
715 attempt_read_and_call(uhandle, consumer.window_size, start='A')
716
717 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
718
719 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
720 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
721
722
723 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
724 start='X3')
725
726 read_and_call(uhandle, consumer.gap_trigger, start='S1')
727
728
729
730 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
731 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
732
733 consumer.end_parameters()
734
736 """Parses BLAST data into a Record.Blast object.
737
738 """
743
744 - def parse(self, handle):
745 """parse(self, handle)"""
746 self._scanner.feed(handle, self._consumer)
747 return self._consumer.data
748
750 """Parses BLAST data into a Record.PSIBlast object.
751
752 """
757
758 - def parse(self, handle):
759 """parse(self, handle)"""
760 self._scanner.feed(handle, self._consumer)
761 return self._consumer.data
762
766
768 c = line.split()
769 self._header.application = c[0]
770 self._header.version = c[1]
771 self._header.date = c[2][1:-1]
772
774 if line.startswith('Reference: '):
775 self._header.reference = line[11:]
776 else:
777 self._header.reference = self._header.reference + line
778
780 if line.startswith('Query= '):
781 self._header.query = line[7:]
782 elif not line.startswith(' '):
783 self._header.query = "%s%s" % (self._header.query, line)
784 else:
785 letters, = _re_search(
786 r"([0-9,]+) letters", line,
787 "I could not find the number of letters in line\n%s" % line)
788 self._header.query_letters = _safe_int(letters)
789
791 line = line.rstrip()
792 if line.startswith('Database: '):
793 self._header.database = line[10:]
794 elif not line.endswith('total letters'):
795 self._header.database = self._header.database + line.strip()
796 else:
797 sequences, letters =_re_search(
798 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line,
799 "I could not find the sequences and letters in line\n%s" %line)
800 self._header.database_sequences = _safe_int(sequences)
801 self._header.database_letters = _safe_int(letters)
802
807
810 self._descriptions = []
811 self._model_sequences = []
812 self._nonmodel_sequences = []
813 self._converged = 0
814 self._type = None
815 self._roundnum = None
816
817 self.__has_n = 0
818
820 if line.startswith('Sequences producing'):
821 cols = line.split()
822 if cols[-1] == 'N':
823 self.__has_n = 1
824
826 dh = self._parse(line)
827 if self._type == 'model':
828 self._model_sequences.append(dh)
829 elif self._type == 'nonmodel':
830 self._nonmodel_sequences.append(dh)
831 else:
832 self._descriptions.append(dh)
833
836
838 self._type = 'nonmodel'
839
842
845
847 if not line.startswith('Results from round'):
848 raise SyntaxError, "I didn't understand the round line\n%s" % line
849 self._roundnum = _safe_int(line[18:].strip())
850
853
854 - def _parse(self, description_line):
855 line = description_line
856 dh = Record.Description()
857
858
859
860
861
862
863
864
865 cols = line.split()
866 if len(cols) < 3:
867 raise SyntaxError, \
868 "Line does not appear to contain description:\n%s" % line
869 if self.__has_n:
870 i = line.rfind(cols[-1])
871 i = line.rfind(cols[-2], 0, i)
872 i = line.rfind(cols[-3], 0, i)
873 else:
874 i = line.rfind(cols[-1])
875 i = line.rfind(cols[-2], 0, i)
876 if self.__has_n:
877 dh.title, dh.score, dh.e, dh.num_alignments = \
878 line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
879 else:
880 dh.title, dh.score, dh.e, dh.num_alignments = \
881 line[:i].rstrip(), cols[-2], cols[-1], 1
882 dh.num_alignments = _safe_int(dh.num_alignments)
883 dh.score = _safe_int(dh.score)
884 dh.e = _safe_float(dh.e)
885 return dh
886
888
889
890
891
895
897 self._alignment.title = "%s%s" % (self._alignment.title,
898 line.lstrip())
899
901
902 parts = line.replace(" ","").split("=")
903 assert len(parts)==2, "Unrecognised format length line"
904 self._alignment.length = parts[1]
905 self._alignment.length = _safe_int(self._alignment.length)
906
908
909 if line.startswith('QUERY') or line.startswith('blast_tmp'):
910
911
912
913
914
915 try:
916 name, start, seq, end = line.split()
917 except ValueError:
918 raise SyntaxError, "I do not understand the line\n%s" \
919 % line
920 self._start_index = line.index(start, len(name))
921 self._seq_index = line.index(seq,
922 self._start_index+len(start))
923
924 self._name_length = self._start_index - 1
925 self._start_length = self._seq_index - self._start_index - 1
926 self._seq_length = line.rfind(end) - self._seq_index - 1
927
928
929
930
931
932
933
934
935
936 name = line[:self._name_length]
937 name = name.rstrip()
938 start = line[self._start_index:self._start_index+self._start_length]
939 start = start.rstrip()
940 if start:
941 start = _safe_int(start)
942 end = line[self._seq_index+self._seq_length:].rstrip()
943 if end:
944 end = _safe_int(end)
945 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
946
947 if len(seq) < self._seq_length:
948 seq = seq + ' '*(self._seq_length-len(seq))
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967 align = self._multiple_alignment.alignment
968 align.append((name, start, seq, end))
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1018
1019 if self._alignment:
1020 self._alignment.title = self._alignment.title.rstrip()
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042 try:
1043 del self._seq_index
1044 del self._seq_length
1045 del self._start_index
1046 del self._start_length
1047 del self._name_length
1048 except AttributeError:
1049 pass
1050
1054
1056 self._hsp.bits, self._hsp.score = _re_search(
1057 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
1058 "I could not find the score in line\n%s" % line)
1059 self._hsp.score = _safe_float(self._hsp.score)
1060 self._hsp.bits = _safe_float(self._hsp.bits)
1061
1062 x, y = _re_search(
1063 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
1064 "I could not find the expect in line\n%s" % line)
1065 if x:
1066 self._hsp.num_alignments = _safe_int(x)
1067 else:
1068 self._hsp.num_alignments = 1
1069 self._hsp.expect = _safe_float(y)
1070
1072 x, y = _re_search(
1073 r"Identities = (\d+)\/(\d+)", line,
1074 "I could not find the identities in line\n%s" % line)
1075 self._hsp.identities = _safe_int(x), _safe_int(y)
1076
1077 if line.find('Positives') != -1:
1078 x, y = _re_search(
1079 r"Positives = (\d+)\/(\d+)", line,
1080 "I could not find the positives in line\n%s" % line)
1081 self._hsp.positives = _safe_int(x), _safe_int(y)
1082
1083 if line.find('Gaps') != -1:
1084 x, y = _re_search(
1085 r"Gaps = (\d+)\/(\d+)", line,
1086 "I could not find the gaps in line\n%s" % line)
1087 self._hsp.gaps = _safe_int(x), _safe_int(y)
1088
1089
1091 self._hsp.strand = _re_search(
1092 r"Strand = (\w+) / (\w+)", line,
1093 "I could not find the strand in line\n%s" % line)
1094
1096
1097
1098
1099 if line.find('/') != -1:
1100 self._hsp.frame = _re_search(
1101 r"Frame = ([-+][123]) / ([-+][123])", line,
1102 "I could not find the frame in line\n%s" % line)
1103 else:
1104 self._hsp.frame = _re_search(
1105 r"Frame = ([-+][123])", line,
1106 "I could not find the frame in line\n%s" % line)
1107
1108
1109
1110
1111
1112
1113 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1115 m = self._query_re.search(line)
1116 if m is None:
1117 raise SyntaxError, "I could not find the query in line\n%s" % line
1118
1119
1120
1121 colon, start, seq, end = m.groups()
1122 self._hsp.query = self._hsp.query + seq
1123 if self._hsp.query_start is None:
1124 self._hsp.query_start = _safe_int(start)
1125
1126
1127
1128 self._hsp.query_end = _safe_int(end)
1129
1130
1131 self._query_start_index = m.start(3)
1132 self._query_len = len(seq)
1133
1135 seq = line[self._query_start_index:].rstrip()
1136 if len(seq) < self._query_len:
1137
1138 seq = seq + ' ' * (self._query_len-len(seq))
1139 elif len(seq) < self._query_len:
1140 raise SyntaxError, "Match is longer than the query in line\n%s" % \
1141 line
1142 self._hsp.match = self._hsp.match + seq
1143
1144
1145
1146 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1148 m = self._sbjct_re.search(line)
1149 if m is None:
1150 raise SyntaxError, "I could not find the sbjct in line\n%s" % line
1151 colon, start, seq, end = m.groups()
1152
1153
1154
1155
1156 if not seq.strip():
1157 seq = ' ' * self._query_len
1158 self._hsp.sbjct = self._hsp.sbjct + seq
1159 if self._hsp.sbjct_start is None:
1160 self._hsp.sbjct_start = _safe_int(start)
1161
1162 self._hsp.sbjct_end = _safe_int(end)
1163 if len(seq) != self._query_len:
1164 raise SyntaxError, \
1165 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1166 % line
1167
1168 del self._query_start_index
1169 del self._query_len
1170
1173
1175
1178
1187
1188 - def posted_date(self, line):
1189 self._dr.posted_date.append(_re_search(
1190 r"Posted date:\s*(.+)$", line,
1191 "I could not find the posted date in line\n%s" % line))
1192
1197
1202
1206
1209
1213
1216
1220
1222 self._params.matrix = line[8:].rstrip()
1223
1228
1230 if line.find('1st pass') != -1:
1231 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1232 self._params.num_hits = _safe_int(x)
1233 else:
1234 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1235 self._params.num_hits = _safe_int(x)
1236
1238 if line.find('1st pass') != -1:
1239 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1240 self._params.num_sequences = _safe_int(x)
1241 else:
1242 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1243 self._params.num_sequences = _safe_int(x)
1244
1246 if line.find('1st pass') != -1:
1247 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
1248 self._params.num_extends = _safe_int(x)
1249 else:
1250 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
1251 self._params.num_extends = _safe_int(x)
1252
1254 if line.find('1st pass') != -1:
1255 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
1256 self._params.num_good_extends = _safe_int(x)
1257 else:
1258 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
1259 self._params.num_good_extends = _safe_int(x)
1260
1266
1271
1277
1283
1288
1293
1298
1304
1310
1316
1322
1328
1332
1334 if line[:2] == "T:" :
1335
1336 self._params.threshold, = _get_cols(
1337 line, (1,), ncols=2, expected={0:"T:"})
1338 elif line[:28] == "Neighboring words threshold:" :
1339 self._params.threshold, = _get_cols(
1340 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"})
1341 else :
1342 raise SyntaxError("Unrecognised threshold line:\n%s" % line)
1343 self._params.threshold = _safe_int(self._params.threshold)
1344
1346 if line[:2] == "A:" :
1347 self._params.window_size, = _get_cols(
1348 line, (1,), ncols=2, expected={0:"A:"})
1349 elif line[:25] == "Window for multiple hits:" :
1350 self._params.window_size, = _get_cols(
1351 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"})
1352 else :
1353 raise SyntaxError("Unrecognised window size line:\n%s" % line)
1354 self._params.window_size = _safe_int(self._params.window_size)
1355
1361
1367
1373
1379
1385
1388
1389
1390 -class _BlastConsumer(AbstractConsumer,
1391 _HeaderConsumer,
1392 _DescriptionConsumer,
1393 _AlignmentConsumer,
1394 _HSPConsumer,
1395 _DatabaseReportConsumer,
1396 _ParametersConsumer
1397 ):
1398
1399
1400
1401
1402
1403
1404
1405
1406
1409
1411
1412 raise ValueError, \
1413 "This consumer doesn't handle PSI-BLAST data"
1414
1418
1422
1424 self.data.descriptions = self._descriptions
1425
1432
1434 _HSPConsumer.end_hsp(self)
1435 try:
1436 self._alignment.hsps.append(self._hsp)
1437 except AttributeError:
1438 raise SyntaxError, "Found an HSP before an alignment"
1439
1443
1447
1448 -class _PSIBlastConsumer(AbstractConsumer,
1449 _HeaderConsumer,
1450 _DescriptionConsumer,
1451 _AlignmentConsumer,
1452 _HSPConsumer,
1453 _DatabaseReportConsumer,
1454 _ParametersConsumer
1455 ):
1458
1462
1466
1471
1473 _DescriptionConsumer.end_descriptions(self)
1474 self._round.number = self._roundnum
1475 if self._descriptions:
1476 self._round.new_seqs.extend(self._descriptions)
1477 self._round.reused_seqs.extend(self._model_sequences)
1478 self._round.new_seqs.extend(self._nonmodel_sequences)
1479 if self._converged:
1480 self.data.converged = 1
1481
1483 _AlignmentConsumer.end_alignment(self)
1484 if self._alignment.hsps:
1485 self._round.alignments.append(self._alignment)
1486 if self._multiple_alignment:
1487 self._round.multiple_alignment = self._multiple_alignment
1488
1490 _HSPConsumer.end_hsp(self)
1491 try:
1492 self._alignment.hsps.append(self._hsp)
1493 except AttributeError:
1494 raise SyntaxError, "Found an HSP before an alignment"
1495
1499
1503
1505 """Iterates over a file of multiple BLAST results.
1506
1507 Methods:
1508 next Return the next record from the stream, or None.
1509
1510 """
1511 - def __init__(self, handle, parser=None):
1512 """__init__(self, handle, parser=None)
1513
1514 Create a new iterator. handle is a file-like object. parser
1515 is an optional Parser object to change the results into another form.
1516 If set to None, then the raw contents of the file will be returned.
1517
1518 """
1519 try:
1520 handle.readline
1521 except AttributeError:
1522 raise ValueError(
1523 "I expected a file handle or file-like object, got %s"
1524 % type(handle))
1525 self._uhandle = File.UndoHandle(handle)
1526 self._parser = parser
1527
1529 """next(self) -> object
1530
1531 Return the next Blast record from the file. If no more records,
1532 return None.
1533
1534 """
1535 lines = []
1536 while 1:
1537 line = self._uhandle.readline()
1538 if not line:
1539 break
1540
1541 if lines and (line.startswith('BLAST')
1542 or line.startswith('BLAST', 1)
1543 or line.startswith('<?xml ')):
1544 self._uhandle.saveline(line)
1545 break
1546 lines.append(line)
1547
1548 if not lines:
1549 return None
1550
1551 data = ''.join(lines)
1552 if self._parser is not None:
1553 return self._parser.parse(File.StringHandle(data))
1554 return data
1555
1557 return iter(self.next, None)
1558
1559 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1560 """blastall(blastcmd, program, database, infile, align_view='7', **keywds)
1561 -> read, error Undohandles
1562
1563 Execute and retrieve data from blastall. blastcmd is the command
1564 used to launch the 'blastall' executable. program is the blast program
1565 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database
1566 to search against. infile is the path to the file containing
1567 the sequence to search with.
1568
1569 You may pass more parameters to **keywds to change the behavior of
1570 the search. Otherwise, optional values will be chosen by blastall.
1571 The Blast output is by default in XML format. Use the align_view keyword
1572 for output in a different format.
1573
1574 Scoring
1575 matrix Matrix to use.
1576 gap_open Gap open penalty.
1577 gap_extend Gap extension penalty.
1578 nuc_match Nucleotide match reward. (BLASTN)
1579 nuc_mismatch Nucleotide mismatch penalty. (BLASTN)
1580 query_genetic_code Genetic code for Query.
1581 db_genetic_code Genetic code for database. (TBLAST[NX])
1582
1583 Algorithm
1584 gapped Whether to do a gapped alignment. T/F (not for TBLASTX)
1585 expectation Expectation value cutoff.
1586 wordsize Word size.
1587 strands Query strands to search against database.([T]BLAST[NX])
1588 keep_hits Number of best hits from a region to keep.
1589 xdrop Dropoff value (bits) for gapped alignments.
1590 hit_extend Threshold for extending hits.
1591 region_length Length of region used to judge hits.
1592 db_length Effective database length.
1593 search_length Effective length of search space.
1594
1595 Processing
1596 filter Filter query sequence? T/F
1597 believe_query Believe the query defline. T/F
1598 restrict_gi Restrict search to these GI's.
1599 nprocessors Number of processors to use.
1600 oldengine Force use of old engine T/F
1601
1602 Formatting
1603 html Produce HTML output? T/F
1604 descriptions Number of one-line descriptions.
1605 alignments Number of alignments.
1606 align_view Alignment view. Integer 0-11,
1607 passed as a string or integer.
1608 show_gi Show GI's in deflines? T/F
1609 seqalign_file seqalign file to output.
1610
1611 """
1612 att2param = {
1613 'matrix' : '-M',
1614 'gap_open' : '-G',
1615 'gap_extend' : '-E',
1616 'nuc_match' : '-r',
1617 'nuc_mismatch' : '-q',
1618 'query_genetic_code' : '-Q',
1619 'db_genetic_code' : '-D',
1620
1621 'gapped' : '-g',
1622 'expectation' : '-e',
1623 'wordsize' : '-W',
1624 'strands' : '-S',
1625 'keep_hits' : '-K',
1626 'xdrop' : '-X',
1627 'hit_extend' : '-f',
1628 'region_length' : '-L',
1629 'db_length' : '-z',
1630 'search_length' : '-Y',
1631
1632 'program' : '-p',
1633 'database' : '-d',
1634 'infile' : '-i',
1635 'filter' : '-F',
1636 'believe_query' : '-J',
1637 'restrict_gi' : '-l',
1638 'nprocessors' : '-a',
1639 'oldengine' : '-V',
1640
1641 'html' : '-T',
1642 'descriptions' : '-v',
1643 'alignments' : '-b',
1644 'align_view' : '-m',
1645 'show_gi' : '-I',
1646 'seqalign_file' : '-O'
1647 }
1648
1649 if not os.path.exists(blastcmd):
1650 raise ValueError, "blastall does not exist at %s" % blastcmd
1651
1652 params = []
1653
1654 params.extend([att2param['program'], program])
1655 params.extend([att2param['database'], database])
1656 params.extend([att2param['infile'], infile])
1657 params.extend([att2param['align_view'], str(align_view)])
1658
1659 for attr in keywds.keys():
1660 params.extend([att2param[attr], str(keywds[attr])])
1661
1662 w, r, e = os.popen3(" ".join([blastcmd] + params))
1663 w.close()
1664 return File.UndoHandle(r), File.UndoHandle(e)
1665
1666
1667 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1668 """blastpgp(blastcmd, database, infile, align_view='7', **keywds) ->
1669 read, error Undohandles
1670
1671 Execute and retrieve data from blastpgp. blastcmd is the command
1672 used to launch the 'blastpgp' executable. database is the path to the
1673 database to search against. infile is the path to the file containing
1674 the sequence to search with.
1675
1676 You may pass more parameters to **keywds to change the behavior of
1677 the search. Otherwise, optional values will be chosen by blastpgp.
1678 The Blast output is by default in XML format. Use the align_view keyword
1679 for output in a different format.
1680
1681 Scoring
1682 matrix Matrix to use.
1683 gap_open Gap open penalty.
1684 gap_extend Gap extension penalty.
1685 window_size Multiple hits window size.
1686 npasses Number of passes.
1687 passes Hits/passes. Integer 0-2.
1688
1689 Algorithm
1690 gapped Whether to do a gapped alignment. T/F
1691 expectation Expectation value cutoff.
1692 wordsize Word size.
1693 keep_hits Number of beset hits from a region to keep.
1694 xdrop Dropoff value (bits) for gapped alignments.
1695 hit_extend Threshold for extending hits.
1696 region_length Length of region used to judge hits.
1697 db_length Effective database length.
1698 search_length Effective length of search space.
1699 nbits_gapping Number of bits to trigger gapping.
1700 pseudocounts Pseudocounts constants for multiple passes.
1701 xdrop_final X dropoff for final gapped alignment.
1702 xdrop_extension Dropoff for blast extensions.
1703 model_threshold E-value threshold to include in multipass model.
1704 required_start Start of required region in query.
1705 required_end End of required region in query.
1706
1707 Processing
1708 XXX should document default values
1709 program The blast program to use. (PHI-BLAST)
1710 filter Filter query sequence with SEG? T/F
1711 believe_query Believe the query defline? T/F
1712 nprocessors Number of processors to use.
1713
1714 Formatting
1715 html Produce HTML output? T/F
1716 descriptions Number of one-line descriptions.
1717 alignments Number of alignments.
1718 align_view Alignment view. Integer 0-11,
1719 passed as a string or integer.
1720 show_gi Show GI's in deflines? T/F
1721 seqalign_file seqalign file to output.
1722 align_outfile Output file for alignment.
1723 checkpoint_outfile Output file for PSI-BLAST checkpointing.
1724 restart_infile Input file for PSI-BLAST restart.
1725 hit_infile Hit file for PHI-BLAST.
1726 matrix_outfile Output file for PSI-BLAST matrix in ASCII.
1727 align_infile Input alignment file for PSI-BLAST restart.
1728
1729 """
1730 att2param = {
1731 'matrix' : '-M',
1732 'gap_open' : '-G',
1733 'gap_extend' : '-E',
1734 'window_size' : '-A',
1735 'npasses' : '-j',
1736 'passes' : '-P',
1737
1738 'gapped' : '-g',
1739 'expectation' : '-e',
1740 'wordsize' : '-W',
1741 'keep_hits' : '-K',
1742 'xdrop' : '-X',
1743 'hit_extend' : '-f',
1744 'region_length' : '-L',
1745 'db_length' : '-Z',
1746 'search_length' : '-Y',
1747 'nbits_gapping' : '-N',
1748 'pseudocounts' : '-c',
1749 'xdrop_final' : '-Z',
1750 'xdrop_extension' : '-y',
1751 'model_threshold' : '-h',
1752 'required_start' : '-S',
1753 'required_end' : '-H',
1754
1755 'program' : '-p',
1756 'database' : '-d',
1757 'infile' : '-i',
1758 'filter' : '-F',
1759 'believe_query' : '-J',
1760 'nprocessors' : '-a',
1761
1762 'html' : '-T',
1763 'descriptions' : '-v',
1764 'alignments' : '-b',
1765 'align_view' : '-m',
1766 'show_gi' : '-I',
1767 'seqalign_file' : '-O',
1768 'align_outfile' : '-o',
1769 'checkpoint_outfile' : '-C',
1770 'restart_infile' : '-R',
1771 'hit_infile' : '-k',
1772 'matrix_outfile' : '-Q',
1773 'align_infile' : '-B'
1774 }
1775
1776 if not os.path.exists(blastcmd):
1777 raise ValueError, "blastpgp does not exist at %s" % blastcmd
1778
1779 params = []
1780
1781 params.extend([att2param['database'], database])
1782 params.extend([att2param['infile'], infile])
1783 params.extend([att2param['align_view'], str(align_view)])
1784
1785 for attr in keywds.keys():
1786 params.extend([att2param[attr], str(keywds[attr])])
1787
1788 w, r, e = os.popen3(" ".join([blastcmd] + params))
1789 w.close()
1790 return File.UndoHandle(r), File.UndoHandle(e)
1791
1792
1793 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1794 """rpsblast(blastcmd, database, infile, **keywds) ->
1795 read, error Undohandles
1796
1797 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the
1798 command used to launch the 'rpsblast' executable. database is the path
1799 to the database to search against. infile is the path to the file
1800 containing the sequence to search with.
1801
1802 You may pass more parameters to **keywds to change the behavior of
1803 the search. Otherwise, optional values will be chosen by rpsblast.
1804
1805 Please note that this function will give XML output by default, by
1806 setting align_view to seven (i.e. command line option -m 7).
1807 You should use the NCBIXML.BlastParser() to read the resulting output.
1808 This is because NCBIStandalone.BlastParser() does not understand the
1809 plain text output format from rpsblast.
1810
1811 WARNING - The following text and associated parameter handling has not
1812 received extensive testing. Please report any errors we might have made...
1813
1814 Algorithm/Scoring
1815 gapped Whether to do a gapped alignment. T/F
1816 multihit 0 for multiple hit (default), 1 for single hit
1817 expectation Expectation value cutoff.
1818 range_restriction Range restriction on query sequence (Format: start,stop) blastp only
1819 0 in 'start' refers to the beginning of the sequence
1820 0 in 'stop' refers to the end of the sequence
1821 Default = 0,0
1822 xdrop Dropoff value (bits) for gapped alignments.
1823 xdrop_final X dropoff for final gapped alignment (in bits).
1824 xdrop_extension Dropoff for blast extensions (in bits).
1825 search_length Effective length of search space.
1826 nbits_gapping Number of bits to trigger gapping.
1827 protein Query sequence is protein. T/F
1828 db_length Effective database length.
1829
1830 Processing
1831 filter Filter query sequence with SEG? T/F
1832 case_filter Use lower case filtering of FASTA sequence T/F, default F
1833 believe_query Believe the query defline. T/F
1834 nprocessors Number of processors to use.
1835 logfile Name of log file to use, default rpsblast.log
1836
1837 Formatting
1838 html Produce HTML output? T/F
1839 descriptions Number of one-line descriptions.
1840 alignments Number of alignments.
1841 align_view Alignment view. Integer 0-11,
1842 passed as a string or integer.
1843 show_gi Show GI's in deflines? T/F
1844 seqalign_file seqalign file to output.
1845 align_outfile Output file for alignment.
1846
1847 """
1848 att2param = {
1849 'multihit' : '-P',
1850 'gapped' : '-g',
1851 'expectation' : '-e',
1852 'range_restriction' : '-L',
1853 'xdrop' : '-X',
1854 'xdrop_final' : '-Z',
1855 'xdrop_extension' : '-y',
1856 'search_length' : '-Y',
1857 'nbits_gapping' : '-N',
1858 'protein' : '-p',
1859 'db_length' : '-z',
1860
1861 'database' : '-d',
1862 'infile' : '-i',
1863 'filter' : '-F',
1864 'case_filter' : '-U',
1865 'believe_query' : '-J',
1866 'nprocessors' : '-a',
1867 'logfile' : '-l',
1868
1869 'html' : '-T',
1870 'descriptions' : '-v',
1871 'alignments' : '-b',
1872 'align_view' : '-m',
1873 'show_gi' : '-I',
1874 'seqalign_file' : '-O',
1875 'align_outfile' : '-o'
1876 }
1877
1878 if not os.path.exists(blastcmd):
1879 raise ValueError, "rpsblast does not exist at %s" % blastcmd
1880
1881 params = []
1882
1883 params.extend([att2param['database'], database])
1884 params.extend([att2param['infile'], infile])
1885 params.extend([att2param['align_view'], str(align_view)])
1886
1887 for attr in keywds.keys():
1888 params.extend([att2param[attr], str(keywds[attr])])
1889
1890 w, r, e = os.popen3(" ".join([blastcmd] + params))
1891 w.close()
1892 return File.UndoHandle(r), File.UndoHandle(e)
1893
1895 m = re.search(regex, line)
1896 if not m:
1897 raise SyntaxError, error_msg
1898 return m.groups()
1899
1900 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1901 cols = line.split()
1902
1903
1904 if ncols is not None and len(cols) != ncols:
1905 raise SyntaxError, "I expected %d columns (got %d) in line\n%s" % \
1906 (ncols, len(cols), line)
1907
1908
1909 for k in expected.keys():
1910 if cols[k] != expected[k]:
1911 raise SyntaxError, "I expected '%s' in column %d in line\n%s" % (
1912 expected[k], k, line)
1913
1914
1915 results = []
1916 for c in cols_to_get:
1917 results.append(cols[c])
1918 return tuple(results)
1919
1921 try:
1922 return int(str)
1923 except ValueError:
1924
1925
1926 str = str.replace(',', '')
1927 try:
1928
1929 return int(str)
1930 except ValueError:
1931 pass
1932
1933
1934 return long(float(str))
1935
1937
1938
1939
1940
1941
1942 if str and str[0] in ['E', 'e']:
1943 str = '1' + str
1944 try:
1945 return float(str)
1946 except ValueError:
1947
1948 str = str.replace(',', '')
1949
1950 return float(str)
1951
1962
1964 """Attempt to catch and diagnose BLAST errors while parsing.
1965
1966 This utilizes the BlastParser module but adds an additional layer
1967 of complexity on top of it by attempting to diagnose SyntaxError's
1968 that may actually indicate problems during BLAST parsing.
1969
1970 Current BLAST problems this detects are:
1971 o LowQualityBlastError - When BLASTing really low quality sequences
1972 (ie. some GenBank entries which are just short streches of a single
1973 nucleotide), BLAST will report an error with the sequence and be
1974 unable to search with this. This will lead to a badly formatted
1975 BLAST report that the parsers choke on. The parser will convert the
1976 SyntaxError to a LowQualityBlastError and attempt to provide useful
1977 information.
1978
1979 """
1980 - def __init__(self, bad_report_handle = None):
1981 """Initialize a parser that tries to catch BlastErrors.
1982
1983 Arguments:
1984 o bad_report_handle - An optional argument specifying a handle
1985 where bad reports should be sent. This would allow you to save
1986 all of the bad reports to a file, for instance. If no handle
1987 is specified, the bad reports will not be saved.
1988 """
1989 self._bad_report_handle = bad_report_handle
1990
1991
1992 self._scanner = _Scanner()
1993 self._consumer = _BlastErrorConsumer()
1994
1995 - def parse(self, handle):
1996 """Parse a handle, attempting to diagnose errors.
1997 """
1998 results = handle.read()
1999
2000 try:
2001 self._scanner.feed(File.StringHandle(results), self._consumer)
2002 except SyntaxError, msg:
2003
2004 if self._bad_report_handle:
2005
2006 self._bad_report_handle.write(results)
2007
2008
2009 self._diagnose_error(
2010 File.StringHandle(results), self._consumer.data)
2011
2012
2013
2014 raise
2015 return self._consumer.data
2016
2018 """Attempt to diagnose an error in the passed handle.
2019
2020 Arguments:
2021 o handle - The handle potentially containing the error
2022 o data_record - The data record partially created by the consumer.
2023 """
2024 line = handle.readline()
2025
2026 while line:
2027
2028
2029
2030 if line.startswith('Searchingdone'):
2031 raise LowQualityBlastError("Blast failure occured on query: ",
2032 data_record.query)
2033 line = handle.readline()
2034