1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the standalone version of
11 BLAST, either blastall or blastpgp, provided by the NCBI.
12 http://www.ncbi.nlm.nih.gov/BLAST/
13
14 Classes:
15 LowQualityBlastError Except that indicates low quality query sequences.
16 BlastParser Parses output from blast.
17 BlastErrorParser Parses output and tries to diagnose possible errors.
18 PSIBlastParser Parses output from psi-blast.
19 Iterator Iterates over a file of blast results.
20
21 _Scanner Scans output from standalone BLAST.
22 _BlastConsumer Consumes output from blast.
23 _PSIBlastConsumer Consumes output from psi-blast.
24 _HeaderConsumer Consumes header information.
25 _DescriptionConsumer Consumes description information.
26 _AlignmentConsumer Consumes alignment information.
27 _HSPConsumer Consumes hsp information.
28 _DatabaseReportConsumer Consumes database report information.
29 _ParametersConsumer Consumes parameters information.
30
31 Functions:
32 blastall Execute blastall.
33 blastpgp Execute blastpgp.
34 rpsblast Execute rpsblast.
35
36 """
37
38 from __future__ import generators
39 import os
40 import re
41
42 from Bio import File
43 from Bio.ParserSupport import *
44 from Bio.Blast import Record
45
46
48 """Error caused by running a low quality sequence through BLAST.
49
50 When low quality sequences (like GenBank entries containing only
51 stretches of a single nucleotide) are BLASTed, they will result in
52 BLAST generating an error and not being able to perform the BLAST.
53 search. This error should be raised for the BLAST reports produced
54 in this case.
55 """
56 pass
57
59 """Error caused by running a short query sequence through BLAST.
60
61 If the query sequence is too short, BLAST outputs warnings and errors:
62 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
63 [blastall] ERROR: [000.000] AT1G08320: Blast:
64 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
65 done
66
67 This exception is raised when that condition is detected.
68
69 """
70 pass
71
72
74 """Scan BLAST output from blastall or blastpgp.
75
76 Tested with blastall and blastpgp v2.0.10, v2.0.11
77
78 Methods:
79 feed Feed data into the scanner.
80
81 """
82 - def feed(self, handle, consumer):
102
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147 consumer.start_header()
148
149 read_and_call(uhandle, consumer.version, contains='BLAST')
150 read_and_call_while(uhandle, consumer.noevent, blank=1)
151
152
153 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
154
155
156 while attempt_read_and_call(uhandle,
157 consumer.reference, start='Reference') :
158
159
160 while 1:
161 line = uhandle.readline()
162 if is_blank_line(line) :
163 consumer.noevent(line)
164 break
165 elif line.startswith("RID"):
166 break
167 else :
168
169 consumer.reference(line)
170
171
172 read_and_call_while(uhandle, consumer.noevent, blank=1)
173 attempt_read_and_call(uhandle, consumer.reference, start="RID:")
174 read_and_call_while(uhandle, consumer.noevent, blank=1)
175
176
177
178 if attempt_read_and_call(
179 uhandle, consumer.reference, start="Reference"):
180 read_and_call_until(uhandle, consumer.reference, blank=1)
181 read_and_call_while(uhandle, consumer.noevent, blank=1)
182
183
184 if attempt_read_and_call(
185 uhandle, consumer.reference, start="Reference"):
186 read_and_call_until(uhandle, consumer.reference, blank=1)
187 read_and_call_while(uhandle, consumer.noevent, blank=1)
188
189 line = uhandle.peekline()
190 assert line.strip() <> ""
191 assert not line.startswith("RID:")
192 if line.startswith("Query=") :
193
194
195
196 read_and_call(uhandle, consumer.query_info, start='Query=')
197 read_and_call_until(uhandle, consumer.query_info, blank=1)
198 read_and_call_while(uhandle, consumer.noevent, blank=1)
199
200
201 read_and_call_until(uhandle, consumer.database_info, end='total letters')
202 read_and_call(uhandle, consumer.database_info, contains='sequences')
203 read_and_call_while(uhandle, consumer.noevent, blank=1)
204 elif line.startswith("Database:") :
205
206 read_and_call_until(uhandle, consumer.database_info, end='total letters')
207 read_and_call(uhandle, consumer.database_info, contains='sequences')
208 read_and_call_while(uhandle, consumer.noevent, blank=1)
209
210
211 read_and_call(uhandle, consumer.query_info, start='Query=')
212 read_and_call_until(uhandle, consumer.query_info, blank=1)
213 read_and_call_while(uhandle, consumer.noevent, blank=1)
214 else :
215 raise ValueError("Invalid header?")
216
217 consumer.end_header()
218
220
221
222
223
224
225
226
227 while 1:
228 line = safe_peekline(uhandle)
229 if (not line.startswith('Searching') and
230 not line.startswith('Results from round') and
231 re.search(r"Score +E", line) is None and
232 line.find('No hits found') == -1):
233 break
234
235 self._scan_descriptions(uhandle, consumer)
236 self._scan_alignments(uhandle, consumer)
237
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261 consumer.start_descriptions()
262
263
264
265 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
266
267
268
269 if not uhandle.peekline():
270 raise ValueError, "Unexpected end of blast report. " + \
271 "Looks suspiciously like a PSI-BLAST crash."
272
273
274
275
276
277
278
279
280
281 line = uhandle.peekline()
282 if line.find("ERROR:") != -1 or line.startswith("done"):
283 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
284 read_and_call(uhandle, consumer.noevent, start="done")
285
286
287
288
289
290
291
292
293
294
295
296
297
298 read_and_call_while(uhandle, consumer.noevent, blank=1)
299
300 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
301 read_and_call_while(uhandle, consumer.noevent, blank=1)
302
303
304
305
306
307
308
309
310 if not attempt_read_and_call(
311 uhandle, consumer.description_header,
312 has_re=re.compile(r'Score +E')):
313
314 attempt_read_and_call(uhandle, consumer.no_hits,
315 contains='No hits found')
316 read_and_call_while(uhandle, consumer.noevent, blank=1)
317
318 consumer.end_descriptions()
319
320 return
321
322
323 read_and_call(uhandle, consumer.description_header,
324 start='Sequences producing')
325
326
327 attempt_read_and_call(uhandle, consumer.model_sequences,
328 start='Sequences used in model')
329 read_and_call_while(uhandle, consumer.noevent, blank=1)
330
331
332
333 if not uhandle.peekline().startswith('Sequences not found'):
334 read_and_call_until(uhandle, consumer.description, blank=1)
335 read_and_call_while(uhandle, consumer.noevent, blank=1)
336
337
338
339
340
341 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
342 start='Sequences not found'):
343
344 read_and_call_while(uhandle, consumer.noevent, blank=1)
345 l = safe_peekline(uhandle)
346
347
348
349
350 if not l.startswith('CONVERGED') and l[0] != '>' \
351 and not l.startswith('QUERY'):
352 read_and_call_until(uhandle, consumer.description, blank=1)
353 read_and_call_while(uhandle, consumer.noevent, blank=1)
354
355 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
356 read_and_call_while(uhandle, consumer.noevent, blank=1)
357
358 consumer.end_descriptions()
359
374
381
394
423
429
431
432
433
434
435
436
437 read_and_call(uhandle, consumer.score, start=' Score')
438 read_and_call(uhandle, consumer.identities, start=' Identities')
439
440 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
441
442 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
443 read_and_call(uhandle, consumer.noevent, blank=1)
444
446
447
448
449
450
451
452
453
454
455 while 1:
456
457 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
458 read_and_call(uhandle, consumer.query, start='Query')
459 read_and_call(uhandle, consumer.align, start=' ')
460 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
461 read_and_call_while(uhandle, consumer.noevent, blank=1)
462 line = safe_peekline(uhandle)
463
464 if not (line.startswith('Query') or line.startswith(' ')):
465 break
466
489
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518 consumer.start_database_report()
519
520
521
522
523 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
524 read_and_call(uhandle, consumer.noevent, contains="letters")
525 read_and_call(uhandle, consumer.noevent, contains="sequences")
526 read_and_call(uhandle, consumer.noevent, start=" ")
527
528
529
530 while attempt_read_and_call(uhandle, consumer.database,
531 start=' Database'):
532
533
534
535 if not uhandle.peekline():
536 consumer.end_database_report()
537 return
538
539
540 read_and_call_until(uhandle, consumer.database, start=' Posted')
541 read_and_call(uhandle, consumer.posted_date, start=' Posted')
542 read_and_call(uhandle, consumer.num_letters_in_database,
543 start=' Number of letters')
544 read_and_call(uhandle, consumer.num_sequences_in_database,
545 start=' Number of sequences')
546
547 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
548
549 line = safe_readline(uhandle)
550 uhandle.saveline(line)
551 if line.find('Lambda') != -1:
552 break
553
554 read_and_call(uhandle, consumer.noevent, start='Lambda')
555 read_and_call(uhandle, consumer.ka_params)
556
557
558 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
559
560
561 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
562
563 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
564 read_and_call(uhandle, consumer.ka_params_gap)
565
566
567
568
569 try:
570 read_and_call_while(uhandle, consumer.noevent, blank=1)
571 except ValueError, x:
572 if str(x) != "Unexpected end of stream.":
573 raise
574 consumer.end_database_report()
575
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633 if not uhandle.peekline():
634 return
635
636
637
638 consumer.start_parameters()
639
640
641 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
642
643 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
644
645 attempt_read_and_call(uhandle, consumer.num_sequences,
646 start='Number of Sequences')
647 read_and_call(uhandle, consumer.num_hits,
648 start='Number of Hits')
649 attempt_read_and_call(uhandle, consumer.num_sequences,
650 start='Number of Sequences')
651 read_and_call(uhandle, consumer.num_extends,
652 start='Number of extensions')
653 read_and_call(uhandle, consumer.num_good_extends,
654 start='Number of successful')
655
656 read_and_call(uhandle, consumer.num_seqs_better_e,
657 start='Number of sequences')
658
659
660 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
661 start="Number of HSP's better"):
662
663 if attempt_read_and_call(uhandle, consumer.noevent,
664 start="Number of HSP's gapped:"):
665 read_and_call(uhandle, consumer.noevent,
666 start="Number of HSP's successfully")
667
668 attempt_read_and_call(uhandle, consumer.noevent,
669 start="Number of extra gapped extensions")
670 else:
671 read_and_call(uhandle, consumer.hsps_prelim_gapped,
672 start="Number of HSP's successfully")
673 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
674 start="Number of HSP's that")
675 read_and_call(uhandle, consumer.hsps_gapped,
676 start="Number of HSP's gapped")
677
678 elif attempt_read_and_call(uhandle, consumer.noevent,
679 start="Number of HSP's gapped"):
680 read_and_call(uhandle, consumer.noevent,
681 start="Number of HSP's successfully")
682
683
684 attempt_read_and_call(uhandle, consumer.query_length,
685 has_re=re.compile(r"[Ll]ength of query"))
686 read_and_call(uhandle, consumer.database_length,
687 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
688
689
690 attempt_read_and_call(uhandle, consumer.noevent,
691 start="Length adjustment")
692 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
693 start='effective HSP')
694
695 attempt_read_and_call(
696 uhandle, consumer.effective_query_length,
697 has_re=re.compile(r'[Ee]ffective length of query'))
698
699
700 attempt_read_and_call(
701 uhandle, consumer.effective_database_length,
702 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
703
704
705 attempt_read_and_call(
706 uhandle, consumer.effective_search_space,
707 has_re=re.compile(r'[Ee]ffective search space:'))
708
709 attempt_read_and_call(
710 uhandle, consumer.effective_search_space_used,
711 has_re=re.compile(r'[Ee]ffective search space used'))
712
713
714 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
715
716
717 attempt_read_and_call(uhandle, consumer.threshold, start='T')
718
719 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
720
721
722 attempt_read_and_call(uhandle, consumer.window_size, start='A')
723
724 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
725
726 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
727 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
728
729
730 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
731 start='X3')
732
733 read_and_call(uhandle, consumer.gap_trigger, start='S1')
734
735
736
737 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
738 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
739
740 consumer.end_parameters()
741
743 """Parses BLAST data into a Record.Blast object.
744
745 """
750
751 - def parse(self, handle):
752 """parse(self, handle)"""
753 self._scanner.feed(handle, self._consumer)
754 return self._consumer.data
755
757 """Parses BLAST data into a Record.PSIBlast object.
758
759 """
764
765 - def parse(self, handle):
766 """parse(self, handle)"""
767 self._scanner.feed(handle, self._consumer)
768 return self._consumer.data
769
773
775 c = line.split()
776 self._header.application = c[0]
777 self._header.version = c[1]
778 self._header.date = c[2][1:-1]
779
781 if line.startswith('Reference: '):
782 self._header.reference = line[11:]
783 else:
784 self._header.reference = self._header.reference + line
785
787 if line.startswith('Query= '):
788 self._header.query = line[7:]
789 elif not line.startswith(' '):
790 self._header.query = "%s%s" % (self._header.query, line)
791 else:
792 letters, = _re_search(
793 r"([0-9,]+) letters", line,
794 "I could not find the number of letters in line\n%s" % line)
795 self._header.query_letters = _safe_int(letters)
796
798 line = line.rstrip()
799 if line.startswith('Database: '):
800 self._header.database = line[10:]
801 elif not line.endswith('total letters'):
802 self._header.database = self._header.database + line.strip()
803 else:
804 sequences, letters =_re_search(
805 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line,
806 "I could not find the sequences and letters in line\n%s" %line)
807 self._header.database_sequences = _safe_int(sequences)
808 self._header.database_letters = _safe_int(letters)
809
814
817 self._descriptions = []
818 self._model_sequences = []
819 self._nonmodel_sequences = []
820 self._converged = 0
821 self._type = None
822 self._roundnum = None
823
824 self.__has_n = 0
825
827 if line.startswith('Sequences producing'):
828 cols = line.split()
829 if cols[-1] == 'N':
830 self.__has_n = 1
831
833 dh = self._parse(line)
834 if self._type == 'model':
835 self._model_sequences.append(dh)
836 elif self._type == 'nonmodel':
837 self._nonmodel_sequences.append(dh)
838 else:
839 self._descriptions.append(dh)
840
843
845 self._type = 'nonmodel'
846
849
852
854 if not line.startswith('Results from round'):
855 raise ValueError, "I didn't understand the round line\n%s" % line
856 self._roundnum = _safe_int(line[18:].strip())
857
860
861 - def _parse(self, description_line):
862 line = description_line
863 dh = Record.Description()
864
865
866
867
868
869
870
871
872 cols = line.split()
873 if len(cols) < 3:
874 raise ValueError, \
875 "Line does not appear to contain description:\n%s" % line
876 if self.__has_n:
877 i = line.rfind(cols[-1])
878 i = line.rfind(cols[-2], 0, i)
879 i = line.rfind(cols[-3], 0, i)
880 else:
881 i = line.rfind(cols[-1])
882 i = line.rfind(cols[-2], 0, i)
883 if self.__has_n:
884 dh.title, dh.score, dh.e, dh.num_alignments = \
885 line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
886 else:
887 dh.title, dh.score, dh.e, dh.num_alignments = \
888 line[:i].rstrip(), cols[-2], cols[-1], 1
889 dh.num_alignments = _safe_int(dh.num_alignments)
890 dh.score = _safe_int(dh.score)
891 dh.e = _safe_float(dh.e)
892 return dh
893
895
896
897
898
902
904 self._alignment.title = "%s%s" % (self._alignment.title,
905 line.lstrip())
906
908
909 parts = line.replace(" ","").split("=")
910 assert len(parts)==2, "Unrecognised format length line"
911 self._alignment.length = parts[1]
912 self._alignment.length = _safe_int(self._alignment.length)
913
915
916 if line.startswith('QUERY') or line.startswith('blast_tmp'):
917
918
919
920
921
922 try:
923 name, start, seq, end = line.split()
924 except ValueError:
925 raise ValueError, "I do not understand the line\n%s" \
926 % line
927 self._start_index = line.index(start, len(name))
928 self._seq_index = line.index(seq,
929 self._start_index+len(start))
930
931 self._name_length = self._start_index - 1
932 self._start_length = self._seq_index - self._start_index - 1
933 self._seq_length = line.rfind(end) - self._seq_index - 1
934
935
936
937
938
939
940
941
942
943 name = line[:self._name_length]
944 name = name.rstrip()
945 start = line[self._start_index:self._start_index+self._start_length]
946 start = start.rstrip()
947 if start:
948 start = _safe_int(start)
949 end = line[self._seq_index+self._seq_length:].rstrip()
950 if end:
951 end = _safe_int(end)
952 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
953
954 if len(seq) < self._seq_length:
955 seq = seq + ' '*(self._seq_length-len(seq))
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974 align = self._multiple_alignment.alignment
975 align.append((name, start, seq, end))
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1025
1026 if self._alignment:
1027 self._alignment.title = self._alignment.title.rstrip()
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049 try:
1050 del self._seq_index
1051 del self._seq_length
1052 del self._start_index
1053 del self._start_length
1054 del self._name_length
1055 except AttributeError:
1056 pass
1057
1061
1063 self._hsp.bits, self._hsp.score = _re_search(
1064 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
1065 "I could not find the score in line\n%s" % line)
1066 self._hsp.score = _safe_float(self._hsp.score)
1067 self._hsp.bits = _safe_float(self._hsp.bits)
1068
1069 x, y = _re_search(
1070 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
1071 "I could not find the expect in line\n%s" % line)
1072 if x:
1073 self._hsp.num_alignments = _safe_int(x)
1074 else:
1075 self._hsp.num_alignments = 1
1076 self._hsp.expect = _safe_float(y)
1077
1079 x, y = _re_search(
1080 r"Identities = (\d+)\/(\d+)", line,
1081 "I could not find the identities in line\n%s" % line)
1082 self._hsp.identities = _safe_int(x), _safe_int(y)
1083
1084 if line.find('Positives') != -1:
1085 x, y = _re_search(
1086 r"Positives = (\d+)\/(\d+)", line,
1087 "I could not find the positives in line\n%s" % line)
1088 self._hsp.positives = _safe_int(x), _safe_int(y)
1089
1090 if line.find('Gaps') != -1:
1091 x, y = _re_search(
1092 r"Gaps = (\d+)\/(\d+)", line,
1093 "I could not find the gaps in line\n%s" % line)
1094 self._hsp.gaps = _safe_int(x), _safe_int(y)
1095
1096
1098 self._hsp.strand = _re_search(
1099 r"Strand = (\w+) / (\w+)", line,
1100 "I could not find the strand in line\n%s" % line)
1101
1103
1104
1105
1106 if line.find('/') != -1:
1107 self._hsp.frame = _re_search(
1108 r"Frame = ([-+][123]) / ([-+][123])", line,
1109 "I could not find the frame in line\n%s" % line)
1110 else:
1111 self._hsp.frame = _re_search(
1112 r"Frame = ([-+][123])", line,
1113 "I could not find the frame in line\n%s" % line)
1114
1115
1116
1117
1118
1119
1120 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1122 m = self._query_re.search(line)
1123 if m is None:
1124 raise ValueError, "I could not find the query in line\n%s" % line
1125
1126
1127
1128 colon, start, seq, end = m.groups()
1129 self._hsp.query = self._hsp.query + seq
1130 if self._hsp.query_start is None:
1131 self._hsp.query_start = _safe_int(start)
1132
1133
1134
1135 self._hsp.query_end = _safe_int(end)
1136
1137
1138 self._query_start_index = m.start(3)
1139 self._query_len = len(seq)
1140
1142 seq = line[self._query_start_index:].rstrip()
1143 if len(seq) < self._query_len:
1144
1145 seq = seq + ' ' * (self._query_len-len(seq))
1146 elif len(seq) < self._query_len:
1147 raise ValueError, "Match is longer than the query in line\n%s" % \
1148 line
1149 self._hsp.match = self._hsp.match + seq
1150
1151
1152
1153 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1155 m = self._sbjct_re.search(line)
1156 if m is None:
1157 raise ValueError, "I could not find the sbjct in line\n%s" % line
1158 colon, start, seq, end = m.groups()
1159
1160
1161
1162
1163 if not seq.strip():
1164 seq = ' ' * self._query_len
1165 self._hsp.sbjct = self._hsp.sbjct + seq
1166 if self._hsp.sbjct_start is None:
1167 self._hsp.sbjct_start = _safe_int(start)
1168
1169 self._hsp.sbjct_end = _safe_int(end)
1170 if len(seq) != self._query_len:
1171 raise ValueError, \
1172 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1173 % line
1174
1175 del self._query_start_index
1176 del self._query_len
1177
1180
1182
1185
1194
1195 - def posted_date(self, line):
1196 self._dr.posted_date.append(_re_search(
1197 r"Posted date:\s*(.+)$", line,
1198 "I could not find the posted date in line\n%s" % line))
1199
1204
1209
1213
1216
1220
1223
1227
1229 self._params.matrix = line[8:].rstrip()
1230
1235
1237 if line.find('1st pass') != -1:
1238 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1239 self._params.num_hits = _safe_int(x)
1240 else:
1241 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1242 self._params.num_hits = _safe_int(x)
1243
1245 if line.find('1st pass') != -1:
1246 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1247 self._params.num_sequences = _safe_int(x)
1248 else:
1249 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1250 self._params.num_sequences = _safe_int(x)
1251
1253 if line.find('1st pass') != -1:
1254 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
1255 self._params.num_extends = _safe_int(x)
1256 else:
1257 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
1258 self._params.num_extends = _safe_int(x)
1259
1261 if line.find('1st pass') != -1:
1262 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
1263 self._params.num_good_extends = _safe_int(x)
1264 else:
1265 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
1266 self._params.num_good_extends = _safe_int(x)
1267
1273
1278
1284
1290
1295
1300
1305
1311
1317
1323
1329
1335
1339
1341 if line[:2] == "T:" :
1342
1343 self._params.threshold, = _get_cols(
1344 line, (1,), ncols=2, expected={0:"T:"})
1345 elif line[:28] == "Neighboring words threshold:" :
1346 self._params.threshold, = _get_cols(
1347 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"})
1348 else :
1349 raise ValueError("Unrecognised threshold line:\n%s" % line)
1350 self._params.threshold = _safe_int(self._params.threshold)
1351
1353 if line[:2] == "A:" :
1354 self._params.window_size, = _get_cols(
1355 line, (1,), ncols=2, expected={0:"A:"})
1356 elif line[:25] == "Window for multiple hits:" :
1357 self._params.window_size, = _get_cols(
1358 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"})
1359 else :
1360 raise ValueError("Unrecognised window size line:\n%s" % line)
1361 self._params.window_size = _safe_int(self._params.window_size)
1362
1368
1374
1380
1386
1392
1395
1396
1397 -class _BlastConsumer(AbstractConsumer,
1398 _HeaderConsumer,
1399 _DescriptionConsumer,
1400 _AlignmentConsumer,
1401 _HSPConsumer,
1402 _DatabaseReportConsumer,
1403 _ParametersConsumer
1404 ):
1405
1406
1407
1408
1409
1410
1411
1412
1413
1416
1418
1419 raise ValueError, \
1420 "This consumer doesn't handle PSI-BLAST data"
1421
1425
1429
1431 self.data.descriptions = self._descriptions
1432
1439
1441 _HSPConsumer.end_hsp(self)
1442 try:
1443 self._alignment.hsps.append(self._hsp)
1444 except AttributeError:
1445 raise ValueError, "Found an HSP before an alignment"
1446
1450
1454
1455 -class _PSIBlastConsumer(AbstractConsumer,
1456 _HeaderConsumer,
1457 _DescriptionConsumer,
1458 _AlignmentConsumer,
1459 _HSPConsumer,
1460 _DatabaseReportConsumer,
1461 _ParametersConsumer
1462 ):
1465
1469
1473
1478
1480 _DescriptionConsumer.end_descriptions(self)
1481 self._round.number = self._roundnum
1482 if self._descriptions:
1483 self._round.new_seqs.extend(self._descriptions)
1484 self._round.reused_seqs.extend(self._model_sequences)
1485 self._round.new_seqs.extend(self._nonmodel_sequences)
1486 if self._converged:
1487 self.data.converged = 1
1488
1490 _AlignmentConsumer.end_alignment(self)
1491 if self._alignment.hsps:
1492 self._round.alignments.append(self._alignment)
1493 if self._multiple_alignment:
1494 self._round.multiple_alignment = self._multiple_alignment
1495
1497 _HSPConsumer.end_hsp(self)
1498 try:
1499 self._alignment.hsps.append(self._hsp)
1500 except AttributeError:
1501 raise ValueError, "Found an HSP before an alignment"
1502
1506
1510
1512 """Iterates over a file of multiple BLAST results.
1513
1514 Methods:
1515 next Return the next record from the stream, or None.
1516
1517 """
1518 - def __init__(self, handle, parser=None):
1519 """__init__(self, handle, parser=None)
1520
1521 Create a new iterator. handle is a file-like object. parser
1522 is an optional Parser object to change the results into another form.
1523 If set to None, then the raw contents of the file will be returned.
1524
1525 """
1526 try:
1527 handle.readline
1528 except AttributeError:
1529 raise ValueError(
1530 "I expected a file handle or file-like object, got %s"
1531 % type(handle))
1532 self._uhandle = File.UndoHandle(handle)
1533 self._parser = parser
1534
1536 """next(self) -> object
1537
1538 Return the next Blast record from the file. If no more records,
1539 return None.
1540
1541 """
1542 lines = []
1543 while 1:
1544 line = self._uhandle.readline()
1545 if not line:
1546 break
1547
1548 if lines and (line.startswith('BLAST')
1549 or line.startswith('BLAST', 1)
1550 or line.startswith('<?xml ')):
1551 self._uhandle.saveline(line)
1552 break
1553 lines.append(line)
1554
1555 if not lines:
1556 return None
1557
1558 data = ''.join(lines)
1559 if self._parser is not None:
1560 return self._parser.parse(File.StringHandle(data))
1561 return data
1562
1564 return iter(self.next, None)
1565
1566 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1567 """blastall(blastcmd, program, database, infile, align_view='7', **keywds)
1568 -> read, error Undohandles
1569
1570 Execute and retrieve data from blastall. blastcmd is the command
1571 used to launch the 'blastall' executable. program is the blast program
1572 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database
1573 to search against. infile is the path to the file containing
1574 the sequence to search with.
1575
1576 You may pass more parameters to **keywds to change the behavior of
1577 the search. Otherwise, optional values will be chosen by blastall.
1578 The Blast output is by default in XML format. Use the align_view keyword
1579 for output in a different format.
1580
1581 Scoring
1582 matrix Matrix to use.
1583 gap_open Gap open penalty.
1584 gap_extend Gap extension penalty.
1585 nuc_match Nucleotide match reward. (BLASTN)
1586 nuc_mismatch Nucleotide mismatch penalty. (BLASTN)
1587 query_genetic_code Genetic code for Query.
1588 db_genetic_code Genetic code for database. (TBLAST[NX])
1589
1590 Algorithm
1591 gapped Whether to do a gapped alignment. T/F (not for TBLASTX)
1592 expectation Expectation value cutoff.
1593 wordsize Word size.
1594 strands Query strands to search against database.([T]BLAST[NX])
1595 keep_hits Number of best hits from a region to keep.
1596 xdrop Dropoff value (bits) for gapped alignments.
1597 hit_extend Threshold for extending hits.
1598 region_length Length of region used to judge hits.
1599 db_length Effective database length.
1600 search_length Effective length of search space.
1601
1602 Processing
1603 filter Filter query sequence for low complexity (with SEG)? T/F
1604 believe_query Believe the query defline. T/F
1605 restrict_gi Restrict search to these GI's.
1606 nprocessors Number of processors to use.
1607 oldengine Force use of old engine T/F
1608
1609 Formatting
1610 html Produce HTML output? T/F
1611 descriptions Number of one-line descriptions.
1612 alignments Number of alignments.
1613 align_view Alignment view. Integer 0-11,
1614 passed as a string or integer.
1615 show_gi Show GI's in deflines? T/F
1616 seqalign_file seqalign file to output.
1617
1618 """
1619
1620 _security_check_parameters(keywds)
1621
1622
1623 _security_check_parameters(keywds)
1624
1625 att2param = {
1626 'matrix' : '-M',
1627 'gap_open' : '-G',
1628 'gap_extend' : '-E',
1629 'nuc_match' : '-r',
1630 'nuc_mismatch' : '-q',
1631 'query_genetic_code' : '-Q',
1632 'db_genetic_code' : '-D',
1633
1634 'gapped' : '-g',
1635 'expectation' : '-e',
1636 'wordsize' : '-W',
1637 'strands' : '-S',
1638 'keep_hits' : '-K',
1639 'xdrop' : '-X',
1640 'hit_extend' : '-f',
1641 'region_length' : '-L',
1642 'db_length' : '-z',
1643 'search_length' : '-Y',
1644
1645 'program' : '-p',
1646 'database' : '-d',
1647 'infile' : '-i',
1648 'filter' : '-F',
1649 'believe_query' : '-J',
1650 'restrict_gi' : '-l',
1651 'nprocessors' : '-a',
1652 'oldengine' : '-V',
1653
1654 'html' : '-T',
1655 'descriptions' : '-v',
1656 'alignments' : '-b',
1657 'align_view' : '-m',
1658 'show_gi' : '-I',
1659 'seqalign_file' : '-O'
1660 }
1661
1662 if not os.path.exists(blastcmd):
1663 raise ValueError, "blastall does not exist at %s" % blastcmd
1664
1665 params = []
1666
1667 params.extend([att2param['program'], program])
1668 params.extend([att2param['database'], database])
1669 params.extend([att2param['infile'], infile])
1670 params.extend([att2param['align_view'], str(align_view)])
1671
1672 for attr in keywds.keys():
1673 params.extend([att2param[attr], str(keywds[attr])])
1674
1675 w, r, e = os.popen3(" ".join([blastcmd] + params))
1676 w.close()
1677 return File.UndoHandle(r), File.UndoHandle(e)
1678
1679
1680 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1681 """blastpgp(blastcmd, database, infile, align_view='7', **keywds) ->
1682 read, error Undohandles
1683
1684 Execute and retrieve data from blastpgp. blastcmd is the command
1685 used to launch the 'blastpgp' executable. database is the path to the
1686 database to search against. infile is the path to the file containing
1687 the sequence to search with.
1688
1689 You may pass more parameters to **keywds to change the behavior of
1690 the search. Otherwise, optional values will be chosen by blastpgp.
1691 The Blast output is by default in XML format. Use the align_view keyword
1692 for output in a different format.
1693
1694 Scoring
1695 matrix Matrix to use.
1696 gap_open Gap open penalty.
1697 gap_extend Gap extension penalty.
1698 window_size Multiple hits window size.
1699 npasses Number of passes.
1700 passes Hits/passes. Integer 0-2.
1701
1702 Algorithm
1703 gapped Whether to do a gapped alignment. T/F
1704 expectation Expectation value cutoff.
1705 wordsize Word size.
1706 keep_hits Number of beset hits from a region to keep.
1707 xdrop Dropoff value (bits) for gapped alignments.
1708 hit_extend Threshold for extending hits.
1709 region_length Length of region used to judge hits.
1710 db_length Effective database length.
1711 search_length Effective length of search space.
1712 nbits_gapping Number of bits to trigger gapping.
1713 pseudocounts Pseudocounts constants for multiple passes.
1714 xdrop_final X dropoff for final gapped alignment.
1715 xdrop_extension Dropoff for blast extensions.
1716 model_threshold E-value threshold to include in multipass model.
1717 required_start Start of required region in query.
1718 required_end End of required region in query.
1719
1720 Processing
1721 XXX should document default values
1722 program The blast program to use. (PHI-BLAST)
1723 filter Filter query sequence for low complexity (with SEG)? T/F
1724 believe_query Believe the query defline? T/F
1725 nprocessors Number of processors to use.
1726
1727 Formatting
1728 html Produce HTML output? T/F
1729 descriptions Number of one-line descriptions.
1730 alignments Number of alignments.
1731 align_view Alignment view. Integer 0-11,
1732 passed as a string or integer.
1733 show_gi Show GI's in deflines? T/F
1734 seqalign_file seqalign file to output.
1735 align_outfile Output file for alignment.
1736 checkpoint_outfile Output file for PSI-BLAST checkpointing.
1737 restart_infile Input file for PSI-BLAST restart.
1738 hit_infile Hit file for PHI-BLAST.
1739 matrix_outfile Output file for PSI-BLAST matrix in ASCII.
1740
1741 _security_check_parameters(keywds)
1742
1743 align_infile Input alignment file for PSI-BLAST restart.
1744
1745 """
1746
1747 _security_check_parameters(keywds)
1748
1749 att2param = {
1750 'matrix' : '-M',
1751 'gap_open' : '-G',
1752 'gap_extend' : '-E',
1753 'window_size' : '-A',
1754 'npasses' : '-j',
1755 'passes' : '-P',
1756
1757 'gapped' : '-g',
1758 'expectation' : '-e',
1759 'wordsize' : '-W',
1760 'keep_hits' : '-K',
1761 'xdrop' : '-X',
1762 'hit_extend' : '-f',
1763 'region_length' : '-L',
1764 'db_length' : '-Z',
1765 'search_length' : '-Y',
1766 'nbits_gapping' : '-N',
1767 'pseudocounts' : '-c',
1768 'xdrop_final' : '-Z',
1769 'xdrop_extension' : '-y',
1770 'model_threshold' : '-h',
1771 'required_start' : '-S',
1772 'required_end' : '-H',
1773
1774 'program' : '-p',
1775 'database' : '-d',
1776 'infile' : '-i',
1777 'filter' : '-F',
1778 'believe_query' : '-J',
1779 'nprocessors' : '-a',
1780
1781 'html' : '-T',
1782 'descriptions' : '-v',
1783 'alignments' : '-b',
1784 'align_view' : '-m',
1785 'show_gi' : '-I',
1786 'seqalign_file' : '-O',
1787 'align_outfile' : '-o',
1788 'checkpoint_outfile' : '-C',
1789 'restart_infile' : '-R',
1790 'hit_infile' : '-k',
1791 'matrix_outfile' : '-Q',
1792 'align_infile' : '-B'
1793 }
1794
1795 if not os.path.exists(blastcmd):
1796 raise ValueError, "blastpgp does not exist at %s" % blastcmd
1797
1798 params = []
1799
1800 params.extend([att2param['database'], database])
1801 params.extend([att2param['infile'], infile])
1802 params.extend([att2param['align_view'], str(align_view)])
1803
1804 for attr in keywds.keys():
1805 params.extend([att2param[attr], str(keywds[attr])])
1806
1807 w, r, e = os.popen3(" ".join([blastcmd] + params))
1808 w.close()
1809 return File.UndoHandle(r), File.UndoHandle(e)
1810
1811
1812 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1813 """rpsblast(blastcmd, database, infile, **keywds) ->
1814 read, error Undohandles
1815
1816 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the
1817 command used to launch the 'rpsblast' executable. database is the path
1818 to the database to search against. infile is the path to the file
1819 containing the sequence to search with.
1820
1821 You may pass more parameters to **keywds to change the behavior of
1822 the search. Otherwise, optional values will be chosen by rpsblast.
1823
1824 Please note that this function will give XML output by default, by
1825 setting align_view to seven (i.e. command line option -m 7).
1826 You should use the NCBIXML.parse() function to read the resulting output.
1827 This is because NCBIStandalone.BlastParser() does not understand the
1828 plain text output format from rpsblast.
1829
1830 WARNING - The following text and associated parameter handling has not
1831 received extensive testing. Please report any errors we might have made...
1832
1833 Algorithm/Scoring
1834 gapped Whether to do a gapped alignment. T/F
1835 multihit 0 for multiple hit (default), 1 for single hit
1836 expectation Expectation value cutoff.
1837 range_restriction Range restriction on query sequence (Format: start,stop) blastp only
1838 0 in 'start' refers to the beginning of the sequence
1839 0 in 'stop' refers to the end of the sequence
1840 Default = 0,0
1841 xdrop Dropoff value (bits) for gapped alignments.
1842 xdrop_final X dropoff for final gapped alignment (in bits).
1843 xdrop_extension Dropoff for blast extensions (in bits).
1844 search_length Effective length of search space.
1845 nbits_gapping Number of bits to trigger gapping.
1846 protein Query sequence is protein. T/F
1847 db_length Effective database length.
1848
1849 Processing
1850 filter Filter query sequence for low complexity? T/F
1851 case_filter Use lower case filtering of FASTA sequence T/F, default F
1852 believe_query Believe the query defline. T/F
1853 nprocessors Number of processors to use.
1854 logfile Name of log file to use, default rpsblast.log
1855
1856 Formatting
1857 html Produce HTML output? T/F
1858 descriptions Number of one-line descriptions.
1859 alignments Number of alignments.
1860 align_view Alignment view. Integer 0-11,
1861
1862 _security_check_parameters(keywds)
1863
1864 passed as a string or integer.
1865 show_gi Show GI's in deflines? T/F
1866 seqalign_file seqalign file to output.
1867 align_outfile Output file for alignment.
1868
1869 """
1870
1871 _security_check_parameters(keywds)
1872
1873 att2param = {
1874 'multihit' : '-P',
1875 'gapped' : '-g',
1876 'expectation' : '-e',
1877 'range_restriction' : '-L',
1878 'xdrop' : '-X',
1879 'xdrop_final' : '-Z',
1880 'xdrop_extension' : '-y',
1881 'search_length' : '-Y',
1882 'nbits_gapping' : '-N',
1883 'protein' : '-p',
1884 'db_length' : '-z',
1885
1886 'database' : '-d',
1887 'infile' : '-i',
1888 'filter' : '-F',
1889 'case_filter' : '-U',
1890 'believe_query' : '-J',
1891 'nprocessors' : '-a',
1892 'logfile' : '-l',
1893
1894 'html' : '-T',
1895 'descriptions' : '-v',
1896 'alignments' : '-b',
1897 'align_view' : '-m',
1898 'show_gi' : '-I',
1899 'seqalign_file' : '-O',
1900 'align_outfile' : '-o'
1901 }
1902
1903 if not os.path.exists(blastcmd):
1904 raise ValueError, "rpsblast does not exist at %s" % blastcmd
1905
1906 params = []
1907
1908 params.extend([att2param['database'], database])
1909 params.extend([att2param['infile'], infile])
1910 params.extend([att2param['align_view'], str(align_view)])
1911
1912 for attr in keywds.keys():
1913 params.extend([att2param[attr], str(keywds[attr])])
1914
1915 w, r, e = os.popen3(" ".join([blastcmd] + params))
1916 w.close()
1917 return File.UndoHandle(r), File.UndoHandle(e)
1918
1920 m = re.search(regex, line)
1921 if not m:
1922 raise ValueError, error_msg
1923 return m.groups()
1924
1925 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1926 cols = line.split()
1927
1928
1929 if ncols is not None and len(cols) != ncols:
1930 raise ValueError, "I expected %d columns (got %d) in line\n%s" % \
1931 (ncols, len(cols), line)
1932
1933
1934 for k in expected.keys():
1935 if cols[k] != expected[k]:
1936 raise ValueError, "I expected '%s' in column %d in line\n%s" % (
1937 expected[k], k, line)
1938
1939
1940 results = []
1941 for c in cols_to_get:
1942 results.append(cols[c])
1943 return tuple(results)
1944
1946 try:
1947 return int(str)
1948 except ValueError:
1949
1950
1951 str = str.replace(',', '')
1952 try:
1953
1954 return int(str)
1955 except ValueError:
1956 pass
1957
1958
1959 return long(float(str))
1960
1962
1963
1964
1965
1966
1967 if str and str[0] in ['E', 'e']:
1968 str = '1' + str
1969 try:
1970 return float(str)
1971 except ValueError:
1972
1973 str = str.replace(',', '')
1974
1975 return float(str)
1976
1978 """Look for any attempt to insert a command into a parameter.
1979
1980 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd')
1981
1982 Looks for ";" or "&&" in the strings (Unix and Windows syntax
1983 for appending a command line), and if found raises an exception.
1984 """
1985 for key, value in param_dict.iteritems() :
1986 if ";" in value or "&&" in value :
1987 raise ValueError("Rejecting suspicious argument for %s" % key)
1988
1989
2000
2002 """Attempt to catch and diagnose BLAST errors while parsing.
2003
2004 This utilizes the BlastParser module but adds an additional layer
2005 of complexity on top of it by attempting to diagnose ValueErrors
2006 that may actually indicate problems during BLAST parsing.
2007
2008 Current BLAST problems this detects are:
2009 o LowQualityBlastError - When BLASTing really low quality sequences
2010 (ie. some GenBank entries which are just short streches of a single
2011 nucleotide), BLAST will report an error with the sequence and be
2012 unable to search with this. This will lead to a badly formatted
2013 BLAST report that the parsers choke on. The parser will convert the
2014 ValueError to a LowQualityBlastError and attempt to provide useful
2015 information.
2016
2017 """
2018 - def __init__(self, bad_report_handle = None):
2019 """Initialize a parser that tries to catch BlastErrors.
2020
2021 Arguments:
2022 o bad_report_handle - An optional argument specifying a handle
2023 where bad reports should be sent. This would allow you to save
2024 all of the bad reports to a file, for instance. If no handle
2025 is specified, the bad reports will not be saved.
2026 """
2027 self._bad_report_handle = bad_report_handle
2028
2029
2030 self._scanner = _Scanner()
2031 self._consumer = _BlastErrorConsumer()
2032
2033 - def parse(self, handle):
2034 """Parse a handle, attempting to diagnose errors.
2035 """
2036 results = handle.read()
2037
2038 try:
2039 self._scanner.feed(File.StringHandle(results), self._consumer)
2040 except ValueError, msg:
2041
2042 if self._bad_report_handle:
2043
2044 self._bad_report_handle.write(results)
2045
2046
2047 self._diagnose_error(
2048 File.StringHandle(results), self._consumer.data)
2049
2050
2051
2052 raise
2053 return self._consumer.data
2054
2056 """Attempt to diagnose an error in the passed handle.
2057
2058 Arguments:
2059 o handle - The handle potentially containing the error
2060 o data_record - The data record partially created by the consumer.
2061 """
2062 line = handle.readline()
2063
2064 while line:
2065
2066
2067
2068 if line.startswith('Searchingdone'):
2069 raise LowQualityBlastError("Blast failure occured on query: ",
2070 data_record.query)
2071 line = handle.readline()
2072