1 import warnings
2 warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning)
3
4
5
6 from Martel import *
7 from Bio import Std
8
9 spaces_line = Opt(Spaces()) + AnyEol()
10
11 blast_filter = Str("BLASTN", "BLASTP", "BLASTX", "TBLASTX", "TBLASTN")
12
13 ncbi_version = Std.application_version(Group("appversion", Re(r"2.\d+\.\d+")))
14
15
16 blastn_appheader = (Std.application_name(Str("BLASTN"), {"app": "blastn"}) +
17 Str(" ") +
18 ncbi_version +
19 Str(" ") +
20 ToEol())
21
22
23
24 blastp_appheader = (Std.application_name(Str("BLASTP"), {"app": "blastp"}) +
25 Str(" ") +
26 ncbi_version +
27 Str(" ") +
28 ToEol())
29
30
31 blastx_appheader = (Std.application_name(Str("BLASTX"), {"app": "blastx"}) +
32 Str(" ") +
33 ncbi_version +
34 Str(" ") +
35 ToEol())
36
37
38 tblastn_appheader = (Std.application_name(Str("TBLASTN"), {"app": "tblastn"}) +
39 Str(" ") +
40 ncbi_version +
41 Str(" ") +
42 ToEol())
43
44
45 tblastx_appheader = (Std.application_name(Str("TBLASTX"), {"app": "tblastx"}) +
46 Str(" ") +
47 ncbi_version +
48 Str(" ") +
49 ToEol())
50
51
52
53 query_letters = (Spaces() +
54 Str("(") +
55 Std.query_size(Digits()) +
56 Str(" letters)") +
57 AnyEol() +
58 Assert(AnyEol())
59 )
60
61 query_descr = (Str("Query=") +
62 Opt(Spaces()) +
63 Std.query_description(UntilEol()) +
64 AnyEol() +
65 Rep(AssertNot(query_letters) +
66 Std.query_description(UntilEol()) +
67 AnyEol())
68 )
69
70 query_descr = Std.query_description_block(query_descr)
71
72
73 _comma_digits = Re(r"\d(\d\d?)?(,\d\d\d)+")
74 -def Number(name = None, attrs = {}):
80
81
82 query_database = (Str("Database:") + Opt(Spaces()) +
83 Std.database_name(UntilEol()) + AnyEol() +
84 Spaces() +
85 Std.database_num_sequences(Number(),
86 {"bioformat:decode": "int.comma"}) +
87 Str(" sequences;") + Spaces() +
88 Std.database_num_letters(Number(),
89 {"bioformat:decode": "int.comma"}) +
90 Spaces() + Str("total letters") + ToEol())
91
92
93
94
95
96
97
98
99 to_table = Group("to_table",
100 (SkipLinesUntil(Str("Sequences producing significant")) +
101 ToEol() +
102 AnyEol()))
103
104 table_entry = Std.search_table_entry(
105 Group("table_entry",
106 (AssertNot(AnyEol()) +
107 Std.search_table_description(Re(r".{66}")) + Spaces() +
108 Std.search_table_value(Float(), {"name": "score",
109 "bioformat:decode": "float"}) +
110 Spaces() +
111 Std.search_table_value(Float(), {"name": "evalue",
112 "bioformat:decode": "float"}) +
113 AnyEol())
114 )
115 )
116
117 table = Std.search_table(Rep(table_entry))
118
119
120 header = Std.search_header(
121 SkipLinesUntil(Str("Query=")) +
122 query_descr +
123 query_letters +
124 AnyEol() +
125 Group("to_database", NullOp()) +
126 query_database +
127 Opt(to_table + table) +
128 SkipLinesUntil(Str(">"))
129 )
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144 hit_length = (Spaces() + Str("Length = ") +
145 Std.hit_length(Digits()) + AnyEol() +
146 Opt(Spaces()) + AnyEol())
147
148 hit_descr = (Str(">") + Std.hit_description(UntilEol()) + AnyEol() +
149 Rep(AssertNot(hit_length) +
150 Std.hit_description(UntilEol()) + AnyEol())
151 )
152
153
154 num_bits = Std.hsp_value(Float(), {"name": "bits",
155 "bioformat:decode": "float",
156 })
157
158 expect = Std.hsp_value(Float(), {"name": "expect",
159 "bioformat:decode": "float"})
160
161 num_identical = Std.hsp_value(Digits(), {"name": "identical",
162 "bioformat:decode": "int"})
163
164 hsp_length = Std.hsp_value(Digits(), {"name": "length",
165 "bioformat:decode": "int"})
166
167 num_positives = Std.hsp_value(Digits(), {"name": "positives",
168 "bioformat:decode": "int"})
169
170
171 score = (Re(r" *Score = *") + num_bits +
172 Re(r" bits \((?P<XXX1>\d+)\), Expect = *") +
173 expect + AnyEol())
174
175
176 tblastn_score = (Re(r" *Score = *") + num_bits +
177 Re(r" bits \((?P<XXX1>\d+)\), "
178 r"Expect(\((?P<expect_num>\d+)\))? = *") +
179 expect + AnyEol())
180
181 blastn_identical = (Re(r" *Identities = *") +
182 num_identical +
183 Str("/") +
184 hsp_length +
185 ToEol())
186
187
188 blastp_identical = (Re(r" *Identities = *") +
189 num_identical +
190 Str("/") +
191 hsp_length +
192 ToSep(sep = ",") +
193 Re(" Positives = *") +
194 num_positives +
195 ToEol())
196
197
198
199 frame = (Str(" Frame = ") + Std.hsp_frame(UntilEol(), {"which": "query"}) +
200 AnyEol())
201 tblastx_frame = (Str(" Frame = ") +
202 Std.hsp_frame(UntilSep(sep = " "), {"which": "query"}) +
203 Str(" / ") +
204 Std.hsp_frame(UntilEol(), {"which": "subject"}) +
205 AnyEol())
206
207
208 strand = (Re(r" *Strand = ") +
209 (Std.hsp_strand(Str("Plus"), {"strand": "+1", "which": "query"}) |
210 Std.hsp_strand(Str("Minus"), {"strand": "-1", "which": "query"})) +
211 Str(" / ") +
212 (Std.hsp_strand(Str("Plus"), {"strand": "+1", "which": "subject"}) |
213 Std.hsp_strand(Str("Minus"), {"strand": "-1", "which": "subject"}))+
214 AnyEol())
215
216 query_line = (
217 Std.hsp_seqalign_query_leader(
218 Std.hsp_seqalign_query_name(Str("Query")) +
219 Re(": *") +
220 Std.hsp_seqalign_query_start(Digits()) +
221 Re(" *")
222 ) +
223 Std.hsp_seqalign_query_seq(UntilSep(sep = " ")) +
224 Re(" *") +
225 Std.hsp_seqalign_query_end(Digits()) +
226 AnyEol()
227 )
228
229 homology_line = (
230 Std.hsp_seqalign_homology_seq(
231 Str(" ") +
232 UntilEol()
233 ) +
234 AnyEol()
235 )
236
237 subject_line = (
238 Std.hsp_seqalign_subject_name(Str("Sbjct")) +
239 Re(": *") +
240 Std.hsp_seqalign_subject_start(Digits()) +
241 Re(" *") +
242 Std.hsp_seqalign_subject_seq(UntilSep(sep = " ")) +
243 Re(" *") +
244 Std.hsp_seqalign_subject_end(Digits()) +
245 AnyEol()
246 )
247
248
249 blastn_hsp_header = (score + blastn_identical + strand +
250 spaces_line + spaces_line)
251
252 blastp_hsp_header = score + blastp_identical + spaces_line
253
254 blastx_hsp_header = score + blastp_identical + frame + spaces_line
255
256 tblastn_hsp_header = tblastn_score + blastp_identical + frame + spaces_line
257
258 tblastx_hsp_header = (tblastn_score + blastp_identical + tblastx_frame +
259 spaces_line + spaces_line)
260
261
262 alignment = Std.hsp_seqalign(query_line + homology_line + subject_line +
263 Rep1(spaces_line))
264
265 hsp = Std.hsp(Group("hsp_header", NullOp()) +
266 Rep(alignment))
267
268 hit = Std.hit(hit_descr + hit_length +
269 Rep(Group("hsp", hsp))
270 )
271
272
273
274
281 d = {"name": name,
282 "bioformat:decode": "int"}
283 if signed:
284 exp = Integer()
285 elif comma:
286 exp = Number()
287 d["bioformat:decode"] = "int.comma"
288 else:
289 exp = Digits()
290 return exp, d
291
292 -def int_stat(name, signed = 0, comma = 0):
295
299
303
304
305 db_stats = (Str(" Database:") + ToEol() +
306 Str(" Posted date:") + ToEol() +
307 Str(" Number of letters") + ToEol() +
308 Str(" Number of sequences") + ToEol() +
309 spaces_line)
310
311 ungapped_lambda_stats = (
312 Str("Lambda K H") + AnyEol() +
313 ( Spaces() + float_stat("lambda") +
314 Spaces() + float_stat("k") +
315 Spaces() + float_stat("h") +
316 ToEol() )
317 )
318 gapped_lambda_stats = (
319 Opt(Str("Gapped") + AnyEol()) +
320 Str("Lambda K H") + AnyEol() +
321 ( Spaces() + float_stat("gapped_lambda") +
322 Spaces() + float_stat("gapped_k") +
323 Spaces() + float_stat("gapped_h") +
324 ToEol() )
325 )
326 lambda_stats = (ungapped_lambda_stats +
327 AnyEol() +
328 gapped_lambda_stats)
329
330 matrix_stats = (
331 SkipLinesUntil(Str("Matrix:")) +
332
333 _nv("Matrix: ", Std.search_statistic(UntilEol(), {"name": "matrix"}))
334 )
335
336 gap_penalties_stats = (
337
338 _nv("Gap Penalties: Existence: ",
339 int_param("existence_gap_penalty") +
340 Str(", Extension: ") +
341 int_param("extension_gap_penalty"))
342 )
343
344 generic_info1 = (
345
346
347
348 _nv("Number of Hits to DB: ", int_stat("num_hits_to_db", signed = 1)) +
349
350
351 _nv("Number of Sequences: ", int_stat("num_sequences")) +
352
353
354 _nv("Number of extensions: ", int_stat("num_extensions")) +
355
356
357 _nv("Number of successful extensions: ",
358 int_stat("num_successful_extensions")) +
359
360
361 _nv("Number of sequences better than 10.0: ",
362 int_stat("num_sequences_better_than_10"))
363 )
364
365 generic_info2 = (
366
367 _nv("length of query: ", int_stat("query_length")) +
368
369
370 _nv("length of database: ", int_stat("database_length", comma = 1)) +
371
372
373 _nv("effective HSP length: ", int_stat("effective_hsp_length"))+
374
375
376 _nv("effective length of query: ", int_stat("effective_query_length")) +
377
378
379 _nv("effective length of database: ",
380 int_stat("effective_database_length", comma = 1)) +
381
382
383 _nv("effective search space: ", int_stat("effective_search_space")) +
384
385
386 _nv("effective search space used: ",
387 int_stat("effective_search_space_used"))
388 )
389
390 hsp_info = (
391
392 _nv("Number of HSP's better than 10.0 without gapping: ",
393 int_stat("num_hsps_better_than_10_without_gapping")) +
394
395
396 _nv("Number of HSP's successfully gapped in prelim test: ",
397 int_stat("num_hsps_successfully_gapped_in_prelim_test")) +
398
399
400 _nv("Number of HSP's that attempted gapping in prelim test: ",
401 int_stat("num_hsps_attempted_gapping_in_prelim_test")) +
402
403
404 _nv("Number of HSP's gapped (non-prelim): ",
405 int_stat("num_hsps_gapped"))
406 )
407
408
409 frameshift_info = _nv("frameshift window, decay const: ",
410 int_stat("frameshift_window") +
411 Re(r", *") +
412 float_stat("frameshift_decat_const"))
413
420
421
422 t_info = _nv("T: ", int_stat("t"))
423
424
425 a_info = _nv("A: ", int_stat("1"))
426
427
428 x1_info = _bit_info("X1: ", "x1")
429
430
431 x2_info = _bit_info("X2: ", "x2")
432
433
434 x3_info = _bit_info("X3: ", "x3")
435
436
437 s1_info = _bit_info("S1: ", "s1")
438
439
440 s2_info = _bit_info("S2: ", "s2")
441
442
443 blastn_ending = (db_stats +
444 lambda_stats +
445 matrix_stats +
446 gap_penalties_stats +
447 generic_info1 +
448 Opt(hsp_info) +
449 generic_info2 +
450 t_info +
451 a_info +
452 x1_info +
453 x2_info +
454 s1_info +
455 s2_info
456 )
457
458 blastp_ending = (db_stats +
459 lambda_stats +
460 matrix_stats +
461 gap_penalties_stats +
462 generic_info1 +
463 hsp_info +
464 generic_info2 +
465 t_info +
466 a_info +
467 x1_info +
468 x2_info +
469 x3_info +
470 s1_info +
471 s2_info
472 )
473
474 blastx_ending = (db_stats +
475 lambda_stats +
476 matrix_stats +
477 gap_penalties_stats +
478 generic_info1 +
479 hsp_info +
480 generic_info2 +
481 frameshift_info +
482 t_info +
483 a_info +
484 x1_info +
485 x2_info +
486 x3_info +
487 s1_info +
488 s2_info
489 )
490 tblastn_ending = blastx_ending
491 tblastx_ending = (db_stats +
492 ungapped_lambda_stats +
493 matrix_stats +
494 generic_info1 +
495 generic_info2 +
496 frameshift_info +
497 t_info +
498 a_info +
499 x1_info +
500 x2_info +
501 s1_info +
502 s2_info
503 )
504
505 format = Std.record(Group("appheader", NullOp()) +
506 header +
507 Rep(hit) +
508 Group("ending", NullOp()))
509
510 blastn = replace_groups(format,
511 (("appheader", blastn_appheader),
512 ("hsp_header", blastn_hsp_header),
513 ("ending", blastn_ending)))
514
515 blastp = replace_groups(format,
516 (("appheader", blastp_appheader),
517 ("hsp_header", blastp_hsp_header),
518 ("ending", blastp_ending)))
519
520 blastx = replace_groups(format,
521 (("appheader", blastx_appheader),
522 ("hsp_header", blastx_hsp_header),
523 ("ending", blastx_ending)))
524
525 tblastn = replace_groups(format,
526 (("appheader", tblastn_appheader),
527 ("hsp_header", tblastn_hsp_header),
528 ("ending", tblastn_ending)))
529
530 tblastx = replace_groups(format,
531 (("appheader", tblastx_appheader),
532 ("hsp_header", tblastx_hsp_header),
533 ("ending", tblastx_ending)))
534