1
2
3
4
5
6 from string import join
7 from Bio.Alphabet import IUPAC
8 from Bio import File
9 from Bio.ParserSupport import *
10 from Bio import Seq
11 from Bio.MEME import Motif
12 import re
13
15 """A class for holding the results of a MEME run.
16
17 A MEMERecord is an object that holds the results from running
18 MEME. It implements no methods of its own.
19
20 """
22 """__init__ (self)"""
23 self.motifs = []
24 self.version = ""
25 self.datafile = ""
26 self.command = ""
27 self.alphabet = None
28 self.sequence_names = []
29
31 for m in self.motifs:
32 if m.name == name:
33 return m
34
36 """A parser for the text output of the MEME program.
37 Parses the output into an object of the MEMERecord class.
38
39 Methods:
40 parse (handle): parses the contents of the file handle passed to it.
41
42 Example:
43
44 f = open("meme.output.txt")
45 parser = MEMEParser()
46 meme_record = parser.parse(f)
47 for motif in meme_record.motifs:
48 for instance in motif.instances:
49 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
50
51 """
56
57 - def parse (self, handle):
58 """parse (self, handle)"""
59 self._scanner.feed(handle, self._consumer)
60 return self._consumer.data
61
62
63
65 """Scanner for MEME output.
66
67 Methods:
68 feed
69
70 """
71
72 - def feed (self, handle, consumer):
73 """
74 Feeds in MEME output for scanning. handle should
75 implement the readline method. consumer is
76 a Consumer object that can receive the salient events.
77 """
78 if isinstance(handle, File.UndoHandle):
79 uhandle = handle
80 else:
81 uhandle = File.UndoHandle(handle)
82
83 self._scan_header(uhandle, consumer)
84 self._scan_motifs (uhandle, consumer)
85
87 try:
88 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version')
89 except SyntaxError:
90 raise SyntaxError, "Improper input file. File should contain a line starting MEME version."
91 read_and_call(uhandle, consumer._version, start = 'MEME version')
92 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET')
93 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET')
94 read_and_call(uhandle, consumer.noevent, start = '****')
95 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE')
96 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET')
97 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
98 read_and_call(uhandle, consumer.noevent, start = '----')
99 read_and_call_until(uhandle, consumer._sequence_name, start = '***')
100 read_and_call_until(uhandle, consumer.noevent, start = 'command:')
101 read_and_call(uhandle, consumer._commandline, start = 'command:')
102 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
103
105 while 1:
106 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF')
107 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value')
108 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value')
109 read_and_call(uhandle, consumer.noevent, start = '---')
110 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
111 read_and_call(uhandle, consumer.noevent, start = '---')
112 read_and_call_until(uhandle, consumer.add_instance, start = '---')
113 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix')
114 read_and_call(uhandle, consumer.noevent)
115 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---')
116 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix')
117 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix')
118 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---')
119 read_and_call_until(uhandle, consumer.noevent, start = 'Time')
120 read_and_call(uhandle, consumer.noevent, start = 'Time')
121 read_and_call(uhandle, consumer.noevent, blank = 1)
122 read_and_call(uhandle, consumer.noevent, start = '***')
123 read_and_call_while(uhandle, consumer.noevent, blank = 1)
124 read_and_call(uhandle, consumer.noevent, start = '***')
125 line = safe_peekline(uhandle)
126 if line.startswith("SUMMARY OF MOTIFS"):
127 break
128
129
130
132 """
133 Consumer that can receive events from MEME Scanner.
134
135 This is the Consumer object that should be passed to the
136 MEME Scanner.
137 """
138
140 self.current_motif = None
141 self.sequence_names = []
142 self.data = MEMERecord()
143
148
150 line = line.strip()
151 line = line.replace('DATAFILE= ','')
152 self.data.datafile = line
153
162
164 line = line.strip()
165 ls = line.split()
166 self.data.sequence_names.append(ls[0])
167 if len(ls) == 6:
168 self.data.sequence_names.append(ls[3])
169
171 line = line.strip()
172 line = line.replace('command: ','')
173 self.data.command = line
174
185
191
201
203 line = line.strip()
204 sl = line.split()
205 thisposition = tuple([float(i) for i in sl])
206 self.current_motif.add_to_pssm(thisposition)
207
209 line = line.strip()
210 sl = line.split()
211 thisposition = tuple([float(i) for i in sl])
212 self.current_motif.add_to_logodds(thisposition)
213
216
217
218
220 """
221 Consumer that can receive events from _MASTScanner.
222
223 A _MASTConsumer parses lines from a mast text output file.
224 The motif match diagrams are parsed using line buffering.
225 Each of the buffering functions have a dummy variable that is
226 required for testing using the Bio.ParserSupport.TaggingConsumer.
227 If this variable isn't there, the TaggingConsumer barfs. In
228 the _MASTScanner, None is passed in the place of this variable.
229 """
231 self.data = MASTRecord()
232 self._current_seq = ""
233 self._line_buffer = []
234 self._buffer_size = 0
235 self._buffered_seq_start = 0
236
241
253
264
288
313
339
345
347 line = line.strip()
348 if not line.startswith('*****'):
349 self._line_buffer.append(line)
350 else:
351 return -1
352
354 """Parses the line buffer to get e-values for each instance of a motif.
355 This buffer parser is the most likely point of failure for the
356 MASTParser.
357 """
358 insts = self.data.get_motif_matches_for_sequence(self._current_seq)
359 if len(insts) > 0:
360
361 fullSeq = self._line_buffer[self._buffer_size-1]
362 pvals = self._line_buffer[1].split()
363 p = 0
364 lpval = len(pvals)
365 while p < lpval:
366 if pvals[p].count('e') > 1:
367
368
369 pvs = []
370 spe = pvals[p].split('e')
371 spe.reverse()
372 dotind = spe[1].find('.')
373 if dotind == -1:
374 thispval = spe[1][-1] + 'e' + spe[0]
375 else:
376 thispval = spe[1][dotind-1:] + 'e' + spe[0]
377 pvs.append(thispval)
378 for spi in range(2,len(spe)):
379 dotind = spe[spi].find('.')
380 prevdotind = spe[spi-1].find('.')
381 if dotind != -1:
382 if prevdotind == -1:
383 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1]
384 else:
385 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1]
386 else:
387 if prevdotind == -1:
388 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1]
389 else:
390 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1]
391 pvs.append(thispval)
392 pvs.reverse()
393 if p > 0:
394 pvals = pvals[0:p] + pvs + pvals[p+1:]
395 else:
396 pvals = pvs + pvals[p+1:]
397 lpval = len(pvals)
398 p += 1
399 i = 0
400 if len(pvals) != len(insts):
401 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
402 pvals = []
403
404
405 for i in range(0,len(insts)):
406 inst = insts[i]
407 start = inst.start - self._buffered_seq_start + 1
408 thisSeq = fullSeq[start:start+inst.length]
409 thisSeq = Seq.Seq(thisSeq, self.data.alphabet)
410 inst._sequence(thisSeq)
411 if pvals:
412 inst._pvalue(float(pvals[i]))
413
415 self._line_buffer = []
416 self._buffer_size = 0
417
419 if self._buffer_size == 0:
420 if len(self._line_buffer) > 0:
421 self._buffer_size = len(self._line_buffer)
422 ll = self._line_buffer[self._buffer_size-1].split()
423 self._line_buffer[self._buffer_size-1] = ll[1]
424 self._buffered_seq_start = int(ll[0])
425 else:
426 i = 0
427 for i in range(self._buffer_size, len(self._line_buffer)-1):
428 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip()
429 ll = self._line_buffer[len(self._line_buffer)-1].split()
430 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]):
431 self._line_buffer[self._buffer_size-1] += ll[1]
432 else:
433 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]))
434 self._line_buffer[self._buffer_size-1] += "N"*differ
435 self._line_buffer[self._buffer_size-1] += ll[1]
436 self._line_buffer = self._line_buffer[0:self._buffer_size]
437
439 line = line.strip()
440 if line.find('[') != -1 or line.find('<') != -1:
441 pass
442 elif line.find('e') != -1:
443 pass
444 elif line.find('+') != -1:
445 pass
446
449
450
451
453 """
454 Parser for MAST text output. HTML output cannot be parsed, yet. Returns a MASTRecord
455
456 A MASTParser takes a file handle for a MAST text output file and
457 returns a MASTRecord, containing the hits between motifs and
458 sequences. The parser does some unusual line buffering to parse out
459 match diagrams. Really complex diagrams often lead to an error message
460 and p-values not being parsed for a given line.
461
462 Methods:
463 parse (handle): parses the data from the file handle passed to it.
464
465 Example:
466
467 f = open("mast_file.txt")
468 parser = MASTParser()
469 mast_record = parser.parse(f)
470 for motif in mast_record.motifs:
471 for instance in motif.instances:
472 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
473 """
477
478 - def parse (self, handle):
479 self._scanner.feed(handle, self._consumer)
480 return self._consumer.data
481
482
483
485 """
486 Scanner for MAST text output.
487
488 """
489 - def feed (self, handle, consumer):
498
500 try:
501 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version")
502 except SyntaxError:
503 raise SyntaxError, "Improper input file. Does not begin with a line with 'MAST version'"
504 read_and_call(uhandle, consumer._version, contains = 'MAST version')
505 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS')
506 read_and_call(uhandle, consumer.noevent, start = 'DATABASE')
507 read_and_call(uhandle, consumer.noevent, start = '****')
508 read_and_call(uhandle, consumer._database, contains = 'DATABASE')
509 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH')
510 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF')
511 read_and_call(uhandle, consumer.noevent, contains = '----')
512 read_and_call_until(uhandle, consumer._add_motif, blank = 1)
513 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
514
516 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
517 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
518 read_and_call(uhandle, consumer.noevent, start = '---')
519
520 read_and_call_until(uhandle, consumer.noevent, blank = 1)
521 read_and_call(uhandle, consumer.noevent, blank = 1)
522
524 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:')
525 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:')
526 read_and_call_until(uhandle, consumer.noevent, start = '****')
527 read_and_call(uhandle, consumer.noevent, start = '****')
528 read_and_call_until(uhandle, consumer.noevent, start = '*****')
529 read_and_call(uhandle, consumer.noevent)
530 read_and_call_while(uhandle, consumer.noevent, blank = 1)
531 readMatches = 1
532 while readMatches == 1:
533 if consumer._current_seq:
534 if consumer._buffer_size != 0:
535 consumer._parse_buffer(None)
536 consumer._blank_buffer(None)
537 read_and_call(uhandle, consumer._set_current_seq)
538 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM')
539 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
540 consumer._add_diagram_from_buffer(None)
541 consumer._blank_buffer(None)
542 read_and_call(uhandle, consumer.noevent, blank = 1)
543 while 1:
544 line = safe_peekline(uhandle)
545 if line.startswith('****'):
546 consumer._parse_buffer(None)
547 readMatches = 0
548 break
549 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
550 read_and_call(uhandle, consumer.noevent, blank = 1)
551 consumer._collapse_buffer(None)
552 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1):
553 break
554 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'):
555 consumer._parse_buffer(None)
556 consumer._blank_buffer(None)
557 readMatches = 0
558 break
559
560
561
563 """The class for holding the results from a MAST run.
564
565 A MASTRecord holds data about matches between motifs and sequences.
566 The motifs held by the MASTRecord are objects of the class MEMEMotif.
567
568 Methods:
569 get_motif_matches_for_sequence(sequence_name): returns all of the
570 motif matches within a given sequence. The matches are objects of
571 the class MEME.Motif.Instance
572 get_motif_matches (motif_name): returns all of the matches for a motif
573 in the sequences searched. The matches returned are of class
574 MEME.Motif.Instance
575 get_motif_by_name (motif_name): returns a MEMEMotif with the given
576 name.
577 """
586
589
595
598
607
611
613 self.diagrams[seq] = diagram
614
617
620
623
625 for m in self.motifs:
626 if m.name == name:
627 return m
628