1 """
2 Parser for (new) ACE files output by PHRAP.
3
4 version 1.3, 05/06/2004
5 Written by Frank Kauff (fkauff@duke.edu) and
6 Cymon J. Cox (cymon@duke.edu)
7
8 Uses the Biopython Parser interface: ParserSupport.py
9
10 Usage:
11
12 There are two ways of reading an ace file: The ACEParser() reads
13 the whole file at once and the RecordParser() reads contig after
14 contig.
15
16 1) Parse whole ace file at once:
17 aceparser=Ace.ACEParser()
18 acefilerecord=aceparser.parse(open('my_ace_file.ace','r'))
19
20 gives you
21 acefilerecord.ncontigs (the number of contigs in the ace file)
22 acefilerecord.nreads (the number of reads in the ace file)
23 acefilerecord.contigs[] (one instance of the Contig class for each contig)
24 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
25 for this contig in a list of instances of the Read class, e.g.:
26 contig3=acefilerecord.contigs[2]
27 read4=contig3.reads[3]
28 RD_of_read4=read4.rd
29 DS_of_read4=read4.ds
30
31 CT, WA, RT tags from the end of the file can appear anywhere are automatically
32 sorted into the right place.
33
34 see _RecordConsumer for details.
35
36 2) *** DEPRECATED: not entirely suitable for ACE files!
37 Or you can iterate over the contigs of an ace file one by one in the ususal way:
38 recordparser=Ace.RecordParser()
39 iterator=Ace.Iterator(open('my_ace_file.ace','r'),recordparser)
40 while 1:
41 contig=iterator.next()
42 if contig is None:
43 break
44 ...
45
46 if WA, CT, RT, WR tags are at the end and the iterator is used, they will be returned
47 with the last contig record. This is is necessarily the case when using an interator.
48 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
49 are needed, the ACEParser instead of the RecordParser might be appropriate.
50
51 """
52 import os
53 from types import *
54
55 from Bio import File
56 from Bio import Index
57
58
59 from Bio.ParserSupport import *
60 from Bio.Alphabet import IUPAC
61
62
65 self.name=''
66 self.padded_bases=None
67 self.info_items=None
68 self.read_tags=None
69 self.sequence=''
70
73 self.qual_clipping_start=None
74 self.qual_clipping_end=None
75 self.align_clipping_start=None
76 self.align_clipping_end=None
77
80 self.chromat_file=''
81 self.phd_file=''
82 self.time=''
83 self.chem=''
84 self.dye=''
85 self.template=''
86 self.direction=''
87
90 self.name=''
91 self.coru=None
92 self.padded_start=None
93
96 self.name=''
97 self.padded_start=None
98 self.padded_end=None
99
102 self.name=''
103 self.tag_type=''
104 self.program=''
105 self.padded_start=None
106 self.padded_end=None
107 self.date=''
108
111 self.name=''
112 self.tag_type=''
113 self.program=''
114 self.padded_start=None
115 self.padded_end=None
116 self.date=''
117 self.notrans=''
118 self.info=[]
119
122 self.tag_type=''
123 self.program=''
124 self.date=''
125 self.info=[]
126
129 self.name=''
130 self.aligned=''
131 self.program=''
132 self.date=[]
133
136 self.rd=None
137 self.qa=None
138 self.ds=None
139 self.rt=None
140 self.wr=None
141
143 """Hold information from a ACE record """
145 self.name = ''
146 self.nbases=None
147 self.nreads=None
148 self.nsegments=None
149 self.uorc=None
150 self.sequence=None
151 self.quality=None
152 self.af=[]
153 self.bs=[]
154 self.reads=[]
155 self.ct=None
156 self.wa=None
157
159 """Iterates over a ACE-file with multiple contigs
160
161 Methods:
162 next Return the next record from the stream, or None.
163 """
164
165 - def __init__(self, handle, parser=None):
166 """__init__(self, handle, parser=None)
167
168 Create a new iterator. handle is a file-like object. parser
169 is an optional Parser object to change the results into another form.
170 If set to None, then the raw contents of the file will be returned.
171 """
172
173 if type(handle) is not FileType and type(handle) is not InstanceType:
174 raise ValueError, "I expected a file handle or file-like object"
175 self._uhandle = File.UndoHandle(handle)
176 self._parser = parser
177
179 """next(self) -> object
180
181 Return the next contig record from the file. If no more records
182 return None.
183 """
184
185 lines = []
186 while 1:
187
188 line=self._uhandle.readline()
189 if not line:
190 return None
191 if line[:2]=='CO':
192 lines.append(line)
193 break
194 while 1:
195 line = self._uhandle.readline()
196 if not line:
197 break
198
199 if lines and line[:2] == 'CO':
200 self._uhandle.saveline(line)
201 break
202 lines.append(line)
203
204 if not lines:
205 return None
206
207 data = ''.join(lines)
208 if self._parser is not None:
209 return self._parser.parse(File.StringHandle(data))
210 return data
211
213 """Parses ACE file data into a Record object
214
215 """
219
220 - def parse(self, handle):
227
229 """Holds data of ACE file.
230 """
232 self.ncontigs=None
233 self.nreads=None
234 self.contigs=[]
235 self.wa=None
236
238 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
239
240 ct=[]
241 rt=[]
242 wr=[]
243
244 for i in range(len(self.contigs)):
245 c = self.contigs[i]
246 if c.wa:
247 if not self.wa:
248 self.wa=[]
249 self.wa.extend(c.wa)
250 if c.ct:
251 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
252 map(self.contigs[i].ct.remove,newcts)
253 ct.extend(newcts)
254 for j in range(len(c.reads)):
255 r = c.reads[j]
256 if r.rt:
257 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
258 map(self.contigs[i].reads[j].rt.remove,newrts)
259 rt.extend(newrts)
260 if r.wr:
261 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
262 map(self.contigs[i].reads[j].wr.remove,newwrs)
263 wr.extend(newwrs)
264
265 for i in range(len(self.contigs)):
266 c = self.contigs[i]
267 for ct_tag in ct:
268 if ct_tag.name==c.name:
269 if self.contigs[i].ct is None:
270 self.contigs[i].ct=[]
271 self.contigs[i].ct.append(ct_tag)
272 if rt or wr:
273 for j in range(len(c.reads)):
274 r = c.reads[j]
275 for rt_tag in rt:
276 if rt_tag.name==r.rd.name:
277 if self.contigs[i].reads[j].rt is None:
278 self.contigs[i].reads[j].rt=[]
279 self.contigs[i].reads[j].rt.append(rt_tag)
280 for wr_tag in wr:
281 if wr_tag.name==r.rd.name:
282 if self.contigs[i].reads[j].wr is None:
283 self.contigs[i].reads[j].wr=[]
284 self.contigs[i].reads[j].wr.append(wr_tag)
285
287 """Parses full ACE file in list of contigs.
288
289 """
290
293
295 firstline=handle.readline()
296
297 if firstline[:2]!='AS':
298 raise SyntaxError, "File does not start with 'AS'."
299 self.data.ncontigs=eval(firstline.split()[1])
300 self.data.nreads=eval(firstline.split()[2])
301
302 recparser=RecordParser()
303 iter=Iterator(handle,recparser)
304 while 1:
305 rec=iter.next()
306 if not rec:
307 break
308 self.data.contigs.append(rec)
309
310
311
312
313
314
315 self.data.sort()
316 return self.data
317
319 """Scans a ACE-formatted file
320
321 Methods:
322 feed - Feed one ACE record.
323 """
324 - def feed(self, handle, consumer):
325 """feed(self, handle, consumer)
326
327 Feed in ACE data for scanning. handle is a file-like object
328 containing ACE data. consumer is a Consumer object that will
329 receive events as the ACE data is scanned.
330 """
331 assert isinstance(handle, File.UndoHandle), \
332 "handle must be an UndoHandle"
333 if handle.peekline():
334 self._scan_record(handle, consumer)
335
337 consumer.begin_contig()
338 read_and_call(uhandle,consumer.co_header,start='CO ')
339 consumer.co_data(self._scan_sequence_data(uhandle))
340 read_and_call_while(uhandle,consumer.noevent,blank=1)
341 read_and_call(uhandle,consumer.bq_header,start='BQ')
342 consumer.bq_data(self._scan_bq_data(uhandle, consumer))
343 read_and_call_while(uhandle,consumer.noevent,blank=1)
344 read_and_call_while(uhandle,consumer.af,start='AF ')
345 read_and_call_while(uhandle,consumer.noevent,blank=1)
346 read_and_call_while(uhandle,consumer.bs,start='BS ')
347
348
349
350
351
352
353 while 1:
354
355 read_and_call_until(uhandle,consumer.noevent,start='RD ')
356 read_and_call(uhandle,consumer.rd_header,start='RD ')
357 consumer.rd_data(self._scan_sequence_data(uhandle))
358 read_and_call_while(uhandle,consumer.noevent,blank=1)
359 read_and_call(uhandle,consumer.qa,start='QA ')
360
361 try:
362 read_and_call_while(uhandle,consumer.noevent,blank=1)
363 attempt_read_and_call(uhandle,consumer.ds,start='DS ')
364 except SyntaxError:
365
366 consumer.end_contig()
367 return
368
369
370 while 1:
371
372 try:
373 read_and_call_while(uhandle,consumer.noevent,blank=1)
374 except SyntaxError:
375
376 consumer.end_contig()
377 return
378 else:
379 if attempt_read_and_call(uhandle,consumer.rt_start,start='RT'):
380 consumer.rt_data(self._scan_bracket_tags(uhandle))
381 elif attempt_read_and_call(uhandle,consumer.wr_start,start='WR'):
382 consumer.wr_data(self._scan_bracket_tags(uhandle))
383 elif attempt_read_and_call(uhandle,consumer.wa_start,start='WA'):
384 consumer.wa_data(self._scan_bracket_tags(uhandle))
385 elif attempt_read_and_call(uhandle,consumer.ct_start,start='CT'):
386 consumer.ct_data(self._scan_bracket_tags(uhandle))
387 else:
388 line=safe_peekline(uhandle)
389 break
390 if not line.startswith('RD'):
391 consumer.end_contig()
392 break
393
395 """Scans multiple lines of quality data and concatenates them."""
396
397 qual=''
398 while 1:
399 line=uhandle.readline()
400 if is_blank_line(line):
401 uhandle.saveline(line)
402 break
403 qual+=' '+line
404 return qual
405
407 """Scans multiple lines of sequence data and concatenates them."""
408
409 seq=''
410 while 1:
411 line=uhandle.readline()
412 if is_blank_line(line):
413 uhandle.saveline(line)
414 break
415 seq+=line.strip()
416 return seq
417
431
433 """Reads the ace tags into data records."""
434
437
440
443
451
454
457
460
468
476
487
490
492 header=map(eval,line.split()[1:])
493 qadata=qa()
494 qadata.qual_clipping_start=header[0]
495 qadata.qual_clipping_end=header[1]
496 qadata.align_clipping_start=header[2]
497 qadata.align_clipping_end=header[3]
498 self.data.reads[-1].qa=qadata
499
501 dsdata=ds()
502 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
503 poss=map(line.find,tags)
504 tagpos=dict(zip(poss,tags))
505 if tagpos.has_key(-1):
506 del tagpos[-1]
507 ps=tagpos.keys()
508 ps.sort()
509 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
510 setattr(dsdata,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
511 self.data.reads[-1].ds=dsdata
512
514 if not line.strip().endswith('{'):
515 print line
516 raise SyntaxError, 'CT tag does not start with CT{'
517 ctdata=ct()
518 if self.data.ct is None:
519 self.data.ct=[]
520 self.data.ct.append(ctdata)
521
535
537 if not line.strip().endswith('{'):
538 raise SyntaxError, 'RT tag does not start with RT{'
539 rtdata=rt()
540
541
542 if self.data.reads[-1].rt is None:
543 self.data.reads[-1].rt=[]
544 self.data.reads[-1].rt.append(rtdata)
545
556
558 if not line.strip().endswith('{'):
559 raise SyntaxError, 'WA tag does not start with WA{'
560 wadata=wa()
561 if self.data.wa is None:
562 self.data.wa=[]
563 self.data.wa.append(wadata)
564
573
575 if not line.strip().endswith('{'):
576 raise SyntaxError, 'WR tag does not start with WR{'
577 wrdata=wr()
578 if self.data.reads[-1].wr is None:
579 self.data.reads[-1].wr=[]
580 self.data.reads[-1].wr.append(wrdata)
581
590