Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Parser for (new) ACE files output by PHRAP. 
  7   
  8  version 1.3, 05/06/2004 
  9  Written by Frank Kauff (fkauff@duke.edu) and 
 10  Cymon J. Cox (cymon@duke.edu) 
 11   
 12  Uses the Biopython Parser interface: ParserSupport.py 
 13   
 14  Usage: 
 15   
 16  There are two ways of reading an ace file: The ACEParser() reads 
 17  the whole file at once and the RecordParser() reads contig after 
 18  contig. 
 19       
 20  1) Parse whole ace file at once: 
 21   
 22          from Bio.Sequencing import Ace 
 23          aceparser=Ace.ACEParser() 
 24          acefilerecord=aceparser.parse(open('my_ace_file.ace','r')) 
 25   
 26  This gives you: 
 27          acefilerecord.ncontigs (the number of contigs in the ace file) 
 28          acefilerecord.nreads (the number of reads in the ace file) 
 29          acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 30   
 31  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 32  for this contig in a list of instances of the Read class, e.g.: 
 33   
 34          contig3=acefilerecord.contigs[2] 
 35          read4=contig3.reads[3] 
 36          RD_of_read4=read4.rd 
 37          DS_of_read4=read4.ds 
 38   
 39  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 40  sorted into the right place. 
 41   
 42  see _RecordConsumer for details. 
 43   
 44  2) Or you can iterate over the contigs of an ace file one by one in the ususal way: 
 45   
 46          from Bio.Sequencing import Ace 
 47          recordparser=Ace.RecordParser() 
 48          iterator=Ace.Iterator(open('my_ace_file.ace','r'),recordparser) 
 49          for contig in iterator : 
 50              print contig.name 
 51              ... 
 52   
 53  Please note that for memory efficiency, when using the iterator approach, only one 
 54  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 55  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 56  Because the parser doesn't see this data until the final record, it cannot be added to 
 57  the appropriate records.  Instead these tags will be returned with the last contig record. 
 58  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 59  are needed, the ACEParser instead of the RecordParser might be appropriate. 
 60  """ 
 61  import os 
 62  from Bio import File 
 63  from Bio.ParserSupport import * 
 64  from Bio.Alphabet import IUPAC 
 65   
 66   
67 -class rd:
68 """RD (reads), store a read with its name, sequence etc."""
69 - def __init__(self):
70 self.name='' 71 self.padded_bases=None 72 self.info_items=None 73 self.read_tags=None 74 self.sequence=''
75
76 -class qa:
77 """QA (read quality), including which part if any was used as the consensus."""
78 - def __init__(self):
79 self.qual_clipping_start=None 80 self.qual_clipping_end=None 81 self.align_clipping_start=None 82 self.align_clipping_end=None
83
84 -class ds:
85 """DS lines, include file name of a read's chromatogram file."""
86 - def __init__(self):
87 self.chromat_file='' 88 self.phd_file='' 89 self.time='' 90 self.chem='' 91 self.dye='' 92 self.template='' 93 self.direction=''
94
95 -class af:
96 """AF lines, define the location of the read within the contig."""
97 - def __init__(self):
98 self.name='' 99 self.coru=None 100 self.padded_start=None
101
102 -class bs:
103 """"BS (base segment), which read was chosen as the consensus at each position."""
104 - def __init__(self):
105 self.name='' 106 self.padded_start=None 107 self.padded_end=None
108
109 -class rt:
110 """RT (transient read tags), generated by crossmatch and phrap."""
111 - def __init__(self):
112 self.name='' 113 self.tag_type='' 114 self.program='' 115 self.padded_start=None 116 self.padded_end=None 117 self.date=''
118
119 -class ct:
120 """CT (consensus tags)."""
121 - def __init__(self):
122 self.name='' 123 self.tag_type='' 124 self.program='' 125 self.padded_start=None 126 self.padded_end=None 127 self.date='' 128 self.notrans='' 129 self.info=[]
130
131 -class wa:
132 """WA (whole assembly tag), holds the assembly program name, version, etc."""
133 - def __init__(self):
134 self.tag_type='' 135 self.program='' 136 self.date='' 137 self.info=[]
138
139 -class wr:
140 """WR lines."""
141 - def __init__(self):
142 self.name='' 143 self.aligned='' 144 self.program='' 145 self.date=[]
146
147 -class Reads:
148 """Holds information about a read supporting an ACE contig."""
149 - def __init__(self):
150 self.rd=None # one per read 151 self.qa=None # one per read 152 self.ds=None # none or one per read 153 self.rt=None # none or many per read 154 self.wr=None # none or many per read
155
156 -class Contig:
157 """Holds information about a contig from an ACE record."""
158 - def __init__(self):
159 self.name = '' 160 self.nbases=None 161 self.nreads=None 162 self.nsegments=None 163 self.uorc=None 164 self.sequence=None 165 self.quality=None 166 self.af=[] 167 self.bs=[] 168 self.reads=[] 169 self.ct=None # none or many 170 self.wa=None # none or many
171
172 -class Iterator:
173 """Iterates over an ACE-file with multiple contigs. 174 175 Methods: 176 next Return the next record from the stream, or None. 177 """ 178
179 - def __init__(self, handle, parser=None):
180 """__init__(self, handle, parser=None) 181 182 Create a new iterator. handle is a file-like object. parser 183 is an optional Parser object to change the results into another form. 184 If set to None, then the raw contents of the file will be returned. 185 """ 186 self._uhandle = File.UndoHandle(handle) 187 self._parser = parser
188
189 - def next(self):
190 """next(self) -> object 191 192 Return the next contig record from the file. If no more records 193 return None. 194 """ 195 196 lines = [] 197 while 1: 198 # if at beginning, skip the AS and look for first CO command 199 line=self._uhandle.readline() 200 if not line: # empty or corrupt file 201 return None 202 if line[:2]=='CO': 203 lines.append(line) 204 break 205 while 1: 206 line = self._uhandle.readline() 207 if not line: 208 break 209 # If a new record, then put the line back and stop. 210 if lines and line[:2] == 'CO': 211 self._uhandle.saveline(line) 212 break 213 lines.append(line) 214 215 if not lines: 216 return None 217 218 data = ''.join(lines) 219 if self._parser is not None: 220 return self._parser.parse(File.StringHandle(data)) 221 return data
222
223 - def __iter__(self) :
224 """Iterate over the ACE file record by record.""" 225 return iter(self.next, None)
226
227 -class RecordParser(AbstractParser):
228 """Parses ACE file data into a Record object."""
229 - def __init__(self):
230 self._scanner = _Scanner() 231 self._consumer = _RecordConsumer()
232
233 - def parse(self, handle):
234 if isinstance(handle, File.UndoHandle): 235 uhandle = handle 236 else: 237 uhandle = File.UndoHandle(handle) 238 self._scanner.feed(uhandle, self._consumer) 239 return self._consumer.data
240
241 -class ACEFileRecord:
242 """Holds data of an ACE file. 243 """
244 - def __init__(self):
245 self.ncontigs=None 246 self.nreads=None 247 self.contigs=[] 248 self.wa=None # none or many
249
250 - def sort(self):
251 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """ 252 253 ct=[] 254 rt=[] 255 wr=[] 256 # search for tags that aren't in the right position 257 for i in range(len(self.contigs)): 258 c = self.contigs[i] 259 if c.wa: 260 if not self.wa: 261 self.wa=[] 262 self.wa.extend(c.wa) 263 if c.ct: 264 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name] 265 map(self.contigs[i].ct.remove,newcts) 266 ct.extend(newcts) 267 for j in range(len(c.reads)): 268 r = c.reads[j] 269 if r.rt: 270 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name] 271 map(self.contigs[i].reads[j].rt.remove,newrts) 272 rt.extend(newrts) 273 if r.wr: 274 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name] 275 map(self.contigs[i].reads[j].wr.remove,newwrs) 276 wr.extend(newwrs) 277 # now sort them into their proper place 278 for i in range(len(self.contigs)): 279 c = self.contigs[i] 280 for ct_tag in ct: 281 if ct_tag.name==c.name: 282 if self.contigs[i].ct is None: 283 self.contigs[i].ct=[] 284 self.contigs[i].ct.append(ct_tag) 285 if rt or wr: 286 for j in range(len(c.reads)): 287 r = c.reads[j] 288 for rt_tag in rt: 289 if rt_tag.name==r.rd.name: 290 if self.contigs[i].reads[j].rt is None: 291 self.contigs[i].reads[j].rt=[] 292 self.contigs[i].reads[j].rt.append(rt_tag) 293 for wr_tag in wr: 294 if wr_tag.name==r.rd.name: 295 if self.contigs[i].reads[j].wr is None: 296 self.contigs[i].reads[j].wr=[] 297 self.contigs[i].reads[j].wr.append(wr_tag)
298
299 -class ACEParser(AbstractParser):
300 """Parses full ACE file in list of contigs. 301 302 """ 303
304 - def __init__(self):
305 self.data=ACEFileRecord()
306
307 - def parse(self,handle):
308 firstline=handle.readline() 309 # check if the file starts correctly 310 if firstline[:2]!='AS': 311 raise ValueError, "File does not start with 'AS'." 312 self.data.ncontigs=eval(firstline.split()[1]) 313 self.data.nreads=eval(firstline.split()[2]) 314 # now read all the records 315 recparser=RecordParser() 316 iter=Iterator(handle,recparser) 317 while 1: 318 rec=iter.next() 319 if not rec: 320 break 321 self.data.contigs.append(rec) 322 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 323 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 324 # if all tags are at the end, they are read with the last contig. The concept of an 325 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 326 # them and put them into the appropriate contig/read instance. 327 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 328 self.data.sort() 329 return self.data
330
331 -class _Scanner:
332 """Scans an ACE-formatted file. 333 334 Methods: 335 feed - Feed one ACE record. 336 """
337 - def feed(self, handle, consumer):
338 """feed(self, handle, consumer) 339 340 Feed in ACE data for scanning. handle is a file-like object 341 containing ACE data. consumer is a Consumer object that will 342 receive events as the ACE data is scanned. 343 """ 344 assert isinstance(handle, File.UndoHandle), \ 345 "handle must be an UndoHandle" 346 if handle.peekline(): 347 self._scan_record(handle, consumer)
348
349 - def _scan_record(self, uhandle, consumer):
350 consumer.begin_contig() 351 read_and_call(uhandle,consumer.co_header,start='CO ') 352 consumer.co_data(self._scan_sequence_data(uhandle)) 353 read_and_call_while(uhandle,consumer.noevent,blank=1) 354 read_and_call(uhandle,consumer.bq_header,start='BQ') 355 consumer.bq_data(self._scan_bq_data(uhandle, consumer)) 356 read_and_call_while(uhandle,consumer.noevent,blank=1) 357 read_and_call_while(uhandle,consumer.af,start='AF ') 358 read_and_call_while(uhandle,consumer.noevent,blank=1) 359 read_and_call_while(uhandle,consumer.bs,start='BS ') 360 # now read all the read data 361 # it starts with a 'RD', and then a mandatory QA 362 # then follows an optional DS 363 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actial read or contig, 364 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 365 # with that later. 366 while 1: 367 # each read must have a rd and qa 368 read_and_call_until(uhandle,consumer.noevent,start='RD ') 369 read_and_call(uhandle,consumer.rd_header,start='RD ') 370 consumer.rd_data(self._scan_sequence_data(uhandle)) 371 read_and_call_while(uhandle,consumer.noevent,blank=1) 372 read_and_call(uhandle,consumer.qa,start='QA ') 373 # now one ds can follow 374 try: 375 read_and_call_while(uhandle,consumer.noevent,blank=1) 376 attempt_read_and_call(uhandle,consumer.ds,start='DS ') 377 except ValueError: 378 # file ends 379 consumer.end_contig() 380 return 381 # the file could just end, or there's some more stuff. In ace files, everything can happen. 382 # the following tags are interspersed between reads and can ap[pear multiple times. 383 while 1: 384 # something left 385 try: 386 read_and_call_while(uhandle,consumer.noevent,blank=1) 387 except ValueError: 388 # file ends here 389 consumer.end_contig() 390 return 391 else: 392 if attempt_read_and_call(uhandle,consumer.rt_start,start='RT'): 393 consumer.rt_data(self._scan_bracket_tags(uhandle)) 394 elif attempt_read_and_call(uhandle,consumer.wr_start,start='WR'): 395 consumer.wr_data(self._scan_bracket_tags(uhandle)) 396 elif attempt_read_and_call(uhandle,consumer.wa_start,start='WA'): 397 consumer.wa_data(self._scan_bracket_tags(uhandle)) 398 elif attempt_read_and_call(uhandle,consumer.ct_start,start='CT'): 399 consumer.ct_data(self._scan_bracket_tags(uhandle)) 400 else: 401 line=safe_peekline(uhandle) 402 break 403 if not line.startswith('RD'): # another read? 404 consumer.end_contig() 405 break
406
407 - def _scan_bq_data(self, uhandle, consumer):
408 """Scans multiple lines of quality data and concatenates them.""" 409 410 qual='' 411 while 1: 412 line=uhandle.readline() 413 if is_blank_line(line): 414 uhandle.saveline(line) 415 break 416 qual+=' '+line 417 return qual
418
419 - def _scan_sequence_data(self,uhandle):
420 """Scans multiple lines of sequence data and concatenates them.""" 421 422 seq='' 423 while 1: 424 line=uhandle.readline() 425 if is_blank_line(line): 426 uhandle.saveline(line) 427 break 428 seq+=line.strip() 429 return seq
430
431 - def _scan_bracket_tags(self,uhandle):
432 """Reads the data lines of a {} tag.""" 433 434 fulltag=[] 435 while 1: 436 line=uhandle.readline().strip() 437 fulltag.append(line) 438 if line.endswith('}'): 439 fulltag[-1]=fulltag[-1][:-1] # delete the ending } 440 if fulltag[-1]=='': 441 fulltag=fulltag[:-1] # delete empty line 442 break 443 return fulltag
444
445 -class _RecordConsumer(AbstractConsumer):
446 """Reads the ace tags into data records.""" 447
448 - def __init__(self):
449 self.data = None
450
451 - def begin_contig(self):
452 self.data = Contig()
453
454 - def end_contig(self):
455 pass
456
457 - def co_header(self,line):
458 header=line.split() 459 self.data.name=header[1] 460 self.data.nbases=eval(header[2]) 461 self.data.nreads=eval(header[3]) 462 self.data.nsegments=eval(header[4]) 463 self.data.uorc=header[5]
464
465 - def co_data(self,seq):
466 self.data.sequence=seq
467
468 - def bq_header(self,line):
469 pass
470
471 - def bq_data(self,qual):
472 self.data.quality=map(eval,qual.split())
473
474 - def af(self,line):
475 header=line.split() 476 afdata=af() 477 afdata.name=header[1] 478 afdata.coru=header[2] 479 afdata.padded_start=eval(header[3]) 480 self.data.af.append(afdata)
481
482 - def bs(self,line):
483 header=line.split() 484 bsdata=bs() 485 bsdata.padded_start=eval(header[1]) 486 bsdata.padded_end=eval(header[2]) 487 bsdata.name=header[3] 488 self.data.bs.append(bsdata)
489
490 - def rd_header(self,line):
491 header=line.split() 492 # Reads start with rd, so we create a new read record here 493 self.data.reads.append(Reads()) 494 rddata=rd() 495 rddata.name=header[1] 496 rddata.padded_bases=eval(header[2]) 497 rddata.info_items=eval(header[3]) 498 rddata.read_tags=eval(header[4]) 499 self.data.reads[-1].rd=rddata
500
501 - def rd_data(self,seq):
502 self.data.reads[-1].rd.sequence=seq
503
504 - def qa(self,line):
505 header=map(eval,line.split()[1:]) 506 qadata=qa() 507 qadata.qual_clipping_start=header[0] 508 qadata.qual_clipping_end=header[1] 509 qadata.align_clipping_start=header[2] 510 qadata.align_clipping_end=header[3] 511 self.data.reads[-1].qa=qadata
512
513 - def ds(self,line):
514 dsdata=ds() 515 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION'] 516 poss=map(line.find,tags) 517 tagpos=dict(zip(poss,tags)) 518 if tagpos.has_key(-1): 519 del tagpos[-1] 520 ps=tagpos.keys() 521 ps.sort() 522 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]): 523 setattr(dsdata,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip()) 524 self.data.reads[-1].ds=dsdata
525
526 - def ct_start(self,line):
527 if not line.strip().endswith('{'): 528 print line 529 raise ValueError, 'CT tag does not start with CT{' 530 ctdata=ct() 531 if self.data.ct is None: 532 self.data.ct=[] 533 self.data.ct.append(ctdata)
534
535 - def ct_data(self,taglines):
536 if len(taglines)<1: 537 raise ValueError, 'Missing header line in CT tag' 538 header=taglines[0].split() 539 self.data.ct[-1].name=header[0] 540 self.data.ct[-1].tag_type=header[1] 541 self.data.ct[-1].program=header[2] 542 self.data.ct[-1].padded_start=eval(header[3]) 543 self.data.ct[-1].padded_end=eval(header[4]) 544 self.data.ct[-1].date=header[5] 545 if len(header)==7: 546 self.data.ct[-1].notrans=header[6] 547 self.data.ct[-1].info=taglines[1:]
548
549 - def rt_start(self,line):
550 if not line.strip().endswith('{'): 551 raise ValueError, 'RT tag does not start with RT{' 552 rtdata=rt() 553 # now if we're at the end of the file, this rt could belong to a previous read, not the actual one 554 # we store it here were it appears, the user can sort later. 555 if self.data.reads[-1].rt is None: 556 self.data.reads[-1].rt=[] 557 self.data.reads[-1].rt.append(rtdata)
558
559 - def rt_data(self,taglines):
560 if len(taglines)<1: 561 raise ValueError, 'Missing header line in RT tag' 562 header=taglines[0].split() 563 self.data.reads[-1].rt[-1].name=header[0] 564 self.data.reads[-1].rt[-1].tag_type=header[1] 565 self.data.reads[-1].rt[-1].program=header[2] 566 self.data.reads[-1].rt[-1].padded_start=eval(header[3]) 567 self.data.reads[-1].rt[-1].padded_end=eval(header[4]) 568 self.data.reads[-1].rt[-1].date=header[5]
569
570 - def wa_start(self,line):
571 if not line.strip().endswith('{'): 572 raise ValueError, 'WA tag does not start with WA{' 573 wadata=wa() 574 if self.data.wa is None: 575 self.data.wa=[] 576 self.data.wa.append(wadata)
577
578 - def wa_data(self,taglines):
579 if len(taglines)<1: 580 raise ValueError, 'Missing header line in WA tag' 581 header=taglines[0].split() 582 self.data.wa[-1].tag_type=header[0] 583 self.data.wa[-1].program=header[1] 584 self.data.wa[-1].date=header[2] 585 self.data.wa[-1].info=taglines[1:]
586
587 - def wr_start(self,line):
588 if not line.strip().endswith('{'): 589 raise ValueError, 'WR tag does not start with WR{' 590 wrdata=wr() 591 if self.data.reads[-1].wr is None: 592 self.data.reads[-1].wr=[] 593 self.data.reads[-1].wr.append(wrdata)
594
595 - def wr_data(self,taglines):
596 if len(taglines)<1: 597 raise ValueError, 'Missing header line in WR tag' 598 header=taglines[0].split() 599 self.data.reads[-1].wr[-1].name=header[0] 600 self.data.reads[-1].wr[-1].aligned=header[1] 601 self.data.reads[-1].wr[-1].program=header[2] 602 self.data.reads[-1].wr[-1].date=header[3]
603 604 if __name__ == "__main__" : 605 print "Quick self test" 606 #Test the iterator, 607 handle = open("../../Tests/Ace/contig1.ace") 608 recordparser = RecordParser() 609 iterator = Iterator(handle,recordparser) 610 for contig in iterator : 611 print contig.name, len(contig.sequence) 612 handle.close() 613 print "Done" 614