Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  """ 
  2  Parser for (new) ACE files output by PHRAP. 
  3   
  4  version 1.3, 05/06/2004 
  5  Written by Frank Kauff (fkauff@duke.edu) and 
  6  Cymon J. Cox (cymon@duke.edu) 
  7   
  8  Uses the Biopython Parser interface: ParserSupport.py 
  9   
 10  Usage: 
 11   
 12  There are two ways of reading an ace file: The ACEParser() reads 
 13  the whole file at once and the RecordParser() reads contig after 
 14  contig. 
 15       
 16  1) Parse whole ace file at once: 
 17          aceparser=Ace.ACEParser() 
 18          acefilerecord=aceparser.parse(open('my_ace_file.ace','r')) 
 19   
 20  gives you  
 21          acefilerecord.ncontigs (the number of contigs in the ace file) 
 22          acefilerecord.nreads (the number of reads in the ace file) 
 23          acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 24          The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 25          for this contig in a list of instances of the Read class, e.g.: 
 26          contig3=acefilerecord.contigs[2] 
 27          read4=contig3.reads[3] 
 28          RD_of_read4=read4.rd 
 29          DS_of_read4=read4.ds 
 30   
 31          CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 32          sorted into the right place. 
 33   
 34          see _RecordConsumer for details. 
 35   
 36  2) *** DEPRECATED: not entirely suitable for ACE files!  
 37          Or you can iterate over the contigs of an ace file one by one in the ususal way:         
 38          recordparser=Ace.RecordParser() 
 39          iterator=Ace.Iterator(open('my_ace_file.ace','r'),recordparser) 
 40          while 1: 
 41              contig=iterator.next() 
 42              if contig is None: 
 43                  break 
 44              ... 
 45   
 46          if WA, CT, RT, WR tags are at the end and the iterator is used, they will be returned 
 47          with the last contig record. This is is necessarily the case when using an interator. 
 48          Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 49          are needed, the ACEParser instead of the RecordParser might be appropriate. 
 50           
 51  """ 
 52  import os 
 53  from types import * 
 54   
 55  from Bio import File 
 56  from Bio import Index 
 57  #from Bio import Seq 
 58  #from Bio import SeqRecord 
 59  from Bio.ParserSupport import * 
 60  from Bio.Alphabet import IUPAC 
 61   
 62   
63 -class rd:
64 - def __init__(self):
65 self.name='' 66 self.padded_bases=None 67 self.info_items=None 68 self.read_tags=None 69 self.sequence=''
70
71 -class qa:
72 - def __init__(self):
73 self.qual_clipping_start=None 74 self.qual_clipping_end=None 75 self.align_clipping_start=None 76 self.align_clipping_end=None
77
78 -class ds:
79 - def __init__(self):
80 self.chromat_file='' 81 self.phd_file='' 82 self.time='' 83 self.chem='' 84 self.dye='' 85 self.template='' 86 self.direction=''
87
88 -class af:
89 - def __init__(self):
90 self.name='' 91 self.coru=None 92 self.padded_start=None
93
94 -class bs:
95 - def __init__(self):
96 self.name='' 97 self.padded_start=None 98 self.padded_end=None
99
100 -class rt:
101 - def __init__(self):
102 self.name='' 103 self.tag_type='' 104 self.program='' 105 self.padded_start=None 106 self.padded_end=None 107 self.date=''
108
109 -class ct:
110 - def __init__(self):
111 self.name='' 112 self.tag_type='' 113 self.program='' 114 self.padded_start=None 115 self.padded_end=None 116 self.date='' 117 self.notrans='' 118 self.info=[]
119
120 -class wa:
121 - def __init__(self):
122 self.tag_type='' 123 self.program='' 124 self.date='' 125 self.info=[]
126
127 -class wr:
128 - def __init__(self):
129 self.name='' 130 self.aligned='' 131 self.program='' 132 self.date=[]
133
134 -class Reads:
135 - def __init__(self):
136 self.rd=None # one per read 137 self.qa=None # one per read 138 self.ds=None # none or one per read 139 self.rt=None # none or many per read 140 self.wr=None # none or many per read
141
142 -class Contig:
143 """Hold information from a ACE record """
144 - def __init__(self):
145 self.name = '' 146 self.nbases=None 147 self.nreads=None 148 self.nsegments=None 149 self.uorc=None 150 self.sequence=None 151 self.quality=None 152 self.af=[] 153 self.bs=[] 154 self.reads=[] 155 self.ct=None # none or many 156 self.wa=None # none or many
157
158 -class Iterator:
159 """Iterates over a ACE-file with multiple contigs 160 161 Methods: 162 next Return the next record from the stream, or None. 163 """ 164
165 - def __init__(self, handle, parser=None):
166 """__init__(self, handle, parser=None) 167 168 Create a new iterator. handle is a file-like object. parser 169 is an optional Parser object to change the results into another form. 170 If set to None, then the raw contents of the file will be returned. 171 """ 172 173 if type(handle) is not FileType and type(handle) is not InstanceType: 174 raise ValueError, "I expected a file handle or file-like object" 175 self._uhandle = File.UndoHandle(handle) 176 self._parser = parser
177
178 - def next(self):
179 """next(self) -> object 180 181 Return the next contig record from the file. If no more records 182 return None. 183 """ 184 185 lines = [] 186 while 1: 187 # if at beginning, skip the AS and look for first CO command 188 line=self._uhandle.readline() 189 if not line: # empty or corrupt file 190 return None 191 if line[:2]=='CO': 192 lines.append(line) 193 break 194 while 1: 195 line = self._uhandle.readline() 196 if not line: 197 break 198 # If a new record, then put the line back and stop. 199 if lines and line[:2] == 'CO': 200 self._uhandle.saveline(line) 201 break 202 lines.append(line) 203 204 if not lines: 205 return None 206 207 data = ''.join(lines) 208 if self._parser is not None: 209 return self._parser.parse(File.StringHandle(data)) 210 return data
211
212 -class RecordParser(AbstractParser):
213 """Parses ACE file data into a Record object 214 215 """
216 - def __init__(self):
217 self._scanner = _Scanner() 218 self._consumer = _RecordConsumer()
219
220 - def parse(self, handle):
221 if isinstance(handle, File.UndoHandle): 222 uhandle = handle 223 else: 224 uhandle = File.UndoHandle(handle) 225 self._scanner.feed(uhandle, self._consumer) 226 return self._consumer.data
227
228 -class ACEFileRecord:
229 """Holds data of ACE file. 230 """
231 - def __init__(self):
232 self.ncontigs=None 233 self.nreads=None 234 self.contigs=[] 235 self.wa=None # none or many
236
237 - def sort(self):
238 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """ 239 240 ct=[] 241 rt=[] 242 wr=[] 243 # search for tags that aren't in the right position 244 for i in range(len(self.contigs)): 245 c = self.contigs[i] 246 if c.wa: 247 if not self.wa: 248 self.wa=[] 249 self.wa.extend(c.wa) 250 if c.ct: 251 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name] 252 map(self.contigs[i].ct.remove,newcts) 253 ct.extend(newcts) 254 for j in range(len(c.reads)): 255 r = c.reads[j] 256 if r.rt: 257 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name] 258 map(self.contigs[i].reads[j].rt.remove,newrts) 259 rt.extend(newrts) 260 if r.wr: 261 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name] 262 map(self.contigs[i].reads[j].wr.remove,newwrs) 263 wr.extend(newwrs) 264 # now sort them into their proper place 265 for i in range(len(self.contigs)): 266 c = self.contigs[i] 267 for ct_tag in ct: 268 if ct_tag.name==c.name: 269 if self.contigs[i].ct is None: 270 self.contigs[i].ct=[] 271 self.contigs[i].ct.append(ct_tag) 272 if rt or wr: 273 for j in range(len(c.reads)): 274 r = c.reads[j] 275 for rt_tag in rt: 276 if rt_tag.name==r.rd.name: 277 if self.contigs[i].reads[j].rt is None: 278 self.contigs[i].reads[j].rt=[] 279 self.contigs[i].reads[j].rt.append(rt_tag) 280 for wr_tag in wr: 281 if wr_tag.name==r.rd.name: 282 if self.contigs[i].reads[j].wr is None: 283 self.contigs[i].reads[j].wr=[] 284 self.contigs[i].reads[j].wr.append(wr_tag)
285
286 -class ACEParser(AbstractParser):
287 """Parses full ACE file in list of contigs. 288 289 """ 290
291 - def __init__(self):
292 self.data=ACEFileRecord()
293
294 - def parse(self,handle):
295 firstline=handle.readline() 296 # check if the file starts correctly 297 if firstline[:2]!='AS': 298 raise SyntaxError, "File does not start with 'AS'." 299 self.data.ncontigs=eval(firstline.split()[1]) 300 self.data.nreads=eval(firstline.split()[2]) 301 # now read all the records 302 recparser=RecordParser() 303 iter=Iterator(handle,recparser) 304 while 1: 305 rec=iter.next() 306 if not rec: 307 break 308 self.data.contigs.append(rec) 309 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 310 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 311 # if all tags are at the end, they are read with the last contig. The concept of an 312 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 313 # them and put them into the appropriate contig/read instance. 314 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 315 self.data.sort() 316 return self.data
317
318 -class _Scanner:
319 """Scans a ACE-formatted file 320 321 Methods: 322 feed - Feed one ACE record. 323 """
324 - def feed(self, handle, consumer):
325 """feed(self, handle, consumer) 326 327 Feed in ACE data for scanning. handle is a file-like object 328 containing ACE data. consumer is a Consumer object that will 329 receive events as the ACE data is scanned. 330 """ 331 assert isinstance(handle, File.UndoHandle), \ 332 "handle must be an UndoHandle" 333 if handle.peekline(): 334 self._scan_record(handle, consumer)
335
336 - def _scan_record(self, uhandle, consumer):
337 consumer.begin_contig() 338 read_and_call(uhandle,consumer.co_header,start='CO ') 339 consumer.co_data(self._scan_sequence_data(uhandle)) 340 read_and_call_while(uhandle,consumer.noevent,blank=1) 341 read_and_call(uhandle,consumer.bq_header,start='BQ') 342 consumer.bq_data(self._scan_bq_data(uhandle, consumer)) 343 read_and_call_while(uhandle,consumer.noevent,blank=1) 344 read_and_call_while(uhandle,consumer.af,start='AF ') 345 read_and_call_while(uhandle,consumer.noevent,blank=1) 346 read_and_call_while(uhandle,consumer.bs,start='BS ') 347 # now read all the read data 348 # it starts with a 'RD', and then a mandatory QA 349 # then follows an optional DS 350 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actial read or contig, 351 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 352 # with that later. 353 while 1: 354 # each read must have a rd and qa 355 read_and_call_until(uhandle,consumer.noevent,start='RD ') 356 read_and_call(uhandle,consumer.rd_header,start='RD ') 357 consumer.rd_data(self._scan_sequence_data(uhandle)) 358 read_and_call_while(uhandle,consumer.noevent,blank=1) 359 read_and_call(uhandle,consumer.qa,start='QA ') 360 # now one ds can follow 361 try: 362 read_and_call_while(uhandle,consumer.noevent,blank=1) 363 attempt_read_and_call(uhandle,consumer.ds,start='DS ') 364 except SyntaxError: 365 # file ends 366 consumer.end_contig() 367 return 368 # the file could just end, or there's some more stuff. In ace files, everything can happen. 369 # the following tags are interspersed between reads and can ap[pear multiple times. 370 while 1: 371 # something left 372 try: 373 read_and_call_while(uhandle,consumer.noevent,blank=1) 374 except SyntaxError: 375 # file ends here 376 consumer.end_contig() 377 return 378 else: 379 if attempt_read_and_call(uhandle,consumer.rt_start,start='RT'): 380 consumer.rt_data(self._scan_bracket_tags(uhandle)) 381 elif attempt_read_and_call(uhandle,consumer.wr_start,start='WR'): 382 consumer.wr_data(self._scan_bracket_tags(uhandle)) 383 elif attempt_read_and_call(uhandle,consumer.wa_start,start='WA'): 384 consumer.wa_data(self._scan_bracket_tags(uhandle)) 385 elif attempt_read_and_call(uhandle,consumer.ct_start,start='CT'): 386 consumer.ct_data(self._scan_bracket_tags(uhandle)) 387 else: 388 line=safe_peekline(uhandle) 389 break 390 if not line.startswith('RD'): # another read? 391 consumer.end_contig() 392 break
393
394 - def _scan_bq_data(self, uhandle, consumer):
395 """Scans multiple lines of quality data and concatenates them.""" 396 397 qual='' 398 while 1: 399 line=uhandle.readline() 400 if is_blank_line(line): 401 uhandle.saveline(line) 402 break 403 qual+=' '+line 404 return qual
405
406 - def _scan_sequence_data(self,uhandle):
407 """Scans multiple lines of sequence data and concatenates them.""" 408 409 seq='' 410 while 1: 411 line=uhandle.readline() 412 if is_blank_line(line): 413 uhandle.saveline(line) 414 break 415 seq+=line.strip() 416 return seq
417
418 - def _scan_bracket_tags(self,uhandle):
419 """Reads the data lines of a {} tag.""" 420 421 fulltag=[] 422 while 1: 423 line=uhandle.readline().strip() 424 fulltag.append(line) 425 if line.endswith('}'): 426 fulltag[-1]=fulltag[-1][:-1] # delete the ending } 427 if fulltag[-1]=='': 428 fulltag=fulltag[:-1] # delete empty line 429 break 430 return fulltag
431
432 -class _RecordConsumer(AbstractConsumer):
433 """Reads the ace tags into data records.""" 434
435 - def __init__(self):
436 self.data = None
437
438 - def begin_contig(self):
439 self.data = Contig()
440
441 - def end_contig(self):
442 pass
443
444 - def co_header(self,line):
445 header=line.split() 446 self.data.name=header[1] 447 self.data.nbases=eval(header[2]) 448 self.data.nreads=eval(header[3]) 449 self.data.nsegments=eval(header[4]) 450 self.data.uorc=header[5]
451
452 - def co_data(self,seq):
453 self.data.sequence=seq
454
455 - def bq_header(self,line):
456 pass
457
458 - def bq_data(self,qual):
459 self.data.quality=map(eval,qual.split())
460
461 - def af(self,line):
462 header=line.split() 463 afdata=af() 464 afdata.name=header[1] 465 afdata.coru=header[2] 466 afdata.padded_start=eval(header[3]) 467 self.data.af.append(afdata)
468
469 - def bs(self,line):
470 header=line.split() 471 bsdata=bs() 472 bsdata.padded_start=eval(header[1]) 473 bsdata.padded_end=eval(header[2]) 474 bsdata.name=header[3] 475 self.data.bs.append(bsdata)
476
477 - def rd_header(self,line):
478 header=line.split() 479 # Reads start with rd, so we create a new read record here 480 self.data.reads.append(Reads()) 481 rddata=rd() 482 rddata.name=header[1] 483 rddata.padded_bases=eval(header[2]) 484 rddata.info_items=eval(header[3]) 485 rddata.read_tags=eval(header[4]) 486 self.data.reads[-1].rd=rddata
487
488 - def rd_data(self,seq):
489 self.data.reads[-1].rd.sequence=seq
490
491 - def qa(self,line):
492 header=map(eval,line.split()[1:]) 493 qadata=qa() 494 qadata.qual_clipping_start=header[0] 495 qadata.qual_clipping_end=header[1] 496 qadata.align_clipping_start=header[2] 497 qadata.align_clipping_end=header[3] 498 self.data.reads[-1].qa=qadata
499
500 - def ds(self,line):
501 dsdata=ds() 502 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION'] 503 poss=map(line.find,tags) 504 tagpos=dict(zip(poss,tags)) 505 if tagpos.has_key(-1): 506 del tagpos[-1] 507 ps=tagpos.keys() 508 ps.sort() 509 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]): 510 setattr(dsdata,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip()) 511 self.data.reads[-1].ds=dsdata
512
513 - def ct_start(self,line):
514 if not line.strip().endswith('{'): 515 print line 516 raise SyntaxError, 'CT tag does not start with CT{' 517 ctdata=ct() 518 if self.data.ct is None: 519 self.data.ct=[] 520 self.data.ct.append(ctdata)
521
522 - def ct_data(self,taglines):
523 if len(taglines)<1: 524 raise SyntaxError, 'Missing header line in CT tag' 525 header=taglines[0].split() 526 self.data.ct[-1].name=header[0] 527 self.data.ct[-1].tag_type=header[1] 528 self.data.ct[-1].program=header[2] 529 self.data.ct[-1].padded_start=eval(header[3]) 530 self.data.ct[-1].padded_end=eval(header[4]) 531 self.data.ct[-1].date=header[5] 532 if len(header)==7: 533 self.data.ct[-1].notrans=header[6] 534 self.data.ct[-1].info=taglines[1:]
535
536 - def rt_start(self,line):
537 if not line.strip().endswith('{'): 538 raise SyntaxError, 'RT tag does not start with RT{' 539 rtdata=rt() 540 # now if we're at the end of the file, this rt could belong to a previous read, not the actual one 541 # we store it here were it appears, the user can sort later. 542 if self.data.reads[-1].rt is None: 543 self.data.reads[-1].rt=[] 544 self.data.reads[-1].rt.append(rtdata)
545
546 - def rt_data(self,taglines):
547 if len(taglines)<1: 548 raise SyntaxError, 'Missing header line in RT tag' 549 header=taglines[0].split() 550 self.data.reads[-1].rt[-1].name=header[0] 551 self.data.reads[-1].rt[-1].tag_type=header[1] 552 self.data.reads[-1].rt[-1].program=header[2] 553 self.data.reads[-1].rt[-1].padded_start=eval(header[3]) 554 self.data.reads[-1].rt[-1].padded_end=eval(header[4]) 555 self.data.reads[-1].rt[-1].date=header[5]
556
557 - def wa_start(self,line):
558 if not line.strip().endswith('{'): 559 raise SyntaxError, 'WA tag does not start with WA{' 560 wadata=wa() 561 if self.data.wa is None: 562 self.data.wa=[] 563 self.data.wa.append(wadata)
564
565 - def wa_data(self,taglines):
566 if len(taglines)<1: 567 raise SyntaxError, 'Missing header line in WA tag' 568 header=taglines[0].split() 569 self.data.wa[-1].tag_type=header[0] 570 self.data.wa[-1].program=header[1] 571 self.data.wa[-1].date=header[2] 572 self.data.wa[-1].info=taglines[1:]
573
574 - def wr_start(self,line):
575 if not line.strip().endswith('{'): 576 raise SyntaxError, 'WR tag does not start with WR{' 577 wrdata=wr() 578 if self.data.reads[-1].wr is None: 579 self.data.reads[-1].wr=[] 580 self.data.reads[-1].wr.append(wrdata)
581
582 - def wr_data(self,taglines):
583 if len(taglines)<1: 584 raise SyntaxError, 'Missing header line in WR tag' 585 header=taglines[0].split() 586 self.data.reads[-1].wr[-1].name=header[0] 587 self.data.reads[-1].wr[-1].aligned=header[1] 588 self.data.reads[-1].wr[-1].program=header[2] 589 self.data.reads[-1].wr[-1].date=header[3]
590