Package Bio :: Package SeqIO :: Module StockholmIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.StockholmIO

  1  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  from Bio.Alphabet import single_letter_alphabet 
  7  from Bio.Seq import Seq 
  8  from Bio.SeqRecord import SeqRecord 
  9  from Interfaces import InterlacedSequenceIterator, SequentialSequenceWriter 
 10   
11 -class StockholmIterator(InterlacedSequenceIterator) :
12 """Loads a Stockholm file from PFAM into SeqRecord objects 13 14 The entire file is loaded, and any sequence can be accessed using 15 the [index] notation. 16 17 This parser will detect if the Stockholm file follows the PFAM conventions 18 for sequence specific meta-data (lines starting #=GS and #=GR) and 19 populates the SeqRecord fields accordingly. 20 21 Any annotation which does not follow the PFAM conventions is currently 22 ignored. 23 24 If an accession is provided for an entry in the meta data, IT WILL NOT 25 be used as the record.id (it will be recorded in the record's annotations). 26 This is because some files have (sub) sequences from different parts of 27 the same accession (differentiated by different start-end positions). 28 29 Wrap-around alignments are not supported - each sequences must be on 30 a single line. However, interlaced sequences should work. 31 32 For more information on the file format, please see: 33 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 34 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 35 36 For consistency with BioPerl and EMBOSS we call this the "stockholm" 37 format. 38 """ 39 40 #These dictionaries should be kept in sync with those 41 #defined in the PfamStockholmWriter class. 42 pfam_gr_mapping = {"SS" : "secondary_structure", 43 "SA" : "surface_accessibility", 44 "TM" : "transmembrane", 45 "PP" : "posterior_probability", 46 "LI" : "ligand_binding", 47 "AS" : "active_site", 48 "IN" : "intron"} 49 #Following dictionary deliberately does not cover AC, DE or DR 50 pfam_gs_mapping = {"OS" : "organism", 51 "OC" : "organism_classification", 52 "LO" : "look"} 53 54 #TODO - Should the default be Gapped(single_letter_alphabet) instead?
55 - def __init__(self, handle, alphabet = single_letter_alphabet) :
56 """Create a StockholmIterator object (which returns SeqRecord objects). 57 58 handle - input file 59 alphabet - optional alphabet 60 """ 61 self.handle = handle 62 self.alphabet = alphabet 63 64 self.sequences = {} 65 self.ids = [] 66 self.ParseAlignment() 67 self.move_start()
68
69 - def ParseAlignment(self):
70 line = self.handle.readline() 71 if not line: 72 #Empty file - just give up. 73 return 74 if not line.strip() == '# STOCKHOLM 1.0': 75 raise SyntaxError("Did not find STOCKHOLM header") 76 #import sys 77 #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0' 78 79 # Note: If this file follows the PFAM conventions, there should be 80 # a line containing the number of sequences, e.g. "#=GF SQ 67" 81 # We do not check for this - perhaps we should, and verify that 82 # if present it agrees with our parsing. 83 84 seqs = {} 85 ids = [] 86 gs = {} 87 gr = {} 88 passed_end_alignment = False 89 while 1: 90 line = self.handle.readline() 91 if not line: break #end of file 92 line = line.strip() #remove trailing \n 93 if line == "//" : 94 #The "//" line indicates the end of the alignment. 95 #There may still be more meta-data 96 passed_end_alignment = True 97 elif line == "" : 98 #blank line, ignore 99 pass 100 elif line[0] <> "#" : 101 #Sequence 102 #Format: "<seqname> <sequence>" 103 assert not passed_end_alignment 104 parts = [x.strip() for x in line.split(" ",1)] 105 if len(parts) <> 2 : 106 #This might be someone attempting to store a zero length sequence? 107 raise SyntaxError("Could not split line into identifier " \ 108 + "and sequence:\n" + line) 109 id, seq = parts 110 if id not in ids : 111 ids.append(id) 112 seqs.setdefault(id, '') 113 seqs[id] += seq.replace(".","-") 114 elif len(line) >= 5 : 115 #Comment line or meta-data 116 if line[:5] == "#=GF " : 117 #Generic per-File annotation, free text 118 #Format: #=GF <feature> <free text> 119 pass 120 elif line[:5] == '#=GC ': 121 #Generic per-Column annotation, exactly 1 char per column 122 #Format: "#=GC <feature> <exactly 1 char per column>" 123 pass 124 elif line[:5] == '#=GS ': 125 #Generic per-Sequence annotation, free text 126 #Format: "#=GS <seqname> <feature> <free text>" 127 id, feature, text = line[5:].strip().split(None,2) 128 #if id not in ids : 129 # ids.append(id) 130 if id not in gs : 131 gs[id] = {} 132 if feature not in gs[id] : 133 gs[id][feature] = [text] 134 else : 135 gs[id][feature].append(text) 136 elif line[:5] == "#=GR " : 137 #Generic per-Sequence AND per-Column markup 138 #Format: "#=GR <seqname> <feature> <exactly 1 char per column>" 139 id, feature, text = line[5:].strip().split(None,2) 140 #if id not in ids : 141 # ids.append(id) 142 if id not in gr : 143 gr[id] = {} 144 if feature not in gr[id]: 145 gr[id][feature] = "" 146 gr[id][feature] += text.strip() # append to any previous entry 147 #TODO - Should we check the length matches the alignment length? 148 # For iterlaced sequences the GR data can be split over 149 # multiple lines 150 #Next line... 151 152 153 assert len(seqs) <= len(ids) 154 #assert len(gs) <= len(ids) 155 #assert len(gr) <= len(ids) 156 157 #This is some paranoia based on some pathelogical test cases. 158 if ids and seqs : 159 align_len = None 160 for id, seq in seqs.iteritems() : 161 assert id in ids 162 if align_len is None : 163 align_len = len(seq) 164 elif align_len <> len(seq) : 165 raise SyntaxError("Sequences have different lengths, or repeated identifier") 166 167 168 self.ids = ids 169 self.sequences = seqs 170 self.seq_annotation = gs 171 self.seq_col_annotation = gr 172 self.move_start()
173
174 - def __len__(self) :
175 return len(self.ids)
176
177 - def _identifier_split(self, identifier) :
178 """Returns (name,start,end) string tuple from an identier""" 179 if identifier.find("/")<>-1 : 180 start_end = identifier.split("/",1)[1] 181 if start_end.count("-")==1 : 182 start, end = start_end.split("-") 183 name = identifier.split("/",1)[0] 184 return (name, start, end) 185 return (identifier, None, None)
186
187 - def _get_meta_data(self, identifier, meta_dict) :
188 """Takes an itentifier and returns dict of all meta-data matching it. 189 190 For example, given "Q9PN73_CAMJE/149-220" will return all matches to 191 this or "Q9PN73_CAMJE" which the identifier without its /start-end 192 suffix. 193 194 In the example below, the suffix is required to match the AC, but must 195 be removed to match the OS and OC meta-data. 196 197 # STOCKHOLM 1.0 198 #=GS Q9PN73_CAMJE/149-220 AC Q9PN73 199 ... 200 Q9PN73_CAMJE/149-220 NKA... 201 ... 202 #=GS Q9PN73_CAMJE OS Campylobacter jejuni 203 #=GS Q9PN73_CAMJE OC Bacteria 204 205 This function will return an empty dictionary if no data is found""" 206 name, start, end = self._identifier_split(identifier) 207 if name==identifier : 208 identifier_keys = [identifier] 209 else : 210 identifier_keys = [identifier, name] 211 answer = {} 212 for identifier_key in identifier_keys : 213 try : 214 for feature_key in meta_dict[identifier_key] : 215 answer[feature_key] = meta_dict[identifier_key][feature_key] 216 except KeyError : 217 pass 218 return answer
219
220 - def _populate_meta_data(self, identifier, record) :
221 """Adds meta-date to a SecRecord's annotations dictionary 222 223 This function applies the PFAM conventions.""" 224 225 seq_data = self._get_meta_data(identifier, self.seq_annotation) 226 for feature in seq_data : 227 #Note this dictionary contains lists! 228 if feature=="AC" : #ACcession number 229 assert len(seq_data[feature])==1 230 record.annotations["accession"]=seq_data[feature][0] 231 elif feature=="DE" : #DEscription 232 record.description = "\n".join(seq_data[feature]) 233 elif feature=="DR" : #Database Reference 234 #Should we try and parse the strings? 235 record.dbxrefs = seq_data[feature] 236 elif feature in self.pfam_gs_mapping : 237 record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(seq_data[feature]) 238 else : 239 #Ignore it? 240 record.annotations["GS:" + feature] = ", ".join(seq_data[feature]) 241 242 seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation) 243 for feature in seq_col_data : 244 #Note this dictionary contains strings! 245 if feature in self.pfam_gr_mapping : 246 record.annotations[self.pfam_gr_mapping[feature]] = seq_col_data[feature] 247 else : 248 #Ignore it? 249 record.annotations["GR:" + feature] = seq_col_data[feature]
250
251 - def __getitem__(self, i):
252 """Provides random access to the SeqRecords""" 253 if i < 0 or i >= len(self.ids) : raise ValueError 254 id = self.ids[i] 255 seq_len = len(self.sequences[id]) 256 257 name, start, end = self._identifier_split(id) 258 259 record = SeqRecord(Seq(self.sequences[id], self.alphabet), id=id, name=name, description=id) 260 261 if start : record.annotations["start"] = int(start) 262 if end : record.annotations["end"] = int(end) 263 264 #will be overridden by _populate_meta_data if an explicit accession is provided: 265 record.annotations["accession"]=name 266 267 self._populate_meta_data(id, record) 268 269 #DO NOT TOUCH self._n 270 return record
271
272 -class StockholmWriter(SequentialSequenceWriter):
273 """Class to write PFAM style Stockholm format files 274 275 Note that sequences and their annotation are recorded 276 together (rather than having a block of annotation followed 277 by a block of aligned sequences). 278 """ 279 280 #These dictionaries should be kept in sync with those 281 #defined in the PfamStockholmIterator class. 282 pfam_gr_mapping = { "secondary_structure" : "SS", 283 "surface_accessibility" : "SA", 284 "transmembrane" : "TM", 285 "posterior_probability" : "PP", 286 "ligand_binding" : "LI", 287 "active_site" : "AS", 288 "intron" : "IN"} 289 #Following dictionary deliberately does not cover AC, DE or DR 290 pfam_gs_mapping = {"organism" : "OS", 291 "organism_classification" : "OC", 292 "look" : "LO"} 293
294 - def __init__(self, handle):
295 """Creates the writer object 296 297 Use the method write_file() to actually record your sequence records.""" 298 SequentialSequenceWriter.__init__(self, handle) 299 self._ids_written = [] 300 self._length_of_sequences = None
301
302 - def write_header(self, count):
303 """Must supply the number of records (count)""" 304 SequentialSequenceWriter.write_header(self) # sets flags 305 self.handle.write("# STOCKHOLM 1.0\n") 306 self.handle.write("#=GF SQ %i\n" % count)
307
308 - def write_file(self, records) :
309 """Use this to write an entire file containing the given records. 310 311 records - A list or iterator returning SeqRecord objects. 312 If len(records) is not supported, then it will be 313 converted into a list. 314 315 This method should only be called once for each file.""" 316 try : 317 #This will work for a list, and some of the SeqIO 318 #iterators too, like the StockholmIterator 319 count = len(records) 320 except TypeError : 321 #Probably have an standard iterator, not a list... 322 records = list(records) 323 count = len(records) 324 325 if count == 0 : 326 raise ValueError("Must have at least one sequence") 327 328 self.write_header(count) 329 self.write_records(records) 330 self.write_footer()
331 #Don't automatically close the file. This would prevent 332 #things like writing concatenated alignments as used for 333 #phylogenetic bootstrapping (usually done with phylip). 334 #self.close() 335
336 - def write_record(self, record):
337 """Write a single Stockholm record to the file""" 338 assert self._header_written 339 assert not self._footer_written 340 self._record_written = True 341 342 if self._length_of_sequences is None : 343 self._length_of_sequences = len(record.seq) 344 elif self._length_of_sequences <> len(record.seq) : 345 raise ValueError("Sequences must all be the same length") 346 347 if self._length_of_sequences == 0 : 348 raise ValueError("Non-empty sequences are required") 349 350 #For the case for stockholm to stockholm, try and use record.name 351 seq_name = record.id 352 if record.name is not None : 353 if "accession" in record.annotations : 354 if record.id == record.annotations["accession"] : 355 seq_name = record.name 356 357 #In the Stockholm file format, spaces are not allowed in the id 358 seq_name = seq_name.replace(" ","_") 359 360 if "start" in record.annotations \ 361 and "end" in record.annotations : 362 suffix = "/%s-%s" % (str(record.annotations["start"]), 363 str(record.annotations["end"])) 364 if seq_name[-len(suffix):] <> suffix : 365 seq_name = "%s/%s-%s" % (seq_name, 366 str(record.annotations["start"]), 367 str(record.annotations["end"])) 368 369 if seq_name in self._ids_written : 370 raise ValueError("Duplicate record identifier: %s" % seq_name) 371 self._ids_written.append(seq_name) 372 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring())) 373 374 #The recommended placement for GS lines (per sequence annotation) 375 #is above the alignment (as a header block) or just below the 376 #corresponding sequence. 377 # 378 #The recommended placement for GR lines (per sequence per column 379 #annotation such as secondary structure) is just below the 380 #corresponding sequence. 381 # 382 #We put both just below the corresponding sequence as this allows 383 #us to write the file using a single pass through the records. 384 385 #AC = Accession 386 if "accession" in record.annotations : 387 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.annotations["accession"]))) 388 elif record.id : 389 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.id))) 390 391 #DE = description 392 if record.description : 393 self.handle.write("#=GS %s DE %s\n" % (seq_name, self.clean(record.description))) 394 395 #DE = database links 396 for xref in record.dbxrefs : 397 self.handle.write("#=GS %s DR %s\n" % (seq_name, self.clean(xref))) 398 399 #GS/GR = other per sequence annotation 400 for key in record.annotations : 401 if key in self.pfam_gs_mapping : 402 self.handle.write("#=GS %s %s %s\n" \ 403 % (seq_name, 404 self.clean(self.pfam_gs_mapping[key]), 405 self.clean(str(record.annotations[key])))) 406 elif key in self.pfam_gr_mapping : 407 if len(str(record.annotations[key]))==len(record.seq) : 408 self.handle.write("#=GR %s %s %s\n" \ 409 % (seq_name, 410 self.clean(self.pfam_gr_mapping[key]), 411 self.clean((record.annotations[key])))) 412 else : 413 #Should we print a warning? 414 pass 415 else : 416 #It doesn't follow the PFAM standards, but should we record this data anyway? 417 pass
418 419 420 421
433 434 if __name__ == "__main__" : 435 print "Testing..." 436 from cStringIO import StringIO 437 438 439 # This example with its slightly odd (partial) annotation is from here: 440 # http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 441 # I don't know what the "GR_O31699/88-139_IN ..." line is meant to be. 442 sth_example = \ 443 """# STOCKHOLM 1.0 444 #=GF ID CBS 445 #=GF AC PF00571 446 #=GF DE CBS domain 447 #=GF AU Bateman A 448 #=GF CC CBS domains are small intracellular modules mostly found 449 #=GF CC in 2 or four copies within a protein. 450 #=GF SQ 67 451 #=GS O31698/18-71 AC O31698 452 #=GS O83071/192-246 AC O83071 453 #=GS O83071/259-312 AC O83071 454 #=GS O31698/88-139 AC O31698 455 #=GS O31698/88-139 OS Bacillus subtilis 456 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS 457 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777 458 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY 459 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE 460 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS 461 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH 462 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 463 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH 464 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH 465 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 466 #=GR O31699/88-139 AS ________________*__________________________ 467 #=GR_O31699/88-139_IN ____________1______________2__________0____ 468 // 469 """ 470 471 # Interlaced example from BioPerl documentation. Also note the blank line. 472 # http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 473 sth_example2 = \ 474 """# STOCKHOLM 1.0 475 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>.. 476 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU 477 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..-- 478 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU 479 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>---- 480 481 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>............... 482 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU 483 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>--------------- 484 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU 485 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>--------------- 486 //""" 487 488 print "--------" 489 print "StockholmIterator(stockholm alignment file)" 490 iterator = StockholmIterator(StringIO(sth_example)) 491 count=0 492 for record in iterator : 493 count=count+1 494 assert count == 5 495 #Check the first record... 496 assert record.id == 'O31699/88-139' 497 assert record.name == 'O31699' 498 assert record.description == 'O31699/88-139' 499 assert len(record.annotations)==4 500 assert record.annotations["accession"]=='O31699' 501 assert record.annotations["start"]==88 502 assert record.annotations["end"]==139 503 assert record.annotations["active_site"]=='________________*__________________________' 504 505 iterator = StockholmIterator(StringIO(sth_example)) 506 count=0 507 for record in iterator : 508 count=count+1 509 break 510 assert count==1 511 #Check the last record... 512 assert record.id == 'O83071/192-246' 513 assert record.name == 'O83071' 514 assert record.description == 'O83071/192-246' 515 assert len(record.annotations)==4 516 assert record.annotations["accession"]=='O83071' 517 assert record.annotations["start"]==192 518 assert record.annotations["end"]==246 519 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777" 520 521 assert [r.id for r in StockholmIterator(StringIO(sth_example))] \ 522 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139'] 523 524 assert [r.name for r in StockholmIterator(StringIO(sth_example))] \ 525 == ['O83071', 'O83071', 'O31698', 'O31698', 'O31699'] 526 527 assert [r.description for r in StockholmIterator(StringIO(sth_example))] \ 528 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139'] 529 530 531 print "--------" 532 print "StockholmIterator(interlaced stockholm alignment file)" 533 iterator = StockholmIterator(StringIO(sth_example2)) 534 record = iterator.next() 535 assert record.id == "AP001509.1" 536 assert len(record.seq) == 104 537 assert "secondary_structure" in record.annotations 538 assert len(record.annotations["secondary_structure"]) == 104 539 record = iterator.next() 540 assert record.id == "AE007476.1" 541 assert len(record.seq) == 104 542 assert "secondary_structure" in record.annotations 543 assert len(record.annotations["secondary_structure"]) == 104 544 record = iterator.next() 545 assert record is None 546