1
2
3
4
5
6 from Bio.Alphabet import single_letter_alphabet
7 from Bio.Seq import Seq
8 from Bio.SeqRecord import SeqRecord
9 from Interfaces import InterlacedSequenceIterator, SequentialSequenceWriter
10
12 """Loads a Stockholm file from PFAM into SeqRecord objects
13
14 The entire file is loaded, and any sequence can be accessed using
15 the [index] notation.
16
17 This parser will detect if the Stockholm file follows the PFAM conventions
18 for sequence specific meta-data (lines starting #=GS and #=GR) and
19 populates the SeqRecord fields accordingly.
20
21 Any annotation which does not follow the PFAM conventions is currently
22 ignored.
23
24 If an accession is provided for an entry in the meta data, IT WILL NOT
25 be used as the record.id (it will be recorded in the record's annotations).
26 This is because some files have (sub) sequences from different parts of
27 the same accession (differentiated by different start-end positions).
28
29 Wrap-around alignments are not supported - each sequences must be on
30 a single line. However, interlaced sequences should work.
31
32 For more information on the file format, please see:
33 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
34 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
35
36 For consistency with BioPerl and EMBOSS we call this the "stockholm"
37 format.
38 """
39
40
41
42 pfam_gr_mapping = {"SS" : "secondary_structure",
43 "SA" : "surface_accessibility",
44 "TM" : "transmembrane",
45 "PP" : "posterior_probability",
46 "LI" : "ligand_binding",
47 "AS" : "active_site",
48 "IN" : "intron"}
49
50 pfam_gs_mapping = {"OS" : "organism",
51 "OC" : "organism_classification",
52 "LO" : "look"}
53
54
68
70 line = self.handle.readline()
71 if not line:
72
73 return
74 if not line.strip() == '# STOCKHOLM 1.0':
75 raise SyntaxError("Did not find STOCKHOLM header")
76
77
78
79
80
81
82
83
84 seqs = {}
85 ids = []
86 gs = {}
87 gr = {}
88 passed_end_alignment = False
89 while 1:
90 line = self.handle.readline()
91 if not line: break
92 line = line.strip()
93 if line == "//" :
94
95
96 passed_end_alignment = True
97 elif line == "" :
98
99 pass
100 elif line[0] <> "#" :
101
102
103 assert not passed_end_alignment
104 parts = [x.strip() for x in line.split(" ",1)]
105 if len(parts) <> 2 :
106
107 raise SyntaxError("Could not split line into identifier " \
108 + "and sequence:\n" + line)
109 id, seq = parts
110 if id not in ids :
111 ids.append(id)
112 seqs.setdefault(id, '')
113 seqs[id] += seq.replace(".","-")
114 elif len(line) >= 5 :
115
116 if line[:5] == "#=GF " :
117
118
119 pass
120 elif line[:5] == '#=GC ':
121
122
123 pass
124 elif line[:5] == '#=GS ':
125
126
127 id, feature, text = line[5:].strip().split(None,2)
128
129
130 if id not in gs :
131 gs[id] = {}
132 if feature not in gs[id] :
133 gs[id][feature] = [text]
134 else :
135 gs[id][feature].append(text)
136 elif line[:5] == "#=GR " :
137
138
139 id, feature, text = line[5:].strip().split(None,2)
140
141
142 if id not in gr :
143 gr[id] = {}
144 if feature not in gr[id]:
145 gr[id][feature] = ""
146 gr[id][feature] += text.strip()
147
148
149
150
151
152
153 assert len(seqs) <= len(ids)
154
155
156
157
158 if ids and seqs :
159 align_len = None
160 for id, seq in seqs.iteritems() :
161 assert id in ids
162 if align_len is None :
163 align_len = len(seq)
164 elif align_len <> len(seq) :
165 raise SyntaxError("Sequences have different lengths, or repeated identifier")
166
167
168 self.ids = ids
169 self.sequences = seqs
170 self.seq_annotation = gs
171 self.seq_col_annotation = gr
172 self.move_start()
173
176
186
219
250
271
273 """Class to write PFAM style Stockholm format files
274
275 Note that sequences and their annotation are recorded
276 together (rather than having a block of annotation followed
277 by a block of aligned sequences).
278 """
279
280
281
282 pfam_gr_mapping = { "secondary_structure" : "SS",
283 "surface_accessibility" : "SA",
284 "transmembrane" : "TM",
285 "posterior_probability" : "PP",
286 "ligand_binding" : "LI",
287 "active_site" : "AS",
288 "intron" : "IN"}
289
290 pfam_gs_mapping = {"organism" : "OS",
291 "organism_classification" : "OC",
292 "look" : "LO"}
293
295 """Creates the writer object
296
297 Use the method write_file() to actually record your sequence records."""
298 SequentialSequenceWriter.__init__(self, handle)
299 self._ids_written = []
300 self._length_of_sequences = None
301
307
309 """Use this to write an entire file containing the given records.
310
311 records - A list or iterator returning SeqRecord objects.
312 If len(records) is not supported, then it will be
313 converted into a list.
314
315 This method should only be called once for each file."""
316 try :
317
318
319 count = len(records)
320 except TypeError :
321
322 records = list(records)
323 count = len(records)
324
325 if count == 0 :
326 raise ValueError("Must have at least one sequence")
327
328 self.write_header(count)
329 self.write_records(records)
330 self.write_footer()
331
332
333
334
335
337 """Write a single Stockholm record to the file"""
338 assert self._header_written
339 assert not self._footer_written
340 self._record_written = True
341
342 if self._length_of_sequences is None :
343 self._length_of_sequences = len(record.seq)
344 elif self._length_of_sequences <> len(record.seq) :
345 raise ValueError("Sequences must all be the same length")
346
347 if self._length_of_sequences == 0 :
348 raise ValueError("Non-empty sequences are required")
349
350
351 seq_name = record.id
352 if record.name is not None :
353 if "accession" in record.annotations :
354 if record.id == record.annotations["accession"] :
355 seq_name = record.name
356
357
358 seq_name = seq_name.replace(" ","_")
359
360 if "start" in record.annotations \
361 and "end" in record.annotations :
362 suffix = "/%s-%s" % (str(record.annotations["start"]),
363 str(record.annotations["end"]))
364 if seq_name[-len(suffix):] <> suffix :
365 seq_name = "%s/%s-%s" % (seq_name,
366 str(record.annotations["start"]),
367 str(record.annotations["end"]))
368
369 if seq_name in self._ids_written :
370 raise ValueError("Duplicate record identifier: %s" % seq_name)
371 self._ids_written.append(seq_name)
372 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
373
374
375
376
377
378
379
380
381
382
383
384
385
386 if "accession" in record.annotations :
387 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.annotations["accession"])))
388 elif record.id :
389 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.id)))
390
391
392 if record.description :
393 self.handle.write("#=GS %s DE %s\n" % (seq_name, self.clean(record.description)))
394
395
396 for xref in record.dbxrefs :
397 self.handle.write("#=GS %s DR %s\n" % (seq_name, self.clean(xref)))
398
399
400 for key in record.annotations :
401 if key in self.pfam_gs_mapping :
402 self.handle.write("#=GS %s %s %s\n" \
403 % (seq_name,
404 self.clean(self.pfam_gs_mapping[key]),
405 self.clean(str(record.annotations[key]))))
406 elif key in self.pfam_gr_mapping :
407 if len(str(record.annotations[key]))==len(record.seq) :
408 self.handle.write("#=GR %s %s %s\n" \
409 % (seq_name,
410 self.clean(self.pfam_gr_mapping[key]),
411 self.clean((record.annotations[key]))))
412 else :
413
414 pass
415 else :
416
417 pass
418
419
420
421
433
434 if __name__ == "__main__" :
435 print "Testing..."
436 from cStringIO import StringIO
437
438
439
440
441
442 sth_example = \
443 """# STOCKHOLM 1.0
444 #=GF ID CBS
445 #=GF AC PF00571
446 #=GF DE CBS domain
447 #=GF AU Bateman A
448 #=GF CC CBS domains are small intracellular modules mostly found
449 #=GF CC in 2 or four copies within a protein.
450 #=GF SQ 67
451 #=GS O31698/18-71 AC O31698
452 #=GS O83071/192-246 AC O83071
453 #=GS O83071/259-312 AC O83071
454 #=GS O31698/88-139 AC O31698
455 #=GS O31698/88-139 OS Bacillus subtilis
456 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
457 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
458 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
459 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
460 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
461 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
462 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
463 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
464 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
465 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
466 #=GR O31699/88-139 AS ________________*__________________________
467 #=GR_O31699/88-139_IN ____________1______________2__________0____
468 //
469 """
470
471
472
473 sth_example2 = \
474 """# STOCKHOLM 1.0
475 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
476 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
477 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
478 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
479 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
480
481 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
482 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
483 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
484 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
485 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
486 //"""
487
488 print "--------"
489 print "StockholmIterator(stockholm alignment file)"
490 iterator = StockholmIterator(StringIO(sth_example))
491 count=0
492 for record in iterator :
493 count=count+1
494 assert count == 5
495
496 assert record.id == 'O31699/88-139'
497 assert record.name == 'O31699'
498 assert record.description == 'O31699/88-139'
499 assert len(record.annotations)==4
500 assert record.annotations["accession"]=='O31699'
501 assert record.annotations["start"]==88
502 assert record.annotations["end"]==139
503 assert record.annotations["active_site"]=='________________*__________________________'
504
505 iterator = StockholmIterator(StringIO(sth_example))
506 count=0
507 for record in iterator :
508 count=count+1
509 break
510 assert count==1
511
512 assert record.id == 'O83071/192-246'
513 assert record.name == 'O83071'
514 assert record.description == 'O83071/192-246'
515 assert len(record.annotations)==4
516 assert record.annotations["accession"]=='O83071'
517 assert record.annotations["start"]==192
518 assert record.annotations["end"]==246
519 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777"
520
521 assert [r.id for r in StockholmIterator(StringIO(sth_example))] \
522 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']
523
524 assert [r.name for r in StockholmIterator(StringIO(sth_example))] \
525 == ['O83071', 'O83071', 'O31698', 'O31698', 'O31699']
526
527 assert [r.description for r in StockholmIterator(StringIO(sth_example))] \
528 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']
529
530
531 print "--------"
532 print "StockholmIterator(interlaced stockholm alignment file)"
533 iterator = StockholmIterator(StringIO(sth_example2))
534 record = iterator.next()
535 assert record.id == "AP001509.1"
536 assert len(record.seq) == 104
537 assert "secondary_structure" in record.annotations
538 assert len(record.annotations["secondary_structure"]) == 104
539 record = iterator.next()
540 assert record.id == "AE007476.1"
541 assert len(record.seq) == 104
542 assert "secondary_structure" in record.annotations
543 assert len(record.annotations["secondary_structure"]) == 104
544 record = iterator.next()
545 assert record is None
546