1
2
3
4
5
6
7
8 from Bio.Alphabet import single_letter_alphabet
9 from Bio.Seq import Seq
10 from Bio.SeqRecord import SeqRecord
11
12
13 from Bio.SeqIO.Interfaces import SequenceWriter
14 from Bio.Clustalw import ClustalAlignment
15
16
17
19 """Reads a Clustalw file returning a SeqRecord object iterator
20
21 The entire file is loaded at once, but the SeqRecord objects
22 are only created "on request".
23
24 For more information on the file format, please see:
25 http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format
26
27 You might like to look at Bio.Clustalw which has an interface
28 to the command line tool clustalw, and can also clustal alignment
29 files into Bio.Clustalw.ClustalAlignment objects.
30
31 We call this the "clustal" format which is consistent with EMBOSS.
32 Sadly BioPerl calls it the "clustalw" format, so we can't match
33 them both.
34 """
35 line = handle.readline()
36 if not line: return
37 if not line[:7] == 'CLUSTAL':
38 raise SyntaxError("Did not find CLUSTAL header")
39
40
41 line = handle.readline()
42 while line.strip() == "" :
43 line = handle.readline()
44
45
46
47
48 ids = []
49 seqs = []
50
51
52 while line.strip() <> "" :
53 if line[0] <> " " :
54
55 fields = line.rstrip().split()
56
57
58
59 if len(fields) < 2 or len(fields) > 3:
60 raise SyntaxError("Could not parse line:\n%s" % line)
61
62 ids.append(fields[0])
63 seqs.append(fields[1])
64
65 if len(fields) == 3 :
66
67 try :
68 letters = int(fields[2])
69 except ValueError :
70 raise SyntaxError("Could not parse line, bad sequence number:\n%s" % line)
71 if len(fields[1].replace("-","")) <> letters :
72 raise SyntaxError("Could not parse line, invalid sequence number:\n%s" % line)
73 else :
74
75 pass
76 line = handle.readline()
77 if not line : break
78
79 assert line.strip() == ""
80
81
82 while True :
83
84
85
86 while (not line) or line.strip() == "" or line[0]==" ":
87 line = handle.readline()
88 if not line : break
89 if not line : break
90
91 for i in range(len(ids)) :
92 fields = line.rstrip().split()
93
94
95
96 if len(fields) < 2 or len(fields) > 3:
97 raise SyntaxError("Could not parse line:\n%s" % line)
98
99 if fields[0] <> ids[i] :
100 raise SyntaxError("Identifiers out of order? Got '%s' but expected '%s'" \
101 % (fields[0], ids[i]))
102
103
104 seqs[i] += fields[1]
105
106 if len(fields) == 3 :
107
108 try :
109 letters = int(fields[2])
110 except ValueError :
111 raise SyntaxError("Could not parse line, bad sequence number:\n%s" % line)
112 if len(seqs[i].replace("-","")) <> letters :
113 raise SyntaxError("Could not parse line, invalid sequence number:\n%s" % line)
114
115
116 line = handle.readline()
117
118 assert len(ids) == len(seqs)
119 alignment_length = len(seqs[0])
120 for i in range(len(ids)) :
121 if len(seqs[i]) <> alignment_length:
122 raise SyntaxError("Error parsing alignment - sequences of different length?")
123 yield SeqRecord(Seq(seqs[i], alphabet), id=ids[i])
124
126 """Write Clustal sequence alignments"""
128 """Creates the writer object
129
130 Use the method write_file() to actually record your sequence records."""
131 self.handle = handle
132
134 """Use this to write an entire file containing the given records.
135
136 records - a SeqRecord iterator, or list of SeqRecords
137
138 Right now this code uses Bio.Clustalw.ClustalAlignment to do
139 the hard work - this may change in the future.
140 """
141
142
143
144
145
146
147
148
149
150 length_of_sequences = None
151 alignment = ClustalAlignment()
152 for record in records :
153 if length_of_sequences is None :
154 length_of_sequences = len(record.seq)
155 elif length_of_sequences <> len(record.seq) :
156 raise ValueError("Sequences must all be the same length")
157
158 if length_of_sequences <= 0 :
159 raise ValueError("Non-empty sequences are required")
160
161
162
163
164
165
166
167
168
169
170
171
172
173 alignment.add_sequence(record.id.replace(" ","_"),
174 record.seq.tostring())
175
176 if len(alignment.get_all_seqs()) == 0 :
177 raise ValueError("Must have at least one sequence")
178
179 self.handle.write(str(alignment))
180
181
182
183
184
185
186 if __name__ == "__main__" :
187
188
189
190
191 aln_example1 = \
192 """CLUSTAL W (1.81) multiple sequence alignment
193
194
195 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
196 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
197 * *: :: :. :* : :. : . :* :: .
198
199 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
200 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
201 : ** **:... *.*** ..
202
203 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
204 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
205 .:* * *: .* :* : :* .*
206
207 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
208 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
209 *::. . .:: :*..* :* .* .. . : . :
210
211 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
212 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
213 *. .:: : .
214
215 """
216
217
218
219
220 aln_example2 = \
221 """CLUSTAL X (1.83) multiple sequence alignment
222
223
224 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
225 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
226 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
227 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
228 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
229 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
230 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
231 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
232 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
233 : . : :.
234
235 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
236 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
237 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
238 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
239 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
240 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
241 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
242 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
243 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
244 ** .: *::::. : :. . ..:
245
246 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
247 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
248 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
249 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
250 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
251 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
252 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
253 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
254 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
255 *.: . * . * *: :
256
257 """
258
259 from StringIO import StringIO
260
261 records = list(ClustalIterator(StringIO(aln_example1)))
262 assert 2 == len(records)
263 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069"
264 assert records[1].id == "gi|671626|emb|CAA85685.1|"
265 assert records[0].seq.tostring() == \
266 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
267 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
268 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
269 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
270 "VPTTRAQRRA"
271
272 records = list(ClustalIterator(StringIO(aln_example2)))
273 assert 9 == len(records)
274 assert records[-1].id == "HISJ_E_COLI"
275 assert records[-1].seq.tostring() == \
276 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \
277 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \
278 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV"
279