1
2
3
4
5
6 """
7 Note
8 ====
9 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
10 a dot/period (".") in a sequence is interpreted as meaning the same
11 character as in the first sequence. The PHYLIP 3.6 documentation says:
12
13 "a period was also previously allowed but it is no longer allowed,
14 because it sometimes is used in different senses in other programs"
15
16 At the time of writing, we do nothing special with a dot/period.
17 """
18
19 from Bio.Alphabet import single_letter_alphabet
20 from Bio.Seq import Seq
21 from Bio.SeqRecord import SeqRecord
22 from Interfaces import SequenceWriter
23 from sets import Set
24
25
26
28 """Reads a Phylip alignment file returning a SeqRecord object iterator
29
30 Record identifiers are limited to at most 10 characters.
31
32 It only copes with interlaced phylip files! Sequential files won't work
33 where the sequences are split over multiple lines.
34
35 For more information on the file format, please see:
36 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
37 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
38 """
39 line = handle.readline()
40 if not line: return
41 line = line.strip()
42 parts = filter(None, line.split())
43 if len(parts)<>2 :
44 raise SyntaxError("First line should have two integers")
45 try :
46 number_of_seqs = int(parts[0])
47 length_of_seqs = int(parts[1])
48 except ValueError:
49 raise SyntaxError("First line should have two integers")
50
51 ids = []
52 seqs = []
53
54
55
56 for i in range(0,number_of_seqs) :
57 line = handle.readline().rstrip()
58 ids.append(line[:10].strip())
59 seqs.append([line[10:].strip().replace(" ","")])
60
61 line=""
62 while True :
63
64 while ""==line.strip():
65 line = handle.readline()
66 if not line : break
67 if not line : break
68
69 for i in range(0,number_of_seqs) :
70 seqs[i].append(line.strip().replace(" ",""))
71 line = handle.readline()
72 if (not line) and i+1 < number_of_seqs :
73 raise SyntaxError("End of file mid-block")
74 if not line : break
75
76 for i in range(0,number_of_seqs) :
77 seq = "".join(seqs[i])
78 if len(seq)<>length_of_seqs :
79 raise SyntaxError("Sequence %i length %i, expected length %i" \
80 % (i+1, len(seq), length_of_seqs))
81 yield SeqRecord(Seq(seq, alphabet), id=ids[i], name=ids[i], description="")
82
84 """Write interlaced Phylip sequence alignments
85
86 For more information on the file format, please see:
87 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
88 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
89
90 All sequences must be the same length."""
91 - def __init__(self, handle, truncate=10):
92 """Creates the writer object
93
94 Use the method write_file() to actually record your sequence records."""
95 self.handle = handle
96 self.truncate = truncate
97
99 """Use this to write an entire file containing the given records.
100
101 If records is an iterator that does not support len(records) or
102 records[index] then it is converted into a list.
103 """
104
105 records = list(records)
106
107 if len(records)==0 :
108 raise ValueError("Must have at least one sequence")
109 length_of_sequences = len(records[0].seq)
110 for record in records :
111 if length_of_sequences <> len(record.seq) :
112 raise ValueError("Sequences must all be the same length")
113 if length_of_sequences <= 0 :
114 raise ValueError("Non-empty sequences are required")
115
116 if len(records) > len(Set([r.id[:self.truncate] for r in records])) :
117 raise ValueError("Repeated identifier, possibly due to truncation")
118
119 handle = self.handle
120
121
122
123
124 handle.write(" %i %s\n" % (len(records), length_of_sequences))
125 block=0
126 while True :
127 for record in records :
128 if block==0 :
129
130 """
131 Quoting the PHYLIP version 3.6 documentation:
132
133 The name should be ten characters in length, filled out to
134 the full ten characters by blanks if shorter. Any printable
135 ASCII/ISO character is allowed in the name, except for
136 parentheses ("(" and ")"), square brackets ("[" and "]"),
137 colon (":"), semicolon (";") and comma (","). If you forget
138 to extend the names to ten characters in length by blanks,
139 the program [i.e. PHYLIP] will get out of synchronization with
140 the contents of the data file, and an error message will result.
141
142 Note that Tab characters count as only one character in the
143 species names. Their inclusion can cause trouble.
144 """
145 name = record.id.strip()
146
147
148 for char in "[]()," :
149 name = name.replace(char,"")
150 for char in ":;" :
151 name = name.replace(char,"|")
152
153
154 handle.write(name[:self.truncate].ljust(self.truncate))
155 else :
156
157 handle.write(" "*self.truncate)
158
159 for chunk in range(0,5) :
160 i = block*50 + chunk*10
161 seq_segment = record.seq.tostring()[i:i+10]
162
163
164 handle.write(" %s" % seq_segment)
165 if i+10 > length_of_sequences : break
166 handle.write("\n")
167 block=block+1
168 if block*50 > length_of_sequences : break
169 handle.write("\n")
170
171
172
173
174
175 if __name__=="__main__" :
176 print "Testing"
177
178 phylip_text=""" 8 286
179 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
180 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
181 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
182 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
183 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
184 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
185 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
186 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
187
188 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
189 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
190 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
191 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
192 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
193 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
194 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
195 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
196
197 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
198 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
199 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
200 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
201 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
202 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
203 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
204 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
205
206 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
207 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
208 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
209 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
210 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
211 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
212 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
213 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
214
215 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
216 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
217 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
218 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
219 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
220 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
221 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
222 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
223
224 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
225 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
226 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
227 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
228 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
229 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
230 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
231 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
232 """
233
234 from cStringIO import StringIO
235 handle = StringIO(phylip_text)
236 count=0
237 for record in PhylipIterator(handle) :
238 count=count+1
239 print record.id
240
241 assert count == 8
242
243 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
244 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
245 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
246 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
247 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
248 assert record.seq.tostring().replace("-","") == expected
249
250
251
252 phylip_text2="""5 60
253 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
254 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
255 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
256 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
257 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG
258
259 GAAATGGTCAATATTACAAGGT
260 GAAATGGTCAACATTAAAAGAT
261 GAAATCGTCAATATTAAAAGGT
262 GAAATGGTCAATCTTAAAAGGT
263 GAAATGGTCAATATTAAAAGGT"""
264
265 phylip_text3="""5 60
266 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
267 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
268 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
269 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
270 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""
271
272 handle = StringIO(phylip_text2)
273 list2 = list(PhylipIterator(handle))
274 handle.close()
275 assert len(list2)==5
276
277 handle = StringIO(phylip_text3)
278 list3 = list(PhylipIterator(handle))
279 handle.close()
280 assert len(list3)==5
281
282 for i in range(0,5) :
283 list2[i].id == list3[i].id
284 list2[i].seq.tostring() == list3[i].seq.tostring()
285
286
287
288
289 phylip_text4=""" 5 42
290 Turkey AAGCTNGGGC ATTTCAGGGT
291 Salmo gairAAGCCTTGGC AGTGCAGGGT
292 H. SapiensACCGGTTGGC CGTTCAGGGT
293 Chimp AAACCCTTGC CGTTACGCTT
294 Gorilla AAACCCTTGC CGGTACGCTT
295
296 GAGCCCGGGC AATACAGGGT AT
297 GAGCCGTGGC CGGGCACGGT AT
298 ACAGGTTGGC CGTTCAGGGT AA
299 AAACCGAGGC CGGGACACTC AT
300 AAACCATTGC CGGTACGCTT AA"""
301
302
303
304 phylip_text5=""" 5 42
305 Turkey AAGCTNGGGC ATTTCAGGGT
306 GAGCCCGGGC AATACAGGGT AT
307 Salmo gairAAGCCTTGGC AGTGCAGGGT
308 GAGCCGTGGC CGGGCACGGT AT
309 H. SapiensACCGGTTGGC CGTTCAGGGT
310 ACAGGTTGGC CGTTCAGGGT AA
311 Chimp AAACCCTTGC CGTTACGCTT
312 AAACCGAGGC CGGGACACTC AT
313 Gorilla AAACCCTTGC CGGTACGCTT
314 AAACCATTGC CGGTACGCTT AA"""
315
316 phylip_text5a=""" 5 42
317 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
318 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
319 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
320 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
321 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""
322
323 handle = StringIO(phylip_text4)
324 list4 = list(PhylipIterator(handle))
325 handle.close()
326 assert len(list4)==5
327
328 handle = StringIO(phylip_text5)
329 try :
330 list5 = list(PhylipIterator(handle))
331 assert len(list5)==5
332 print "That should have failed..."
333 except SyntaxError :
334 print "Evil multiline non-interlaced example failed as expected"
335 handle.close()
336
337 handle = StringIO(phylip_text5a)
338 list5 = list(PhylipIterator(handle))
339 handle.close()
340 assert len(list5)==5
341
342 for i in range(0,5) :
343 list4[i].id == list5[i].id
344 list4[i].seq.tostring() == list5[i].seq.tostring()
345
346
347
348 """
349 handle = StringIO(phylip_text)
350 out_handle=open("/tmp/test.phy","w")
351 writer = PhylipWriter(out_handle)
352 writer.write_file(PhylipIterator(handle))
353 out_handle.close()
354
355 print "---------------------"
356
357 print open("/tmp/test.phy").read()
358 """
359