Package Bio :: Package SeqIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PhylipIO

  1  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  Note 
  8  ==== 
  9  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 10  a dot/period (".") in a sequence is interpreted as meaning the same 
 11  character as in the first sequence.  The PHYLIP 3.6 documentation says: 
 12   
 13     "a period was also previously allowed but it is no longer allowed, 
 14     because it sometimes is used in different senses in other programs" 
 15   
 16  At the time of writing, we do nothing special with a dot/period. 
 17  """ 
 18   
 19  from Bio.Alphabet import single_letter_alphabet 
 20  from Bio.Seq import Seq 
 21  from Bio.SeqRecord import SeqRecord 
 22  from Interfaces import SequenceWriter 
 23  from sets import Set 
 24   
 25  #This is a generator function! 
 26  #TODO - Should the default be Gapped(single_letter_alphabet) instead? 
27 -def PhylipIterator(handle, alphabet = single_letter_alphabet) :
28 """Reads a Phylip alignment file returning a SeqRecord object iterator 29 30 Record identifiers are limited to at most 10 characters. 31 32 It only copes with interlaced phylip files! Sequential files won't work 33 where the sequences are split over multiple lines. 34 35 For more information on the file format, please see: 36 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 37 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 38 """ 39 line = handle.readline() 40 if not line: return 41 line = line.strip() 42 parts = filter(None, line.split()) 43 if len(parts)<>2 : 44 raise SyntaxError("First line should have two integers") 45 try : 46 number_of_seqs = int(parts[0]) 47 length_of_seqs = int(parts[1]) 48 except ValueError: 49 raise SyntaxError("First line should have two integers") 50 51 ids = [] 52 seqs = [] 53 54 #Expects STRICT truncation/padding to 10 characters 55 #Does not require any white space between name and seq. 56 for i in range(0,number_of_seqs) : 57 line = handle.readline().rstrip() 58 ids.append(line[:10].strip()) #first ten characters 59 seqs.append([line[10:].strip().replace(" ","")]) 60 61 line="" 62 while True : 63 #Skip any blank lines between blocks... 64 while ""==line.strip(): 65 line = handle.readline() 66 if not line : break #end of file 67 if not line : break 68 #print "New block..." 69 for i in range(0,number_of_seqs) : 70 seqs[i].append(line.strip().replace(" ","")) 71 line = handle.readline() 72 if (not line) and i+1 < number_of_seqs : 73 raise SyntaxError("End of file mid-block") 74 if not line : break #end of file 75 76 for i in range(0,number_of_seqs) : 77 seq = "".join(seqs[i]) 78 if len(seq)<>length_of_seqs : 79 raise SyntaxError("Sequence %i length %i, expected length %i" \ 80 % (i+1, len(seq), length_of_seqs)) 81 yield SeqRecord(Seq(seq, alphabet), id=ids[i], name=ids[i], description="")
82
83 -class PhylipWriter(SequenceWriter):
84 """Write interlaced Phylip sequence alignments 85 86 For more information on the file format, please see: 87 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 88 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 89 90 All sequences must be the same length."""
91 - def __init__(self, handle, truncate=10):
92 """Creates the writer object 93 94 Use the method write_file() to actually record your sequence records.""" 95 self.handle = handle 96 self.truncate = truncate
97
98 - def write_file(self, records) :
99 """Use this to write an entire file containing the given records. 100 101 If records is an iterator that does not support len(records) or 102 records[index] then it is converted into a list. 103 """ 104 #Need length, and multiple passes - and iterator will not do. 105 records = list(records) 106 107 if len(records)==0 : 108 raise ValueError("Must have at least one sequence") 109 length_of_sequences = len(records[0].seq) 110 for record in records : 111 if length_of_sequences <> len(record.seq) : 112 raise ValueError("Sequences must all be the same length") 113 if length_of_sequences <= 0 : 114 raise ValueError("Non-empty sequences are required") 115 116 if len(records) > len(Set([r.id[:self.truncate] for r in records])) : 117 raise ValueError("Repeated identifier, possibly due to truncation") 118 119 handle = self.handle 120 121 # From experimentation, the use of tabs is not understood by the 122 # EMBOSS suite. The nature of the expected white space is not 123 # defined, simply "These are in free format, separated by blanks" 124 handle.write(" %i %s\n" % (len(records), length_of_sequences)) 125 block=0 126 while True : 127 for record in records : 128 if block==0 : 129 #Write name (truncated/padded to 10 characters) 130 """ 131 Quoting the PHYLIP version 3.6 documentation: 132 133 The name should be ten characters in length, filled out to 134 the full ten characters by blanks if shorter. Any printable 135 ASCII/ISO character is allowed in the name, except for 136 parentheses ("(" and ")"), square brackets ("[" and "]"), 137 colon (":"), semicolon (";") and comma (","). If you forget 138 to extend the names to ten characters in length by blanks, 139 the program [i.e. PHYLIP] will get out of synchronization with 140 the contents of the data file, and an error message will result. 141 142 Note that Tab characters count as only one character in the 143 species names. Their inclusion can cause trouble. 144 """ 145 name = record.id.strip() 146 #Either remove the banned characters, or map them to something 147 #else like an underscore "_" or pipe "|" character... 148 for char in "[]()," : 149 name = name.replace(char,"") 150 for char in ":;" : 151 name = name.replace(char,"|") 152 153 #Now truncate and right pad to expected length. 154 handle.write(name[:self.truncate].ljust(self.truncate)) 155 else : 156 #write 10 space indent 157 handle.write(" "*self.truncate) 158 #Write five chunks of ten letters per line... 159 for chunk in range(0,5) : 160 i = block*50 + chunk*10 161 seq_segment = record.seq.tostring()[i:i+10] 162 #TODO - Force any gaps to be '-' character? Look at the alphabet... 163 #TODO - How to cope with '?' or '.' in the sequence? 164 handle.write(" %s" % seq_segment) 165 if i+10 > length_of_sequences : break 166 handle.write("\n") 167 block=block+1 168 if block*50 > length_of_sequences : break 169 handle.write("\n")
170 171 #Don't close the handle. Doing so would prevent this code 172 #from writing concatenated phylip files which are used 173 #in phylogenetic bootstrapping 174 #handle.close() 175 if __name__=="__main__" : 176 print "Testing" 177 178 phylip_text=""" 8 286 179 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG 180 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG 181 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG 182 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG 183 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG 184 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA 185 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA 186 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG 187 188 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL 189 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE 190 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG 191 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG 192 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS 193 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA 194 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG 195 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS 196 197 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE 198 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD 199 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA 200 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE 201 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD 202 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK 203 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA 204 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE 205 206 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA 207 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA 208 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM 209 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA 210 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA 211 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA 212 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA 213 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA 214 215 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK 216 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK 217 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE 218 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA 219 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED 220 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE 221 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA 222 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE 223 224 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- 225 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH 226 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- 227 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- 228 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- 229 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- 230 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- 231 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- 232 """ 233 234 from cStringIO import StringIO 235 handle = StringIO(phylip_text) 236 count=0 237 for record in PhylipIterator(handle) : 238 count=count+1 239 print record.id 240 #print record.seq.tostring() 241 assert count == 8 242 243 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg 244 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly 245 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys 246 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre 247 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper() 248 assert record.seq.tostring().replace("-","") == expected 249 250 #From here: 251 #http://atgc.lirmm.fr/phyml/usersguide.html 252 phylip_text2="""5 60 253 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG 254 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG 255 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG 256 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG 257 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG 258 259 GAAATGGTCAATATTACAAGGT 260 GAAATGGTCAACATTAAAAGAT 261 GAAATCGTCAATATTAAAAGGT 262 GAAATGGTCAATCTTAAAAGGT 263 GAAATGGTCAATATTAAAAGGT""" 264 265 phylip_text3="""5 60 266 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT 267 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT 268 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT 269 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT 270 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT""" 271 272 handle = StringIO(phylip_text2) 273 list2 = list(PhylipIterator(handle)) 274 handle.close() 275 assert len(list2)==5 276 277 handle = StringIO(phylip_text3) 278 list3 = list(PhylipIterator(handle)) 279 handle.close() 280 assert len(list3)==5 281 282 for i in range(0,5) : 283 list2[i].id == list3[i].id 284 list2[i].seq.tostring() == list3[i].seq.tostring() 285 286 #From here: 287 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 288 #Note the lack of any white space between names 2 and 3 and their seqs. 289 phylip_text4=""" 5 42 290 Turkey AAGCTNGGGC ATTTCAGGGT 291 Salmo gairAAGCCTTGGC AGTGCAGGGT 292 H. SapiensACCGGTTGGC CGTTCAGGGT 293 Chimp AAACCCTTGC CGTTACGCTT 294 Gorilla AAACCCTTGC CGGTACGCTT 295 296 GAGCCCGGGC AATACAGGGT AT 297 GAGCCGTGGC CGGGCACGGT AT 298 ACAGGTTGGC CGTTCAGGGT AA 299 AAACCGAGGC CGGGACACTC AT 300 AAACCATTGC CGGTACGCTT AA""" 301 302 #From here: 303 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 304 phylip_text5=""" 5 42 305 Turkey AAGCTNGGGC ATTTCAGGGT 306 GAGCCCGGGC AATACAGGGT AT 307 Salmo gairAAGCCTTGGC AGTGCAGGGT 308 GAGCCGTGGC CGGGCACGGT AT 309 H. SapiensACCGGTTGGC CGTTCAGGGT 310 ACAGGTTGGC CGTTCAGGGT AA 311 Chimp AAACCCTTGC CGTTACGCTT 312 AAACCGAGGC CGGGACACTC AT 313 Gorilla AAACCCTTGC CGGTACGCTT 314 AAACCATTGC CGGTACGCTT AA""" 315 316 phylip_text5a=""" 5 42 317 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT 318 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT 319 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA 320 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT 321 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA""" 322 323 handle = StringIO(phylip_text4) 324 list4 = list(PhylipIterator(handle)) 325 handle.close() 326 assert len(list4)==5 327 328 handle = StringIO(phylip_text5) 329 try : 330 list5 = list(PhylipIterator(handle)) 331 assert len(list5)==5 332 print "That should have failed..." 333 except SyntaxError : 334 print "Evil multiline non-interlaced example failed as expected" 335 handle.close() 336 337 handle = StringIO(phylip_text5a) 338 list5 = list(PhylipIterator(handle)) 339 handle.close() 340 assert len(list5)==5 341 342 for i in range(0,5) : 343 list4[i].id == list5[i].id 344 list4[i].seq.tostring() == list5[i].seq.tostring() 345 346 347 348 """ 349 handle = StringIO(phylip_text) 350 out_handle=open("/tmp/test.phy","w") 351 writer = PhylipWriter(out_handle) 352 writer.write_file(PhylipIterator(handle)) 353 out_handle.close() 354 355 print "---------------------" 356 357 print open("/tmp/test.phy").read() 358 """ 359