Package Bio :: Package SeqIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.ClustalIO

  1  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Bio.SeqIO support for the "clustal" (aka ClustalW) file format. 
  8   
  9  You were expected to use this module via the Bio.SeqIO functions. 
 10  This module has now been replaced by Bio.AlignIO.ClustalIO, and is 
 11  deprecated.""" 
 12   
 13  import warnings 
 14  warnings.warn("Bio.SeqIO.ClustalIO is deprecated.  You can continue to read" \ 
 15                + " and write 'clustal' files with Bio.SeqIO, but this is now" \ 
 16                + " handled via Bio.AlignIO internally.", 
 17                DeprecationWarning) 
 18   
 19  #For reading alignments: 
 20  from Bio.Alphabet import single_letter_alphabet 
 21  from Bio.Seq import Seq 
 22  from Bio.SeqRecord import SeqRecord 
 23   
 24  #For writing alignments: 
 25  from Bio.SeqIO.Interfaces import SequenceWriter 
 26  from Bio.Clustalw import ClustalAlignment 
 27   
 28  #This is a generator function! 
 29  #TODO - Should the default be Gapped(single_letter_alphabet) instead? 
30 -def ClustalIterator(handle, alphabet = single_letter_alphabet) :
31 """Reads a Clustalw file returning a SeqRecord object iterator. 32 33 The entire file is loaded at once, but the SeqRecord objects 34 are only created "on request". 35 36 For more information on the file format, please see: 37 http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format 38 39 You might like to look at Bio.Clustalw which has an interface 40 to the command line tool clustalw, and can also clustal alignment 41 files into Bio.Clustalw.ClustalAlignment objects. 42 43 We call this the "clustal" format which is consistent with EMBOSS. 44 Sadly BioPerl calls it the "clustalw" format, so we can't match 45 them both. 46 """ 47 line = handle.readline() 48 if not line: return 49 if not line[:7] == 'CLUSTAL': 50 raise ValueError("Did not find CLUSTAL header") 51 52 #There should be two blank lines after the header line 53 line = handle.readline() 54 while line.strip() == "" : 55 line = handle.readline() 56 57 #If the alignment contains entries with the same sequence 58 #identifier (not a good idea - but seems possible), then this 59 #dictionary based parser will merge their sequences. Fix this? 60 ids = [] 61 seqs = [] 62 63 #Use the first block to get the sequence identifiers 64 while line.strip() <> "" : 65 if line[0] <> " " : 66 #Sequences identifier... 67 fields = line.rstrip().split() 68 69 #We expect there to be two fields, there can be an optional 70 #"sequence number" field containing the letter count. 71 if len(fields) < 2 or len(fields) > 3: 72 raise ValueError("Could not parse line:\n%s" % line) 73 74 ids.append(fields[0]) 75 seqs.append(fields[1]) 76 77 if len(fields) == 3 : 78 #This MAY be an old style file with a letter count... 79 try : 80 letters = int(fields[2]) 81 except ValueError : 82 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 83 if len(fields[1].replace("-","")) <> letters : 84 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 85 else : 86 #Sequence consensus line... 87 pass 88 line = handle.readline() 89 if not line : break #end of file 90 91 assert line.strip() == "" 92 93 #Loop over any remaining blocks... 94 while True : 95 #There should be a blank line between each block. 96 #Also want to ignore any consensus line from the 97 #previous block. 98 while (not line) or line.strip() == "" or line[0]==" ": 99 line = handle.readline() 100 if not line : break # end of file 101 if not line : break # end of file 102 103 for i in range(len(ids)) : 104 fields = line.rstrip().split() 105 106 #We expect there to be two fields, there can be an optional 107 #"sequence number" field containing the letter count. 108 if len(fields) < 2 or len(fields) > 3: 109 raise ValueError("Could not parse line:\n%s" % line) 110 111 if fields[0] <> ids[i] : 112 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \ 113 % (fields[0], ids[i])) 114 115 #Append the sequence 116 seqs[i] += fields[1] 117 118 if len(fields) == 3 : 119 #This MAY be an old style file with a letter count... 120 try : 121 letters = int(fields[2]) 122 except ValueError : 123 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 124 if len(seqs[i].replace("-","")) <> letters : 125 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 126 127 #Read in the next line 128 line = handle.readline() 129 130 assert len(ids) == len(seqs) 131 alignment_length = len(seqs[0]) 132 for i in range(len(ids)) : 133 if len(seqs[i]) <> alignment_length: 134 raise ValueError("Error parsing alignment - sequences of different length?") 135 yield SeqRecord(Seq(seqs[i], alphabet), id=ids[i])
136
137 -class ClustalWriter(SequenceWriter):
138 """Write Clustal sequence alignments."""
139 - def __init__(self, handle):
140 """Creates the writer object. 141 142 Use the method write_file() to actually record your sequence records.""" 143 self.handle = handle
144
145 - def write_file(self, records) :
146 """Use this to write an entire file containing the given records. 147 148 records - a SeqRecord iterator, or list of SeqRecords 149 150 Right now this code uses Bio.Clustalw.ClustalAlignment to do 151 the hard work - this may change in the future. 152 """ 153 # ToDo - decide if using Bio.Clustalw.ClustalAlignment is 154 # actually the best way to handle this. 155 # 156 # Copying that thirty lines of code (with slight tweaks) 157 # would be much simpler, and would probably run quicker and 158 # use less memory as it doesn't build a ClustalAlignment 159 # object. 160 # 161 # The downside is code duplication. 162 length_of_sequences = None 163 alignment = ClustalAlignment() 164 for record in records : 165 if length_of_sequences is None : 166 length_of_sequences = len(record.seq) 167 elif length_of_sequences <> len(record.seq) : 168 raise ValueError("Sequences must all be the same length") 169 170 if length_of_sequences <= 0 : 171 raise ValueError("Non-empty sequences are required") 172 173 #ToDo, check alphabet for this sequence matches that 174 #specified for the alignment. Not sure how the 175 #alphabet.contains() method is intended to be used, 176 #but it doesn't make sense to me right now. 177 178 #Doing this works, but ClustalAlignment currently uses 179 #the record.descrption when outputing the records. 180 #alignment._records.append(record) 181 182 #Make sure we don't get any spaces in the record 183 #identifier when output in the file by replacing 184 #them with underscores: 185 alignment.add_sequence(record.id.replace(" ","_"), 186 record.seq.tostring()) 187 188 if len(alignment.get_all_seqs()) == 0 : 189 raise ValueError("Must have at least one sequence") 190 191 self.handle.write(str(alignment))
192 #Don't close the handle. Doing so would prevent this code 193 #from writing concatenated Clustal files which might be used 194 #in phylogenetic bootstrapping (very common with phylip). 195 #self.handle.close() 196 197 198 if __name__ == "__main__" : 199 # Run a quick self-test 200 201 #This is a truncated version of the example in Tests/cw02.aln 202 #Notice the inclusion of sequence numbers (right hand side) 203 aln_example1 = \ 204 """CLUSTAL W (1.81) multiple sequence alignment 205 206 207 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 208 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 209 * *: :: :. :* : :. : . :* :: . 210 211 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 212 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 213 : ** **:... *.*** .. 214 215 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 216 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 217 .:* * *: .* :* : :* .* 218 219 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 220 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 221 *::. . .:: :*..* :* .* .. . : . : 222 223 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 224 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 225 *. .:: : . 226 227 """ 228 229 #This example is a truncated version of the dataset used here: 230 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 231 #with the last record repeated twice (deliberate toture test) 232 aln_example2 = \ 233 """CLUSTAL X (1.83) multiple sequence alignment 234 235 236 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 237 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 238 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 239 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 240 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 241 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 242 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 243 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 244 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 245 : . : :. 246 247 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 248 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 249 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 250 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 251 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 252 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 253 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 254 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 255 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 256 ** .: *::::. : :. . ..: 257 258 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 259 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 260 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 261 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 262 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 263 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 264 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 265 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 266 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 267 *.: . * . * *: : 268 269 """ 270 271 from StringIO import StringIO 272 273 records = list(ClustalIterator(StringIO(aln_example1))) 274 assert 2 == len(records) 275 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069" 276 assert records[1].id == "gi|671626|emb|CAA85685.1|" 277 assert records[0].seq.tostring() == \ 278 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 279 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 280 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 281 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 282 "VPTTRAQRRA" 283 284 records = list(ClustalIterator(StringIO(aln_example2))) 285 assert 9 == len(records) 286 assert records[-1].id == "HISJ_E_COLI" 287 assert records[-1].seq.tostring() == \ 288 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 289 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 290 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 291