Package Bio :: Package SeqIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.ClustalIO

  1  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  #For reading alignments: 
  8  from Bio.Alphabet import single_letter_alphabet 
  9  from Bio.Seq import Seq 
 10  from Bio.SeqRecord import SeqRecord 
 11   
 12  #For writing alignments: 
 13  from Bio.SeqIO.Interfaces import SequenceWriter 
 14  from Bio.Clustalw import ClustalAlignment 
 15   
 16  #This is a generator function! 
 17  #TODO - Should the default be Gapped(single_letter_alphabet) instead? 
18 -def ClustalIterator(handle, alphabet = single_letter_alphabet) :
19 """Reads a Clustalw file returning a SeqRecord object iterator 20 21 The entire file is loaded at once, but the SeqRecord objects 22 are only created "on request". 23 24 For more information on the file format, please see: 25 http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format 26 27 You might like to look at Bio.Clustalw which has an interface 28 to the command line tool clustalw, and can also clustal alignment 29 files into Bio.Clustalw.ClustalAlignment objects. 30 31 We call this the "clustal" format which is consistent with EMBOSS. 32 Sadly BioPerl calls it the "clustalw" format, so we can't match 33 them both. 34 """ 35 line = handle.readline() 36 if not line: return 37 if not line[:7] == 'CLUSTAL': 38 raise SyntaxError("Did not find CLUSTAL header") 39 40 #There should be two blank lines after the header line 41 line = handle.readline() 42 while line.strip() == "" : 43 line = handle.readline() 44 45 #If the alignment contains entries with the same sequence 46 #identifier (not a good idea - but seems possible), then this 47 #dictionary based parser will merge their sequences. Fix this? 48 ids = [] 49 seqs = [] 50 51 #Use the first block to get the sequence identifiers 52 while line.strip() <> "" : 53 if line[0] <> " " : 54 #Sequences identifier... 55 fields = line.rstrip().split() 56 57 #We expect there to be two fields, there can be an optional 58 #"sequence number" field containing the letter count. 59 if len(fields) < 2 or len(fields) > 3: 60 raise SyntaxError("Could not parse line:\n%s" % line) 61 62 ids.append(fields[0]) 63 seqs.append(fields[1]) 64 65 if len(fields) == 3 : 66 #This MAY be an old style file with a letter count... 67 try : 68 letters = int(fields[2]) 69 except ValueError : 70 raise SyntaxError("Could not parse line, bad sequence number:\n%s" % line) 71 if len(fields[1].replace("-","")) <> letters : 72 raise SyntaxError("Could not parse line, invalid sequence number:\n%s" % line) 73 else : 74 #Sequence consensus line... 75 pass 76 line = handle.readline() 77 if not line : break #end of file 78 79 assert line.strip() == "" 80 81 #Loop over any remaining blocks... 82 while True : 83 #There should be a blank line between each block. 84 #Also want to ignore any consensus line from the 85 #previous block. 86 while (not line) or line.strip() == "" or line[0]==" ": 87 line = handle.readline() 88 if not line : break # end of file 89 if not line : break # end of file 90 91 for i in range(len(ids)) : 92 fields = line.rstrip().split() 93 94 #We expect there to be two fields, there can be an optional 95 #"sequence number" field containing the letter count. 96 if len(fields) < 2 or len(fields) > 3: 97 raise SyntaxError("Could not parse line:\n%s" % line) 98 99 if fields[0] <> ids[i] : 100 raise SyntaxError("Identifiers out of order? Got '%s' but expected '%s'" \ 101 % (fields[0], ids[i])) 102 103 #Append the sequence 104 seqs[i] += fields[1] 105 106 if len(fields) == 3 : 107 #This MAY be an old style file with a letter count... 108 try : 109 letters = int(fields[2]) 110 except ValueError : 111 raise SyntaxError("Could not parse line, bad sequence number:\n%s" % line) 112 if len(seqs[i].replace("-","")) <> letters : 113 raise SyntaxError("Could not parse line, invalid sequence number:\n%s" % line) 114 115 #Read in the next line 116 line = handle.readline() 117 118 assert len(ids) == len(seqs) 119 alignment_length = len(seqs[0]) 120 for i in range(len(ids)) : 121 if len(seqs[i]) <> alignment_length: 122 raise SyntaxError("Error parsing alignment - sequences of different length?") 123 yield SeqRecord(Seq(seqs[i], alphabet), id=ids[i])
124
125 -class ClustalWriter(SequenceWriter):
126 """Write Clustal sequence alignments"""
127 - def __init__(self, handle):
128 """Creates the writer object 129 130 Use the method write_file() to actually record your sequence records.""" 131 self.handle = handle
132
133 - def write_file(self, records) :
134 """Use this to write an entire file containing the given records. 135 136 records - a SeqRecord iterator, or list of SeqRecords 137 138 Right now this code uses Bio.Clustalw.ClustalAlignment to do 139 the hard work - this may change in the future. 140 """ 141 # ToDo - decide if using Bio.Clustalw.ClustalAlignment is 142 # actually the best way to handle this. 143 # 144 # Copying that thirty lines of code (with slight tweaks) 145 # would be much simpler, and would probably run quicker and 146 # use less memory as it doesn't build a ClustalAlignment 147 # object. 148 # 149 # The downside is code duplication. 150 length_of_sequences = None 151 alignment = ClustalAlignment() 152 for record in records : 153 if length_of_sequences is None : 154 length_of_sequences = len(record.seq) 155 elif length_of_sequences <> len(record.seq) : 156 raise ValueError("Sequences must all be the same length") 157 158 if length_of_sequences <= 0 : 159 raise ValueError("Non-empty sequences are required") 160 161 #ToDo, check alphabet for this sequence matches that 162 #specified for the alignment. Not sure how the 163 #alphabet.contains() method is intended to be used, 164 #but it doesn't make sense to me right now. 165 166 #Doing this works, but ClustalAlignment currently uses 167 #the record.descrption when outputing the records. 168 #alignment._records.append(record) 169 170 #Make sure we don't get any spaces in the record 171 #identifier when output in the file by replacing 172 #them with underscores: 173 alignment.add_sequence(record.id.replace(" ","_"), 174 record.seq.tostring()) 175 176 if len(alignment.get_all_seqs()) == 0 : 177 raise ValueError("Must have at least one sequence") 178 179 self.handle.write(str(alignment))
180 #Don't close the handle. Doing so would prevent this code 181 #from writing concatenated Clustal files which might be used 182 #in phylogenetic bootstrapping (very common with phylip). 183 #self.handle.close() 184 185 186 if __name__ == "__main__" : 187 # Run a quick self-test 188 189 #This is a truncated version of the example in Tests/cw02.aln 190 #Notice the inclusion of sequence numbers (right hand side) 191 aln_example1 = \ 192 """CLUSTAL W (1.81) multiple sequence alignment 193 194 195 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 196 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 197 * *: :: :. :* : :. : . :* :: . 198 199 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 200 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 201 : ** **:... *.*** .. 202 203 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 204 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 205 .:* * *: .* :* : :* .* 206 207 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 208 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 209 *::. . .:: :*..* :* .* .. . : . : 210 211 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 212 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 213 *. .:: : . 214 215 """ 216 217 #This example is a truncated version of the dataset used here: 218 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 219 #with the last record repeated twice (deliberate toture test) 220 aln_example2 = \ 221 """CLUSTAL X (1.83) multiple sequence alignment 222 223 224 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 225 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 226 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 227 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 228 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 229 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 230 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 231 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 232 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 233 : . : :. 234 235 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 236 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 237 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 238 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 239 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 240 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 241 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 242 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 243 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 244 ** .: *::::. : :. . ..: 245 246 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 247 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 248 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 249 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 250 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 251 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 252 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 253 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 254 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 255 *.: . * . * *: : 256 257 """ 258 259 from StringIO import StringIO 260 261 records = list(ClustalIterator(StringIO(aln_example1))) 262 assert 2 == len(records) 263 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069" 264 assert records[1].id == "gi|671626|emb|CAA85685.1|" 265 assert records[0].seq.tostring() == \ 266 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 267 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 268 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 269 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 270 "VPTTRAQRRA" 271 272 records = list(ClustalIterator(StringIO(aln_example2))) 273 assert 9 == len(records) 274 assert records[-1].id == "HISJ_E_COLI" 275 assert records[-1].seq.tostring() == \ 276 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 277 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 278 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 279