Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from Bio.Align.Generic import Alignment 
  8  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
  9   
10 -class ClustalWriter(SequentialAlignmentWriter) :
11 """Clustalw alignment writer."""
12 - def write_alignment(self, alignment) :
13 """Use this to write (another) single alignment to an open file.""" 14 15 if len(alignment.get_all_seqs()) == 0 : 16 raise ValueError("Must have at least one sequence") 17 18 #Old versions of the parser in Bio.Clustalw used a ._version property, 19 try : 20 version = self._version 21 except AttributeError : 22 version = '1.81' 23 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 24 25 cur_char = 0 26 max_length = len(alignment._records[0].seq) 27 28 if max_length <= 0 : 29 raise ValueError("Non-empty sequences are required") 30 31 # keep displaying sequences until we reach the end 32 while cur_char != max_length: 33 # calculate the number of sequences to show, which will 34 # be less if we are at the end of the sequence 35 if (cur_char + 50) > max_length: 36 show_num = max_length - cur_char 37 else: 38 show_num = 50 39 40 # go through all of the records and print out the sequences 41 # when we output, we do a nice 80 column output, although this 42 # may result in truncation of the ids. 43 for record in alignment._records: 44 #Make sure we don't get any spaces in the record 45 #identifier when output in the file by replacing 46 #them with underscores: 47 line = record.id[0:30].replace(" ","_").ljust(36) 48 line += record.seq.data[cur_char:(cur_char + show_num)] 49 output += line + "\n" 50 51 # now we need to print out the star info, if we've got it 52 # This was stored by Bio.Clustalw using a ._star_info property. 53 if hasattr(alignment, "_star_info") and alignment._star_info != '': 54 output += (" " * 36) + \ 55 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 56 57 output += "\n" 58 cur_char += show_num 59 60 # Want a trailing blank new line in case the output is concatenated 61 self.handle.write(output + "\n")
62
63 -class ClustalIterator(AlignmentIterator) :
64 """Clustalw alignment iterator.""" 65
66 - def next(self) :
67 68 handle = self.handle 69 70 try : 71 #Header we saved from when we were parsing 72 #the previous alignment. 73 line = self._header 74 del self._header 75 except AttributeError: 76 line = handle.readline() 77 if not line: 78 return None 79 if line[:7] <> 'CLUSTAL': 80 raise ValueError("Did not find CLUSTAL header") 81 82 83 # find the clustal version in the header line 84 version = None 85 for word in line.split() : 86 if word[0]=='(' and word[-1]==')': 87 word = word[1:-1] 88 if word[0] in '0123456789': 89 version = word 90 91 92 #There should be two blank lines after the header line 93 line = handle.readline() 94 while line.strip() == "" : 95 line = handle.readline() 96 97 #If the alignment contains entries with the same sequence 98 #identifier (not a good idea - but seems possible), then this 99 #dictionary based parser will merge their sequences. Fix this? 100 ids = [] 101 seqs = [] 102 consensus = "" 103 seq_cols = None #: Used to extract the consensus 104 105 #Use the first block to get the sequence identifiers 106 while line.strip() <> "" : 107 if line[0] <> " " : 108 #Sequences identifier... 109 fields = line.rstrip().split() 110 111 #We expect there to be two fields, there can be an optional 112 #"sequence number" field containing the letter count. 113 if len(fields) < 2 or len(fields) > 3: 114 raise ValueError("Could not parse line:\n%s" % line) 115 116 ids.append(fields[0]) 117 seqs.append(fields[1]) 118 119 #Record the sequence position to get the consensus 120 if seq_cols is None : 121 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 122 end = start + len(fields[1]) 123 seq_cols = slice(start, end) 124 del start, end 125 assert fields[1] == line[seq_cols] 126 127 if len(fields) == 3 : 128 #This MAY be an old style file with a letter count... 129 try : 130 letters = int(fields[2]) 131 except ValueError : 132 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 133 if len(fields[1].replace("-","")) <> letters : 134 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 135 else : 136 #Sequence consensus line... 137 assert seq_cols is not None 138 consensus = line[seq_cols] 139 assert not line[seq_cols.stop:].strip() 140 line = handle.readline() 141 if not line : break #end of file 142 143 assert line.strip() == "" 144 assert seq_cols is not None 145 146 #Loop over any remaining blocks... 147 done = False 148 while not done : 149 #There should be a blank line between each block. 150 #Also want to ignore any consensus line from the 151 #previous block. 152 while (not line) or line.strip() == "" : 153 line = handle.readline() 154 if not line : break # end of file 155 if not line : break # end of file 156 157 if line[:7] == 'CLUSTAL': 158 #Found concatenated alignment. 159 done = True 160 self._header = line 161 break 162 163 for i in range(len(ids)) : 164 fields = line.rstrip().split() 165 166 #We expect there to be two fields, there can be an optional 167 #"sequence number" field containing the letter count. 168 if len(fields) < 2 or len(fields) > 3: 169 raise ValueError("Could not parse line:\n%s" % line) 170 171 if fields[0] <> ids[i] : 172 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \ 173 % (fields[0], ids[i])) 174 175 if fields[1] <> line[seq_cols] : 176 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 177 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 178 end = start + len(fields[1]) 179 seq_cols = slice(start, end) 180 del start, end 181 182 #Append the sequence 183 seqs[i] += fields[1] 184 185 if len(fields) == 3 : 186 #This MAY be an old style file with a letter count... 187 try : 188 letters = int(fields[2]) 189 except ValueError : 190 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 191 if len(seqs[i].replace("-","")) <> letters : 192 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 193 194 #Read in the next line 195 line = handle.readline() 196 #There should now be a consensus line 197 if consensus : 198 assert seq_cols is not None 199 consensus += line[seq_cols] 200 assert not line[seq_cols.stop:].strip() 201 #Read in the next line 202 line = handle.readline() 203 204 205 assert len(ids) == len(seqs) 206 if len(seqs) == 0 or len(seqs[0]) == 0 : 207 return None 208 209 if self.records_per_alignment is not None \ 210 and self.records_per_alignment <> len(ids) : 211 raise ValueError("Found %i records in this alignment, told to expect %i" \ 212 % (len(ids), self.records_per_alignment)) 213 214 alignment = Alignment(self.alphabet) 215 alignment_length = len(seqs[0]) 216 for i in range(len(ids)) : 217 if len(seqs[i]) <> alignment_length: 218 raise ValueError("Error parsing alignment - sequences of different length?") 219 alignment.add_sequence(ids[i], seqs[i]) 220 #TODO - Handle alignment annotation better, for now 221 #mimic the old parser in Bio.Clustalw 222 if version : 223 alignment._version = version 224 if consensus : 225 assert len(consensus) == alignment_length, \ 226 "Alignment length is %i, consensus length is %i, '%s'" \ 227 % (alignment_length, len(consensus), consensus) 228 alignment._star_info = consensus 229 return alignment
230 231 if __name__ == "__main__" : 232 print "Running a quick self-test" 233 234 #This is a truncated version of the example in Tests/cw02.aln 235 #Notice the inclusion of sequence numbers (right hand side) 236 aln_example1 = \ 237 """CLUSTAL W (1.81) multiple sequence alignment 238 239 240 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 241 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 242 * *: :: :. :* : :. : . :* :: . 243 244 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 245 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 246 : ** **:... *.*** .. 247 248 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 249 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 250 .:* * *: .* :* : :* .* 251 252 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 253 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 254 *::. . .:: :*..* :* .* .. . : . : 255 256 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 257 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 258 *. .:: : . 259 260 """ 261 262 #This example is a truncated version of the dataset used here: 263 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 264 #with the last record repeated twice (deliberate toture test) 265 aln_example2 = \ 266 """CLUSTAL X (1.83) multiple sequence alignment 267 268 269 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 270 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 271 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 272 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 273 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 274 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 275 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 276 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 277 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 278 : . : :. 279 280 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 281 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 282 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 283 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 284 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 285 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 286 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 287 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 288 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 289 ** .: *::::. : :. . ..: 290 291 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 292 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 293 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 294 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 295 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 296 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 297 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 298 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 299 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 300 *.: . * . * *: : 301 302 """ 303 304 from StringIO import StringIO 305 306 alignments = list(ClustalIterator(StringIO(aln_example1))) 307 assert 1 == len(alignments) 308 assert alignments[0]._version == "1.81" 309 records = alignments[0].get_all_seqs() 310 assert 2 == len(records) 311 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069" 312 assert records[1].id == "gi|671626|emb|CAA85685.1|" 313 assert records[0].seq.tostring() == \ 314 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 315 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 316 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 317 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 318 "VPTTRAQRRA" 319 320 alignments = list(ClustalIterator(StringIO(aln_example2))) 321 assert 1 == len(alignments) 322 assert alignments[0]._version == "1.83" 323 records = alignments[0].get_all_seqs() 324 assert 9 == len(records) 325 assert records[-1].id == "HISJ_E_COLI" 326 assert records[-1].seq.tostring() == \ 327 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 328 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 329 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 330 331 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)) : 332 print "Alignment with %i records of length %i" \ 333 % (len(alignment.get_all_seqs()), 334 alignment.get_alignment_length()) 335 336 print "Checking empty file..." 337 assert 0 == len(list(ClustalIterator(StringIO("")))) 338 339 print "Checking write/read..." 340 alignments = list(ClustalIterator(StringIO(aln_example1))) \ 341 + list(ClustalIterator(StringIO(aln_example2)))*2 342 handle = StringIO() 343 ClustalWriter(handle).write_file(alignments) 344 handle.seek(0) 345 for i,a in enumerate(ClustalIterator(handle)) : 346 assert a.get_alignment_length() == alignments[i].get_alignment_length() 347 handle.seek(0) 348 349 print "Testing write/read when there is only one sequence..." 350 alignment._records = alignment._records[0:1] 351 handle = StringIO() 352 ClustalWriter(handle).write_file([alignment]) 353 handle.seek(0) 354 for i,a in enumerate(ClustalIterator(handle)) : 355 assert a.get_alignment_length() == alignment.get_alignment_length() 356 assert len(a.get_all_seqs()) == 1 357 358 359 print "The End" 360