Package Bio :: Package Clustalw
[hide private]
[frames] | no frames]

Source Code for Package Bio.Clustalw

  1  # Clustalw modules 
  2   
  3  """ 
  4  A set of classes to interact with the multiple alignment command 
  5  line program clustalw.  
  6   
  7  Clustalw is the command line version of the graphical Clustalx  
  8  aligment program. 
  9   
 10  This requires clustalw available from: 
 11   
 12  ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/. 
 13   
 14  functions: 
 15  o parse_file 
 16  o do_alignment 
 17   
 18  classes: 
 19  o ClustalAlignment 
 20  o _AlignCreator 
 21  o MultipleAlignCL""" 
 22   
 23  # standard library 
 24  import os 
 25  import sys 
 26  import string #Obsolete - we should switch to using string object methods instead! 
 27   
 28  # biopython 
 29  from Bio.Seq import Seq 
 30  from Bio.SeqRecord import SeqRecord 
 31  from Bio import Alphabet 
 32  from Bio.Alphabet import IUPAC 
 33  import clustal_format 
 34  from Bio.Align.Generic import Alignment 
 35   
 36  # PyXML package 
 37  from xml.sax import saxutils 
 38  from xml.sax import handler 
 39   
40 -def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
41 """Parse the given file into a clustal aligment object. 42 43 Arguments: 44 o file_name - The name of the file to parse. 45 o alphabet - The type of alphabet to use for the alignment sequences. 46 This should correspond to the type of information contained in the file. 47 Defaults to be unambiguous_dna sequence. 48 """ 49 align_handler = _AlignCreator(Alphabet.Gapped(alphabet)) 50 51 parser = clustal_format.format.make_parser(debug_level) 52 parser.setContentHandler(align_handler) 53 parser.setErrorHandler(handler.ErrorHandler()) 54 55 to_parse = open(file_name, 'r') 56 parser.parseFile(to_parse) 57 to_parse.close() 58 59 return align_handler.align
60
61 -def do_alignment(command_line, alphabet=None):
62 """Perform an alignment with the given command line. 63 64 Arguments: 65 o command_line - A command line object that can give out 66 the command line we will input into clustalw. 67 o alphabet - the alphabet to use in the created alignment. If not 68 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for 69 dna and protein alignment respectively. 70 71 Returns: 72 o A clustal alignment object corresponding to the created alignment. 73 If the alignment type was not a clustal object, None is returned. 74 """ 75 run_clust = os.popen(str(command_line)) 76 status = run_clust.close() 77 78 79 # The exit status is the second byte of the termination status 80 # TODO - Check this holds on win32... 81 value = 0 82 if status: value = status / 256 83 # check the return value for errors, as on 1.81 the return value 84 # from Clustalw is actually helpful for figuring out errors 85 # 1 => bad command line option 86 if value == 1: 87 raise ValueError("Bad command line option in the command: %s" 88 % str(command_line)) 89 # 2 => can't open sequence file 90 elif value == 2: 91 raise IOError("Cannot open sequence file %s" 92 % command_line.sequence_file) 93 # 3 => wrong format in sequence file 94 elif value == 3: 95 raise IOError("Sequence file %s has an invalid format." 96 % command_line.sequence_file) 97 # 4 => sequence file only has one sequence 98 elif value == 4: 99 raise IOError("Sequence file %s has only one sequence present." 100 % command_line.sequence_file) 101 102 # if an output file was specified, we need to grab it 103 if command_line.output_file: 104 out_file = command_line.output_file 105 else: 106 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln' 107 108 # if we can't deal with the format, just return None 109 if command_line.output_type and command_line.output_type != 'CLUSTAL': 110 return None 111 # otherwise parse it into a ClustalAlignment object 112 else: 113 if not alphabet: 114 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[ 115 command_line.type == 'PROTEIN'] 116 117 # check if the outfile exists before parsing 118 if not(os.path.exists(out_file)): 119 raise IOError("Output .aln file %s not produced, commandline: %s" 120 % (out_file, command_line)) 121 122 return parse_file(out_file, alphabet)
123 124
125 -class ClustalAlignment(Alignment):
126 """Work with the clustal aligment format. 127 128 This format is the default output from clustal -- these files normally 129 have an extension of .aln. 130 """ 131 # the default version to use if one isn't set 132 DEFAULT_VERSION = '1.81' 133
134 - def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
135 Alignment.__init__(self, alphabet) 136 137 # represent all of those stars in the aln output format 138 self._star_info = '' 139 140 self._version = ''
141
142 - def __str__(self):
143 """Print out the alignment so it looks pretty. 144 145 The output produced from this should also be formatted in valid 146 clustal format. 147 """ 148 # if the version isn't set, we need to use the default 149 if self._version == '': 150 self._version = self.DEFAULT_VERSION 151 152 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % \ 153 self._version 154 155 cur_char = 0 156 max_length = len(self._records[0].seq) 157 158 # keep displaying sequences until we reach the end 159 while cur_char != max_length: 160 # calculate the number of sequences to show, which will 161 # be less if we are at the end of the sequence 162 if (cur_char + 50) > max_length: 163 show_num = max_length - cur_char 164 else: 165 show_num = 50 166 167 # go through all of the records and print out the sequences 168 # when we output, we do a nice 80 column output, although this 169 # may result in truncation of the ids. 170 for record in self._records: 171 line = record.description[0:30].ljust(36) 172 line = line + record.seq.data[cur_char:(cur_char + show_num)] 173 174 output = output + line + "\n" 175 176 # now we need to print out the star info, if we've got it 177 if self._star_info != '': 178 output = output + (" " * 36) + \ 179 self._star_info[cur_char:(cur_char + show_num)] + "\n" 180 181 output = output + "\n" 182 cur_char = cur_char + show_num 183 184 # have a extra newline, so strip two off and add one before returning 185 return string.rstrip(output) + "\n"
186 187
188 - def _add_star_info(self, stars):
189 """Add all of the stars, which indicate consensus sequence. 190 """ 191 self._star_info = stars
192
193 - def _add_version(self, version):
194 """Add the version information about the clustal file being read. 195 """ 196 self._version = version
197
198 -class _AlignCreator(handler.ContentHandler):
199 """Handler to create a ClustalAlignment object from clustal file info. 200 201 This handler is used to accept events coming from a Martel parsing 202 stream, and acts like a normal SAX handler. 203 204 After parsing, the alignment object created is available as the 205 align attribute of the class. 206 """
207 - def __init__(self, alphabet):
208 """Create a new handler ready to deal with output from Martel parsing. 209 210 Arguments: 211 o alphabet - The alphabet to create all of the new sequences with. 212 """ 213 self.align = ClustalAlignment(alphabet) 214 215 # store sequence info in a dictionary 216 self.all_info = {} 217 self.all_keys = [] 218 219 # the current id we are working with 220 self.cur_id = None 221 222 # info so we know how big the ids and sequences are 223 self.id_size = 0 224 self.space_size = 0 225 self.seq_size = 0 226 227 # flags so we can keep track of where we are during the parse 228 self.in_version = 0 229 self.in_stars = 0 230 self.in_seq_id = 0 231 self.in_space = 0 232 self.in_seq = 0 233 self.all_star_info = ''
234
235 - def startElement(self, name, attrs):
236 """Check the various tags for the info we are interested in.""" 237 if name == "version": 238 self.in_version = 1 239 self.version_info = '' 240 elif name == "seq_id": 241 self.in_seq_id = 1 242 self.seq_id_info = '' 243 elif name == "seq_space": 244 self.in_space = 1 245 self.space_info = '' 246 elif name == "seq_info": 247 self.in_seq = 1 248 self.seq_info = '' 249 elif name == "match_stars": 250 self.in_stars = 1 251 self.star_info = ''
252
253 - def characters(self, content):
254 if self.in_version: 255 self.version_info = self.version_info + content 256 elif self.in_seq_id: 257 self.seq_id_info = self.seq_id_info + content 258 elif self.in_space: 259 self.space_info = self.space_info + content 260 elif self.in_seq: 261 self.seq_info = self.seq_info + content 262 elif self.in_stars: 263 self.star_info = self.star_info + content
264
265 - def endElement(self, name):
266 if name == "version": 267 self.in_version = 0 268 self.align._add_version(string.strip(self.version_info)) 269 elif name == "seq_id": 270 self.in_seq_id = 0 271 self.id_size = len(self.seq_id_info) 272 self.cur_id = self.seq_id_info 273 elif name == "seq_space": 274 self.in_space = 0 275 self.space_size = len(self.space_info) 276 elif name == "seq_info": 277 self.in_seq = 0 278 self.seq_size = len(self.seq_info) 279 280 # if the id is already there, add the sequence info 281 if self.cur_id in self.all_info.keys(): 282 self.all_info[self.cur_id] = self.all_info[self.cur_id] + \ 283 self.seq_info 284 else: 285 self.all_info[self.cur_id] = self.seq_info 286 self.all_keys.append(self.cur_id) 287 288 elif name == "match_stars": 289 id_length = self.id_size + self.space_size 290 line_length = id_length + self.seq_size 291 292 self.all_star_info = self.all_star_info + \ 293 self.star_info[id_length:line_length]
294
295 - def endDocument(self):
296 # when we are done parsing add all of the info we need 297 self.align._add_star_info(self.all_star_info) 298 299 for id in self.all_keys: 300 self.align.add_sequence(id, self.all_info[id])
301
302 -class MultipleAlignCL:
303 """Represent a clustalw multiple alignment command line. 304 305 This is meant to make it easy to code the command line options you 306 want to submit to clustalw. 307 308 Clustalw has a ton of options and things to do but this is set up to 309 represent a clustalw mutliple alignment. 310 311 Warning: I don't use all of these options personally, so if you find 312 one to be broken for any reason, please let us know! 313 """ 314 # set the valid options for different parameters 315 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA'] 316 OUTPUT_ORDER = ['INPUT', 'ALIGNED'] 317 OUTPUT_CASE = ['LOWER', 'UPPER'] 318 OUTPUT_SEQNOS = ['OFF', 'ON'] 319 RESIDUE_TYPES = ['PROTEIN', 'DNA'] 320 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID'] 321 DNA_MATRIX = ['IUB', 'CLUSTALW'] 322
323 - def __init__(self, sequence_file, command = 'clustalw'):
324 """Initialize some general parameters that can be set as attributes. 325 326 Arguments: 327 o sequence_file - The file to read the sequences for alignment from. 328 o command - The command used to run clustalw. This defaults to 329 just 'clustalw' (ie. assumes you have it on your path somewhere). 330 331 General attributes that can be set: 332 o is_quick - if set as 1, will use a fast algorithm to create 333 the alignment guide tree. 334 o allow_negative - allow negative values in the alignment matrix. 335 336 Multiple alignment attributes that can be set as attributes: 337 o gap_open_pen - Gap opening penalty 338 o gap_ext_pen - Gap extension penalty 339 o is_no_end_pen - A flag as to whether or not there should be a gap 340 separation penalty for the ends. 341 o gap_sep_range - The gap separation penalty range. 342 o is_no_pgap - A flag to turn off residue specific gaps 343 o is_no_hgap - A flag to turn off hydrophilic gaps 344 o h_gap_residues - A list of residues to count a hydrophilic 345 o max_div - A percent identity to use for delay (? - I don't undertand 346 this!) 347 o trans_weight - The weight to use for transitions 348 """ 349 self.sequence_file = sequence_file 350 self.command = command 351 352 self.is_quick = None 353 self.allow_negative = None 354 355 self.gap_open_pen = None 356 self.gap_ext_pen = None 357 self.is_no_end_pen = None 358 self.gap_sep_range = None 359 self.is_no_pgap = None 360 self.is_no_hgap = None 361 self.h_gap_residues = [] 362 self.max_div = None 363 self.trans_weight = None 364 365 # other attributes that should be set via various functions 366 # 1. output parameters 367 self.output_file = None 368 self.output_type = None 369 self.output_order = None 370 self.change_case = None 371 self.add_seqnos = None 372 373 # 2. a guide tree to use 374 self.guide_tree = None 375 self.new_tree = None 376 377 # 3. matrices 378 self.protein_matrix = None 379 self.dna_matrix = None 380 381 # 4. type of residues 382 self.type = None
383
384 - def __str__(self):
385 """Write out the command line as a string.""" 386 387 if sys.platform <> "win32" : 388 #On Linux with clustalw 1.83, you can do: 389 #clustalw input.faa 390 #clustalw /full/path/input.faa 391 #clustalw -INFILE=input.faa 392 #clustalw -INFILE=/full/path/input.faa 393 # 394 #Note these fail (using DOS style slashes): 395 # 396 #clustalw /INFILE=input.faa 397 #clustalw /INFILE=/full/path/input.faa 398 # 399 #To keep things simple, and follow the original 400 #behaviour of Bio.Clustalw use this: 401 cline = self.command + " " + self.sequence_file 402 else : 403 #On Windows XP with clustalw.exe 1.83, these work at 404 #the command prompt: 405 # 406 #clustalw.exe input.faa 407 #clustalw.exe /INFILE=input.faa 408 #clustalw.exe /INFILE="input.faa" 409 #clustalw.exe /INFILE="with space.faa" 410 #clustalw.exe /INFILE=C:\full\path\input.faa 411 #clustalw.exe /INFILE="C:\full path\with spaces.faa" 412 # 413 #Sadly these fail: 414 #clustalw.exe "input.faa" 415 #clustalw.exe "with space.faa" 416 #clustalw.exe C:\full\path\input.faa 417 #clustalw.exe "C:\full path\with spaces.faa" 418 # 419 #These also fail but a minus/dash does seem to 420 #work with other options (!): 421 #clustalw.exe -INFILE=input.faa 422 #clustalw.exe -INFILE=C:\full\path\input.faa 423 # 424 #Also these fail: 425 #clustalw.exe "/INFILE=input.faa" 426 #clustalw.exe "/INFILE=C:\full\path\input.faa" 427 # 428 #Thanks to Emanuel Hey for flagging this on the mailing list. 429 # 430 #In addtion, both self.command and self.sequence_file 431 #may contain spaces, so should be quoted. But clustalw 432 #is fussy. 433 if self.command.count(" ") > 0 : 434 cline = '"%s"' % self.command 435 else : 436 cline = self.command 437 if self.sequence_file.count(" ") > 0 : 438 cline += ' /INFILE="%s"' % self.sequence_file 439 else : 440 cline += ' /INFILE=%s' % self.sequence_file 441 442 # general options 443 if self.type: 444 cline += " -TYPE=%s" % self.type 445 if self.is_quick == 1: 446 #Some versions of clustalw are case sensitive, 447 #and require -quicktree rather than -QUICKTREE 448 cline += " -quicktree" 449 if self.allow_negative == 1: 450 cline += " -NEGATIVE" 451 452 # output options 453 if self.output_file: 454 cline += " -OUTFILE=%s" % self.output_file 455 if self.output_type: 456 cline += " -OUTPUT=%s" % self.output_type 457 if self.output_order: 458 cline += " -OUTORDER=%s" % self.output_order 459 if self.change_case: 460 cline += " -CASE=%s" % self.change_case 461 if self.add_seqnos: 462 cline += " -SEQNOS=%s" % self.add_seqnos 463 if self.new_tree: 464 # clustal does not work if -align is written -ALIGN 465 cline += " -NEWTREE=%s -align" % self.new_tree 466 467 # multiple alignment options 468 if self.guide_tree: 469 cline += " -USETREE=%s" % self.guide_tree 470 if self.protein_matrix: 471 cline += " -MATRIX=%s" % self.protein_matrix 472 if self.dna_matrix: 473 cline += " -DNAMATRIX=%s" % self.dna_matrix 474 if self.gap_open_pen: 475 cline += " -GAPOPEN=%s" % self.gap_open_pen 476 if self.gap_ext_pen: 477 cline += " -GAPEXT=%s" % self.gap_ext_pen 478 if self.is_no_end_pen == 1: 479 cline += " -ENDGAPS" 480 if self.gap_sep_range: 481 cline += " -GAPDIST=%s" % self.gap_sep_range 482 if self.is_no_pgap == 1: 483 cline += " -NOPGAP" 484 if self.is_no_hgap == 1: 485 cline += " -NOHGAP" 486 if len(self.h_gap_residues) != 0: 487 # stick the list of residues together as one big list o' residues 488 residue_list = '' 489 for residue in self.h_gap_residues: 490 residue_list = residue_list + residue 491 cline += " -HGAPRESIDUES=%s" % residue_list 492 if self.max_div: 493 cline += " -MAXDIV=%s" % self.max_div 494 if self.trans_weight: 495 cline += " -TRANSWEIGHT=%s" % self.trans_weight 496 497 return cline
498
499 - def set_output(self, output_file, output_type = None, output_order = None, 500 change_case = None, add_seqnos = None):
501 """Set the output parameters for the command line. 502 """ 503 self.output_file = output_file 504 505 if output_type: 506 output_type = string.upper(output_type) 507 if output_type not in self.OUTPUT_TYPES: 508 raise ValueError("Invalid output type %s. Valid choices are %s" 509 % (output_type, self.OUTPUT_TYPES)) 510 else: 511 self.output_type = output_type 512 513 if output_order: 514 output_order = string.upper(output_order) 515 if output_order not in self.OUTPUT_ORDER: 516 raise ValueError("Invalid output order %s. Valid choices are %s" 517 % (output_order, self.OUTPUT_ORDER)) 518 else: 519 self.output_order = output_order 520 521 if change_case: 522 change_case = string.upper(change_case) 523 if output_type != "GDE": 524 raise ValueError("Change case only valid for GDE output.") 525 elif change_case not in self.CHANGE_CASE: 526 raise ValueError("Invalid change case %s. Valid choices are %s" 527 % (change_case, self.CHANGE_CASE)) 528 else: 529 self.change_case = change_case 530 531 if add_seqnos: 532 add_seqnos = string.upper(add_seqnos) 533 if output_type: 534 raise ValueError("Add SeqNos only valid for CLUSTAL output.") 535 elif add_seqnos not in self.OUTPUT_SEQNOS: 536 raise ValueError("Invalid seqnos option %s. Valid choices: %s" 537 % (add_seqnos, self.OUTPUT_SEQNOS)) 538 else: 539 self.add_seqnos = add_seqnos
540
541 - def set_guide_tree(self, tree_file):
542 """Provide a file to use as the guide tree for alignment. 543 544 Raises: 545 o IOError - If the tree_file doesn't exist.""" 546 if not(os.path.exists(tree_file)): 547 raise IOError("Could not find the guide tree file %s." % 548 tree_file) 549 else: 550 self.guide_tree = tree_file
551
552 - def set_new_guide_tree(self, tree_file):
553 """Set the name of the guide tree file generated in the alignment. 554 """ 555 self.new_tree = tree_file
556
557 - def set_protein_matrix(self, protein_matrix):
558 """Set the type of protein matrix to use. 559 560 Protein matrix can be either one of the defined types (blosum, pam, 561 gonnet or id) or a file with your own defined matrix. 562 """ 563 if string.upper(protein_matrix) in self.PROTEIN_MATRIX: 564 self.protein_matrix = string.upper(protein_matrix) 565 elif os.path.exists(protein_matrix): 566 self.protein_matrix = protein_matrix 567 else: 568 raise ValueError("Invalid matrix %s. Options are %s or a file." % 569 (string.upper(protein_matrix), 570 self.PROTEIN_MATRIX))
571
572 - def set_dna_matrix(self, dna_matrix):
573 """Set the type of DNA matrix to use. 574 575 The dna_matrix can either be one of the defined types (iub or clustalw) 576 or a file with the matrix to use.""" 577 if string.upper(dna_matrix) in self.DNA_MATRIX: 578 self.dna_matrix = string.upper(dna_matrix) 579 elif os.path.exists(dna_matrix): 580 self.dna_matrix = dna_matrix 581 else: 582 raise ValueError("Invalid matrix %s. Options are %s or a file." % 583 (dna_matrix, self.DNA_MATRIX))
584
585 - def set_type(self, residue_type):
586 """Set the type of residues within the file. 587 588 Clustal tries to guess whether the info is protein or DNA based on 589 the number of GATCs, but this can be wrong if you have a messed up 590 protein or DNA you are working with, so this allows you to set it 591 explicitly. 592 """ 593 residue_type = string.upper(residue_type) 594 if residue_type in self.RESIDUE_TYPES: 595 self.type = residue_type 596 else: 597 raise ValueError("Invalid residue type %s. Valid choices are %s" 598 % (residue_type, self.RESIDUE_TYPES))
599