1
2
3 """
4 A set of classes to interact with the multiple alignment command
5 line program clustalw.
6
7 Clustalw is the command line version of the graphical Clustalx
8 aligment program.
9
10 This requires clustalw available from:
11
12 ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/.
13
14 functions:
15 o parse_file
16 o do_alignment
17
18 classes:
19 o ClustalAlignment
20 o _AlignCreator
21 o MultipleAlignCL"""
22
23
24 import os
25 import sys
26 import string
27
28
29 from Bio.Seq import Seq
30 from Bio.SeqRecord import SeqRecord
31 from Bio import Alphabet
32 from Bio.Alphabet import IUPAC
33 import clustal_format
34 from Bio.Align.Generic import Alignment
35
36
37 from xml.sax import saxutils
38 from xml.sax import handler
39
41 """Parse the given file into a clustal aligment object.
42
43 Arguments:
44 o file_name - The name of the file to parse.
45 o alphabet - The type of alphabet to use for the alignment sequences.
46 This should correspond to the type of information contained in the file.
47 Defaults to be unambiguous_dna sequence.
48 """
49 align_handler = _AlignCreator(Alphabet.Gapped(alphabet))
50
51 parser = clustal_format.format.make_parser(debug_level)
52 parser.setContentHandler(align_handler)
53 parser.setErrorHandler(handler.ErrorHandler())
54
55 to_parse = open(file_name, 'r')
56 parser.parseFile(to_parse)
57 to_parse.close()
58
59 return align_handler.align
60
62 """Perform an alignment with the given command line.
63
64 Arguments:
65 o command_line - A command line object that can give out
66 the command line we will input into clustalw.
67 o alphabet - the alphabet to use in the created alignment. If not
68 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for
69 dna and protein alignment respectively.
70
71 Returns:
72 o A clustal alignment object corresponding to the created alignment.
73 If the alignment type was not a clustal object, None is returned.
74 """
75 run_clust = os.popen(str(command_line))
76 status = run_clust.close()
77
78
79
80
81 value = 0
82 if status: value = status / 256
83
84
85
86 if value == 1:
87 raise ValueError("Bad command line option in the command: %s"
88 % str(command_line))
89
90 elif value == 2:
91 raise IOError("Cannot open sequence file %s"
92 % command_line.sequence_file)
93
94 elif value == 3:
95 raise IOError("Sequence file %s has an invalid format."
96 % command_line.sequence_file)
97
98 elif value == 4:
99 raise IOError("Sequence file %s has only one sequence present."
100 % command_line.sequence_file)
101
102
103 if command_line.output_file:
104 out_file = command_line.output_file
105 else:
106 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln'
107
108
109 if command_line.output_type and command_line.output_type != 'CLUSTAL':
110 return None
111
112 else:
113 if not alphabet:
114 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[
115 command_line.type == 'PROTEIN']
116
117
118 if not(os.path.exists(out_file)):
119 raise IOError("Output .aln file %s not produced, commandline: %s"
120 % (out_file, command_line))
121
122 return parse_file(out_file, alphabet)
123
124
126 """Work with the clustal aligment format.
127
128 This format is the default output from clustal -- these files normally
129 have an extension of .aln.
130 """
131
132 DEFAULT_VERSION = '1.81'
133
141
143 """Print out the alignment so it looks pretty.
144
145 The output produced from this should also be formatted in valid
146 clustal format.
147 """
148
149 if self._version == '':
150 self._version = self.DEFAULT_VERSION
151
152 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % \
153 self._version
154
155 cur_char = 0
156 max_length = len(self._records[0].seq)
157
158
159 while cur_char != max_length:
160
161
162 if (cur_char + 50) > max_length:
163 show_num = max_length - cur_char
164 else:
165 show_num = 50
166
167
168
169
170 for record in self._records:
171 line = record.description[0:30].ljust(36)
172 line = line + record.seq.data[cur_char:(cur_char + show_num)]
173
174 output = output + line + "\n"
175
176
177 if self._star_info != '':
178 output = output + (" " * 36) + \
179 self._star_info[cur_char:(cur_char + show_num)] + "\n"
180
181 output = output + "\n"
182 cur_char = cur_char + show_num
183
184
185 return string.rstrip(output) + "\n"
186
187
189 """Add all of the stars, which indicate consensus sequence.
190 """
191 self._star_info = stars
192
194 """Add the version information about the clustal file being read.
195 """
196 self._version = version
197
199 """Handler to create a ClustalAlignment object from clustal file info.
200
201 This handler is used to accept events coming from a Martel parsing
202 stream, and acts like a normal SAX handler.
203
204 After parsing, the alignment object created is available as the
205 align attribute of the class.
206 """
208 """Create a new handler ready to deal with output from Martel parsing.
209
210 Arguments:
211 o alphabet - The alphabet to create all of the new sequences with.
212 """
213 self.align = ClustalAlignment(alphabet)
214
215
216 self.all_info = {}
217 self.all_keys = []
218
219
220 self.cur_id = None
221
222
223 self.id_size = 0
224 self.space_size = 0
225 self.seq_size = 0
226
227
228 self.in_version = 0
229 self.in_stars = 0
230 self.in_seq_id = 0
231 self.in_space = 0
232 self.in_seq = 0
233 self.all_star_info = ''
234
236 """Check the various tags for the info we are interested in."""
237 if name == "version":
238 self.in_version = 1
239 self.version_info = ''
240 elif name == "seq_id":
241 self.in_seq_id = 1
242 self.seq_id_info = ''
243 elif name == "seq_space":
244 self.in_space = 1
245 self.space_info = ''
246 elif name == "seq_info":
247 self.in_seq = 1
248 self.seq_info = ''
249 elif name == "match_stars":
250 self.in_stars = 1
251 self.star_info = ''
252
254 if self.in_version:
255 self.version_info = self.version_info + content
256 elif self.in_seq_id:
257 self.seq_id_info = self.seq_id_info + content
258 elif self.in_space:
259 self.space_info = self.space_info + content
260 elif self.in_seq:
261 self.seq_info = self.seq_info + content
262 elif self.in_stars:
263 self.star_info = self.star_info + content
264
266 if name == "version":
267 self.in_version = 0
268 self.align._add_version(string.strip(self.version_info))
269 elif name == "seq_id":
270 self.in_seq_id = 0
271 self.id_size = len(self.seq_id_info)
272 self.cur_id = self.seq_id_info
273 elif name == "seq_space":
274 self.in_space = 0
275 self.space_size = len(self.space_info)
276 elif name == "seq_info":
277 self.in_seq = 0
278 self.seq_size = len(self.seq_info)
279
280
281 if self.cur_id in self.all_info.keys():
282 self.all_info[self.cur_id] = self.all_info[self.cur_id] + \
283 self.seq_info
284 else:
285 self.all_info[self.cur_id] = self.seq_info
286 self.all_keys.append(self.cur_id)
287
288 elif name == "match_stars":
289 id_length = self.id_size + self.space_size
290 line_length = id_length + self.seq_size
291
292 self.all_star_info = self.all_star_info + \
293 self.star_info[id_length:line_length]
294
301
303 """Represent a clustalw multiple alignment command line.
304
305 This is meant to make it easy to code the command line options you
306 want to submit to clustalw.
307
308 Clustalw has a ton of options and things to do but this is set up to
309 represent a clustalw mutliple alignment.
310
311 Warning: I don't use all of these options personally, so if you find
312 one to be broken for any reason, please let us know!
313 """
314
315 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA']
316 OUTPUT_ORDER = ['INPUT', 'ALIGNED']
317 OUTPUT_CASE = ['LOWER', 'UPPER']
318 OUTPUT_SEQNOS = ['OFF', 'ON']
319 RESIDUE_TYPES = ['PROTEIN', 'DNA']
320 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID']
321 DNA_MATRIX = ['IUB', 'CLUSTALW']
322
323 - def __init__(self, sequence_file, command = 'clustalw'):
324 """Initialize some general parameters that can be set as attributes.
325
326 Arguments:
327 o sequence_file - The file to read the sequences for alignment from.
328 o command - The command used to run clustalw. This defaults to
329 just 'clustalw' (ie. assumes you have it on your path somewhere).
330
331 General attributes that can be set:
332 o is_quick - if set as 1, will use a fast algorithm to create
333 the alignment guide tree.
334 o allow_negative - allow negative values in the alignment matrix.
335
336 Multiple alignment attributes that can be set as attributes:
337 o gap_open_pen - Gap opening penalty
338 o gap_ext_pen - Gap extension penalty
339 o is_no_end_pen - A flag as to whether or not there should be a gap
340 separation penalty for the ends.
341 o gap_sep_range - The gap separation penalty range.
342 o is_no_pgap - A flag to turn off residue specific gaps
343 o is_no_hgap - A flag to turn off hydrophilic gaps
344 o h_gap_residues - A list of residues to count a hydrophilic
345 o max_div - A percent identity to use for delay (? - I don't undertand
346 this!)
347 o trans_weight - The weight to use for transitions
348 """
349 self.sequence_file = sequence_file
350 self.command = command
351
352 self.is_quick = None
353 self.allow_negative = None
354
355 self.gap_open_pen = None
356 self.gap_ext_pen = None
357 self.is_no_end_pen = None
358 self.gap_sep_range = None
359 self.is_no_pgap = None
360 self.is_no_hgap = None
361 self.h_gap_residues = []
362 self.max_div = None
363 self.trans_weight = None
364
365
366
367 self.output_file = None
368 self.output_type = None
369 self.output_order = None
370 self.change_case = None
371 self.add_seqnos = None
372
373
374 self.guide_tree = None
375 self.new_tree = None
376
377
378 self.protein_matrix = None
379 self.dna_matrix = None
380
381
382 self.type = None
383
385 """Write out the command line as a string."""
386
387 if sys.platform <> "win32" :
388
389
390
391
392
393
394
395
396
397
398
399
400
401 cline = self.command + " " + self.sequence_file
402 else :
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433 if self.command.count(" ") > 0 :
434 cline = '"%s"' % self.command
435 else :
436 cline = self.command
437 if self.sequence_file.count(" ") > 0 :
438 cline += ' /INFILE="%s"' % self.sequence_file
439 else :
440 cline += ' /INFILE=%s' % self.sequence_file
441
442
443 if self.type:
444 cline += " -TYPE=%s" % self.type
445 if self.is_quick == 1:
446
447
448 cline += " -quicktree"
449 if self.allow_negative == 1:
450 cline += " -NEGATIVE"
451
452
453 if self.output_file:
454 cline += " -OUTFILE=%s" % self.output_file
455 if self.output_type:
456 cline += " -OUTPUT=%s" % self.output_type
457 if self.output_order:
458 cline += " -OUTORDER=%s" % self.output_order
459 if self.change_case:
460 cline += " -CASE=%s" % self.change_case
461 if self.add_seqnos:
462 cline += " -SEQNOS=%s" % self.add_seqnos
463 if self.new_tree:
464
465 cline += " -NEWTREE=%s -align" % self.new_tree
466
467
468 if self.guide_tree:
469 cline += " -USETREE=%s" % self.guide_tree
470 if self.protein_matrix:
471 cline += " -MATRIX=%s" % self.protein_matrix
472 if self.dna_matrix:
473 cline += " -DNAMATRIX=%s" % self.dna_matrix
474 if self.gap_open_pen:
475 cline += " -GAPOPEN=%s" % self.gap_open_pen
476 if self.gap_ext_pen:
477 cline += " -GAPEXT=%s" % self.gap_ext_pen
478 if self.is_no_end_pen == 1:
479 cline += " -ENDGAPS"
480 if self.gap_sep_range:
481 cline += " -GAPDIST=%s" % self.gap_sep_range
482 if self.is_no_pgap == 1:
483 cline += " -NOPGAP"
484 if self.is_no_hgap == 1:
485 cline += " -NOHGAP"
486 if len(self.h_gap_residues) != 0:
487
488 residue_list = ''
489 for residue in self.h_gap_residues:
490 residue_list = residue_list + residue
491 cline += " -HGAPRESIDUES=%s" % residue_list
492 if self.max_div:
493 cline += " -MAXDIV=%s" % self.max_div
494 if self.trans_weight:
495 cline += " -TRANSWEIGHT=%s" % self.trans_weight
496
497 return cline
498
499 - def set_output(self, output_file, output_type = None, output_order = None,
500 change_case = None, add_seqnos = None):
501 """Set the output parameters for the command line.
502 """
503 self.output_file = output_file
504
505 if output_type:
506 output_type = string.upper(output_type)
507 if output_type not in self.OUTPUT_TYPES:
508 raise ValueError("Invalid output type %s. Valid choices are %s"
509 % (output_type, self.OUTPUT_TYPES))
510 else:
511 self.output_type = output_type
512
513 if output_order:
514 output_order = string.upper(output_order)
515 if output_order not in self.OUTPUT_ORDER:
516 raise ValueError("Invalid output order %s. Valid choices are %s"
517 % (output_order, self.OUTPUT_ORDER))
518 else:
519 self.output_order = output_order
520
521 if change_case:
522 change_case = string.upper(change_case)
523 if output_type != "GDE":
524 raise ValueError("Change case only valid for GDE output.")
525 elif change_case not in self.CHANGE_CASE:
526 raise ValueError("Invalid change case %s. Valid choices are %s"
527 % (change_case, self.CHANGE_CASE))
528 else:
529 self.change_case = change_case
530
531 if add_seqnos:
532 add_seqnos = string.upper(add_seqnos)
533 if output_type:
534 raise ValueError("Add SeqNos only valid for CLUSTAL output.")
535 elif add_seqnos not in self.OUTPUT_SEQNOS:
536 raise ValueError("Invalid seqnos option %s. Valid choices: %s"
537 % (add_seqnos, self.OUTPUT_SEQNOS))
538 else:
539 self.add_seqnos = add_seqnos
540
542 """Provide a file to use as the guide tree for alignment.
543
544 Raises:
545 o IOError - If the tree_file doesn't exist."""
546 if not(os.path.exists(tree_file)):
547 raise IOError("Could not find the guide tree file %s." %
548 tree_file)
549 else:
550 self.guide_tree = tree_file
551
553 """Set the name of the guide tree file generated in the alignment.
554 """
555 self.new_tree = tree_file
556
558 """Set the type of protein matrix to use.
559
560 Protein matrix can be either one of the defined types (blosum, pam,
561 gonnet or id) or a file with your own defined matrix.
562 """
563 if string.upper(protein_matrix) in self.PROTEIN_MATRIX:
564 self.protein_matrix = string.upper(protein_matrix)
565 elif os.path.exists(protein_matrix):
566 self.protein_matrix = protein_matrix
567 else:
568 raise ValueError("Invalid matrix %s. Options are %s or a file." %
569 (string.upper(protein_matrix),
570 self.PROTEIN_MATRIX))
571
573 """Set the type of DNA matrix to use.
574
575 The dna_matrix can either be one of the defined types (iub or clustalw)
576 or a file with the matrix to use."""
577 if string.upper(dna_matrix) in self.DNA_MATRIX:
578 self.dna_matrix = string.upper(dna_matrix)
579 elif os.path.exists(dna_matrix):
580 self.dna_matrix = dna_matrix
581 else:
582 raise ValueError("Invalid matrix %s. Options are %s or a file." %
583 (dna_matrix, self.DNA_MATRIX))
584
586 """Set the type of residues within the file.
587
588 Clustal tries to guess whether the info is protein or DNA based on
589 the number of GATCs, but this can be wrong if you have a messed up
590 protein or DNA you are working with, so this allows you to set it
591 explicitly.
592 """
593 residue_type = string.upper(residue_type)
594 if residue_type in self.RESIDUE_TYPES:
595 self.type = residue_type
596 else:
597 raise ValueError("Invalid residue type %s. Valid choices are %s"
598 % (residue_type, self.RESIDUE_TYPES))
599