1 """Extract information from alignment objects.
2
3 In order to try and avoid huge alignment objects with tons of functions,
4 functions which return summary type information about alignments should
5 be put into classes in this module.
6
7 classes:
8 o SummaryInfo
9 o PSSM
10 """
11
12
13
14 import string
15 import math
16 import sys
17
18
19 from Bio import Alphabet
20 from Bio.Alphabet import IUPAC
21 from Bio.Seq import Seq
22
23
24
25 Protein20Random = 0.05
26 Nucleotide4Random = 0.25
28 """Calculate summary info about the alignment.
29
30 This class should be used to caclculate information summarizing the
31 results of an alignment. This may either be straight consensus info
32 or more complicated things.
33 """
35 """Initialize with the alignment to calculate information on.
36 ic_vector attribute. A dictionary. Keys: column numbers. Values:
37 """
38 self.alignment = alignment
39 self.ic_vector = {}
40
41 - def dumb_consensus(self, threshold = .7, ambiguous = "X",
42 consensus_alpha = None, require_multiple = 0):
43 """Output a fast consensus sequence of the alignment.
44
45 This doesn't do anything fancy at all. It will just go through the
46 sequence residue by residue and count up the number of each type
47 of residue (ie. A or G or T or C for DNA) in all sequences in the
48 alignment. If the percentage of the most common residue type is
49 greater then the passed threshold, then we will add that residue type,
50 otherwise an ambiguous character will be added.
51
52 This could be made a lot fancier (ie. to take a substitution matrix
53 into account), but it just meant for a quick and dirty consensus.
54
55 Arguments:
56 o threshold - The threshold value that is required to add a particular
57 atom.
58 o ambiguous - The ambiguous character to be added when the threshold is
59 not reached.
60 o consensus_alpha - The alphabet to return for the consensus sequence.
61 If this is None, then we will try to guess the alphabet.
62 o require_multiple - If set as 1, this will require that more than
63 1 sequence be part of an alignment to put it in the consensus (ie.
64 not just 1 sequence and gaps).
65 """
66
67 consensus = ''
68
69
70 con_len = self.alignment.get_alignment_length()
71
72
73 for n in range(con_len):
74
75 atom_dict = {}
76 num_atoms = 0
77
78 for record in self.alignment._records:
79
80
81 if n < len(record.seq):
82 if record.seq[n] != '-' and record.seq[n] != '.':
83 if record.seq[n] not in atom_dict.keys():
84 atom_dict[record.seq[n]] = 1
85 else:
86 atom_dict[record.seq[n]] = \
87 atom_dict[record.seq[n]] + 1
88
89 num_atoms = num_atoms + 1
90
91 max_atoms = []
92 max_size = 0
93
94 for atom in atom_dict.keys():
95 if atom_dict[atom] > max_size:
96 max_atoms = [atom]
97 max_size = atom_dict[atom]
98 elif atom_dict[atom] == max_size:
99 max_atoms.append(atom)
100
101 if require_multiple and num_atoms == 1:
102 consensus = consensus + ambiguous
103 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
104 >= threshold):
105 consensus = consensus + max_atoms[0]
106 else:
107 consensus = consensus + ambiguous
108
109
110 if consensus_alpha is None:
111 consensus_alpha = self._guess_consensus_alphabet()
112
113 return Seq(consensus, consensus_alpha)
114
115 - def gap_consensus(self, threshold = .7, ambiguous = "X",
116 consensus_alpha = None, require_multiple = 0):
117 """Same as dumb_consensus(), but allows gap on the output.
118
119 Things to do: Let the user define that with only one gap, the result
120 character in consensus is gap. Let the user select gap character, now
121 it takes the same is input.
122 """
123
124 consensus = ''
125
126
127 con_len = self.alignment.get_alignment_length()
128
129
130 for n in range(con_len):
131
132 atom_dict = {}
133 num_atoms = 0
134
135 for record in self.alignment._records:
136
137
138 if n < len(record.seq):
139 if record.seq[n] not in atom_dict.keys():
140 atom_dict[record.seq[n]] = 1
141 else:
142 atom_dict[record.seq[n]] = \
143 atom_dict[record.seq[n]] + 1
144
145 num_atoms = num_atoms + 1
146
147 max_atoms = []
148 max_size = 0
149
150 for atom in atom_dict.keys():
151 if atom_dict[atom] > max_size:
152 max_atoms = [atom]
153 max_size = atom_dict[atom]
154 elif atom_dict[atom] == max_size:
155 max_atoms.append(atom)
156
157 if require_multiple and num_atoms == 1:
158 consensus = consensus + ambiguous
159 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
160 >= threshold):
161 consensus = consensus + max_atoms[0]
162 else:
163 consensus = consensus + ambiguous
164
165
166 if consensus_alpha is None:
167 consensus_alpha = self._guess_consensus_alphabet()
168
169 return Seq(consensus, consensus_alpha)
170
206
208 """Generate a replacement dictionary to plug into a substitution matrix
209
210 This should look at an alignment, and be able to generate the number
211 of substitutions of different residues for each other in the
212 aligned object.
213
214 Will then return a dictionary with this information:
215 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
216
217 This also treats weighted sequences. The following example shows how
218 we calculate the replacement dictionary. Given the following
219 multiple sequence alignments:
220
221 GTATC 0.5
222 AT--C 0.8
223 CTGTC 1.0
224
225 For the first column we have:
226 ('A', 'G') : 0.5 * 0.8 = 0.4
227 ('C', 'G') : 0.5 * 1.0 = 0.5
228 ('A', 'C') : 0.8 * 1.0 = 0.8
229
230 We then continue this for all of the columns in the alignment, summing
231 the information for each substitution in each column, until we end
232 up with the replacement dictionary.
233
234 Arguments:
235 o skip_chars - A list of characters to skip when creating the dictionary.
236 For instance, you might have Xs (screened stuff) or Ns, and not want
237 to include the ambiguity characters in the dictionary.
238 """
239
240 rep_dict, skip_items = self._get_base_replacements(skip_chars)
241
242
243 for rec_num1 in range(len(self.alignment._records)):
244
245
246 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
247
248
249 rep_dict = self._pair_replacement(
250 self.alignment._records[rec_num1].seq,
251 self.alignment._records[rec_num2].seq,
252 self.alignment._records[rec_num1].annotations['weight'],
253 self.alignment._records[rec_num2].annotations['weight'],
254 rep_dict, skip_items)
255
256 return rep_dict
257
258 - def _pair_replacement(self, seq1, seq2, weight1, weight2,
259 start_dict, ignore_chars):
260 """Compare two sequences and generate info on the replacements seen.
261
262 Arguments:
263 o seq1, seq2 - The two sequences to compare.
264 o weight1, weight2 - The relative weights of seq1 and seq2.
265 o start_dict - The dictionary containing the starting replacement
266 info that we will modify.
267 o ignore_chars - A list of characters to ignore when calculating
268 replacements (ie. '-').
269
270 Returns:
271 o A replacment dictionary which is modified from initial_dict with
272 the information from the sequence comparison.
273 """
274
275 for residue_num in range(len(seq1)):
276 residue1 = seq1[residue_num]
277 try:
278 residue2 = seq2[residue_num]
279
280
281 except IndexError:
282 return start_dict
283
284
285 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
286 try:
287
288
289 start_dict[(residue1, residue2)] = \
290 start_dict[(residue1, residue2)] + \
291 weight1 * weight2
292
293
294 except KeyError:
295 raise ValueError("Residues %s, %s not found in alphabet %s"
296 % (residue1, residue2,
297 self.alignment._alphabet))
298
299 return start_dict
300
301
303 """Get a zeroed dictonary of all possible letter combinations.
304
305 This looks at the type of alphabet and gets the letters for it.
306 It then creates a dictionary with all possible combinations of these
307 letters as keys (ie. ('A', 'G')) and sets the values as zero.
308
309 Returns:
310 o The base dictionary created
311 o A list of alphabet items to skip when filling the dictionary.Right
312 now the only thing I can imagine in this list is gap characters, but
313 maybe X's or something else might be useful later. This will also
314 include any characters that are specified to be skipped.
315 """
316 base_dictionary = {}
317 all_letters = self.alignment._alphabet.letters
318
319
320
321 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
322 skip_items.append(self.alignment._alphabet.gap_char)
323 all_letters = string.replace(all_letters,
324 self.alignment._alphabet.gap_char,
325 '')
326
327
328 for first_letter in all_letters:
329 for second_letter in all_letters:
330 if (first_letter not in skip_items and
331 second_letter not in skip_items):
332 base_dictionary[(first_letter, second_letter)] = 0
333
334 return base_dictionary, skip_items
335
336
339 """Create a position specific score matrix object for the alignment.
340
341 This creates a position specific score matrix (pssm) which is an
342 alternative method to look at a consensus sequence.
343
344 Arguments:
345 o chars_to_ignore - A listing of all characters not to include in
346 the pssm. By default, gap characters will be excluded.
347 o axis_seq - An optional argument specifying the sequence to
348 put on the axis of the PSSM. This should be a Seq object. If nothing
349 is specified, the consensus sequence, calculated with default
350 parameters, will be used.
351
352 Returns:
353 o A PSSM (position specific score matrix) object.
354 """
355
356 all_letters = self.alignment._alphabet.letters
357
358
359 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
360 chars_to_ignore.append(self.alignment._alphabet.gap_char)
361
362 for char in chars_to_ignore:
363 all_letters = string.replace(all_letters, char, '')
364
365 if axis_seq:
366 left_seq = axis_seq
367 else:
368 left_seq = self.dumb_consensus()
369
370 pssm_info = []
371
372 for residue_num in range(len(left_seq)):
373 score_dict = self._get_base_letters(all_letters)
374 for record in self.alignment._records:
375 try:
376 this_residue = record.seq[residue_num]
377
378
379 except IndexError:
380 this_residue = None
381
382 if this_residue and this_residue not in chars_to_ignore:
383 try:
384 score_dict[this_residue] = score_dict[this_residue] + \
385 record.annotations['weight']
386
387
388 except KeyError:
389 raise ValueError("Residue %s not found in alphabet %s"
390 % (this_residue,
391 self.alignment._alphabet))
392
393 pssm_info.append((left_seq[residue_num],
394 score_dict))
395
396
397 return PSSM(pssm_info)
398
400 """Create a zeroed dictionary with all of the specified letters.
401 """
402 base_info = {}
403 for letter in letters:
404 base_info[letter] = 0
405
406 return base_info
407
408 - def information_content(self, start = 0,
409 end = None,
410 e_freq_table = None, log_base = 2,
411 chars_to_ignore = []):
412 """Calculate the information content for each residue along an alignment.
413
414 Arguments:
415 o start, end - The starting an ending points to calculate the
416 information content. These points should be relative to the first
417 sequence in the alignment, starting at zero (ie. even if the 'real'
418 first position in the seq is 203 in the initial sequence, for
419 the info content, we need to use zero). This defaults to the entire
420 length of the first sequence.
421 o e_freq_table - A FreqTable object specifying the expected frequencies
422 for each letter in the alphabet we are using (ie. {'G' : 0.4,
423 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
424 included, since these should not have expected frequencies.
425 o log_base - The base of the logathrim to use in calculating the
426 information content. This defaults to 2 so the info is in bits.
427 o chars_to_ignore - A listing of characterw which should be ignored
428 in calculating the info content.
429
430 Returns:
431 o A number representing the info content for the specified region.
432
433 Please see the Biopython manual for more information on how information
434 content is calculated.
435 """
436
437 if end is None:
438 end = len(self.alignment._records[0].seq)
439
440 if start < 0 or end > len(self.alignment._records[0].seq):
441 raise ValueError \
442 ("Start (%s) and end (%s) are not in the range %s to %s"
443 % (start, end, 0, len(self.alignment._records[0].seq)))
444
445
446
447 if not e_freq_table:
448 if isinstance(self.alignment._alphabet,
449 Alphabet.ProteinAlphabet):
450 random_expected = Protein20Random
451 elif isinstance(self.alignment._alphabet, Alphabet.RNAAlphabet) or \
452 isinstance(self.alignment._alphabet, Alphabet.DNAAlphabet):
453 random_expected = Nucleotide4Random
454
455 elif isinstance(self.alignment._alphabet, Alphabet.AlphabetEncoder):
456 if isinstance(self.alignment._alphabet.alphabet, Alphabet.ProteinAlphabet):
457 random_expected = Protein20Random
458 elif isinstance(self.alignment._alphabet.alphabet, Alphabet.RNAAlphabet) or \
459 isinstance(self.alignment._alphabet.alphabet, Alphabet.DNAAlphabet):
460 random_expected = Nucleotide4Random
461 else:
462 errstr = "Error in alphabet: not Nucleotide or Protein, "
463 errstr += "supply expected frequencies"
464 raise ValueError, errstr
465 else:
466 random_expected = None
467
468 all_letters = self.alignment._alphabet.letters
469 for char in chars_to_ignore:
470 all_letters = string.replace(all_letters, char, '')
471
472 info_content = {}
473 for residue_num in range(start, end):
474 freq_dict = self._get_letter_freqs(residue_num,
475 self.alignment._records,
476 all_letters, chars_to_ignore)
477
478 column_score = self._get_column_info_content(freq_dict,
479 e_freq_table,
480 log_base,
481 random_expected)
482
483 info_content[residue_num] = column_score
484
485 total_info = 0
486 for column_info in info_content.values():
487 total_info = total_info + column_info
488
489 for i in info_content.keys():
490 self.ic_vector[i] = info_content[i]
491 return total_info
492
494 """Determine the frequency of specific letters in the alignment.
495
496 Arguments:
497 o residue_num - The number of the column we are getting frequencies
498 from.
499 o all_records - All of the SeqRecords in the alignment.
500 o letters - The letters we are interested in getting the frequency
501 for.
502 o to_ignore - Letters we are specifically supposed to ignore.
503
504 This will calculate the frequencies of each of the specified letters
505 in the alignment at the given frequency, and return this as a
506 dictionary where the keys are the letters and the values are the
507 frequencies.
508 """
509 freq_info = self._get_base_letters(letters)
510
511 total_count = 0
512
513 for record in all_records:
514 try:
515 if record.seq[residue_num] not in to_ignore:
516 freq_info[record.seq[residue_num]] = \
517 freq_info[record.seq[residue_num]] + \
518 record.annotations['weight']
519
520 total_count = total_count + record.annotations['weight']
521
522 except KeyError:
523 raise ValueError("Residue %s not found in alphabet %s"
524 % (record.seq[residue_num],
525 self.alignment._alphabet))
526
527
528 for letter in freq_info.keys():
529 freq_info[letter] = freq_info[letter] / total_count
530
531 return freq_info
532
533 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
534 random_expected):
535 """Calculate the information content for a column.
536
537 Arguments:
538 o obs_freq - The frequencies observed for each letter in the column.
539 o e_freq_table - An optional argument specifying the expected
540 frequencies for each letter. This is a SubsMat.FreqTable instance.
541 o log_base - The base of the logathrim to use in calculating the
542 info content.
543 """
544 if e_freq_table:
545
546 for key in obs_freq.keys():
547 if (key != self.alignment._alphabet.gap_char and
548 key not in e_freq_table):
549 raise ValueError("Expected frequency letters %s" +
550 " do not match observed %s"
551 % (e_freq_table.keys(), obs_freq.keys() -
552 [self.alignment._alphabet.gap_char]))
553
554 total_info = 0
555
556 for letter in obs_freq.keys():
557 inner_log = 0.
558
559
560
561 if letter != self.alignment._alphabet.gap_char:
562 if e_freq_table:
563 inner_log = obs_freq[letter] / e_freq_table[letter]
564 else:
565 inner_log = obs_freq[letter] / random_expected
566
567
568 if inner_log > 0:
569 letter_info = (obs_freq[letter] *
570 math.log(inner_log) / math.log(log_base))
571 total_info = total_info + letter_info
572 return total_info
573
576
578 """Represent a position specific score matrix.
579
580 This class is meant to make it easy to access the info within a PSSM
581 and also make it easy to print out the information in a nice table.
582
583 Let's say you had an alignment like this:
584 GTATC
585 AT--C
586 CTGTC
587
588 The position specific score matrix (when printed) looks like:
589
590 G A T C
591 G 1 1 0 1
592 T 0 0 3 0
593 A 1 1 0 0
594 T 0 0 2 0
595 C 0 0 0 3
596
597 You can access a single element of the PSSM using the following:
598
599 your_pssm[sequence_number][residue_count_name]
600
601 For instance, to get the 'T' residue for the second element in the
602 above alignment you would need to do:
603
604 your_pssm[1]['T']
605 """
607 """Initialize with pssm data to represent.
608
609 The pssm passed should be a list with the following structure:
610
611 list[0] - The letter of the residue being represented (for instance,
612 from the example above, the first few list[0]s would be GTAT...
613 list[1] - A dictionary with the letter substitutions and counts.
614 """
615 self.pssm = pssm
616
618 return self.pssm[pos][1]
619
621 out = " "
622 all_residues = self.pssm[0][1].keys()
623 all_residues.sort()
624
625
626 for res in all_residues:
627 out = out + " %s" % res
628 out = out + "\n"
629
630
631 for item in self.pssm:
632 out = out + "%s " % item[0]
633 for res in all_residues:
634 out = out + " %.1f" % item[1][res]
635
636 out = out + "\n"
637 return out
638
640 """Return the residue letter at the specified position.
641 """
642 return self.pssm[pos][0]
643
644
645 -def print_info_content(summary_info,fout=None,rep_record=0):
646 """ Three column output: position, aa in representative sequence,
647 ic_vector value"""
648 fout = fout or sys.stdout
649 if not summary_info.ic_vector:
650 summary_info.information_content()
651 rep_sequence = summary_info.alignment._records[rep_record].seq
652 positions = summary_info.ic_vector.keys()
653 positions.sort()
654 for pos in positions:
655 fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
656 summary_info.ic_vector[pos]))
657