Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  """Extract information from alignment objects. 
  2   
  3  In order to try and avoid huge alignment objects with tons of functions, 
  4  functions which return summary type information about alignments should 
  5  be put into classes in this module. 
  6   
  7  classes: 
  8  o SummaryInfo 
  9  o PSSM 
 10  """ 
 11   
 12   
 13  # standard library 
 14  import string 
 15  import math 
 16  import sys 
 17   
 18  # biopython modules 
 19  from Bio import Alphabet 
 20  from Bio.Alphabet import IUPAC 
 21  from Bio.Seq import Seq 
 22   
 23  # Expected random distributions for 20-letter protein, and 
 24  # for 4-letter nucleotide alphabets 
 25  Protein20Random = 0.05 
 26  Nucleotide4Random = 0.25 
27 -class SummaryInfo:
28 """Calculate summary info about the alignment. 29 30 This class should be used to caclculate information summarizing the 31 results of an alignment. This may either be straight consensus info 32 or more complicated things. 33 """
34 - def __init__(self, alignment):
35 """Initialize with the alignment to calculate information on. 36 ic_vector attribute. A dictionary. Keys: column numbers. Values: 37 """ 38 self.alignment = alignment 39 self.ic_vector = {}
40
41 - def dumb_consensus(self, threshold = .7, ambiguous = "X", 42 consensus_alpha = None, require_multiple = 0):
43 """Output a fast consensus sequence of the alignment. 44 45 This doesn't do anything fancy at all. It will just go through the 46 sequence residue by residue and count up the number of each type 47 of residue (ie. A or G or T or C for DNA) in all sequences in the 48 alignment. If the percentage of the most common residue type is 49 greater then the passed threshold, then we will add that residue type, 50 otherwise an ambiguous character will be added. 51 52 This could be made a lot fancier (ie. to take a substitution matrix 53 into account), but it just meant for a quick and dirty consensus. 54 55 Arguments: 56 o threshold - The threshold value that is required to add a particular 57 atom. 58 o ambiguous - The ambiguous character to be added when the threshold is 59 not reached. 60 o consensus_alpha - The alphabet to return for the consensus sequence. 61 If this is None, then we will try to guess the alphabet. 62 o require_multiple - If set as 1, this will require that more than 63 1 sequence be part of an alignment to put it in the consensus (ie. 64 not just 1 sequence and gaps). 65 """ 66 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 67 consensus = '' 68 69 # find the length of the consensus we are creating 70 con_len = self.alignment.get_alignment_length() 71 72 # go through each seq item 73 for n in range(con_len): 74 # keep track of the counts of the different atoms we get 75 atom_dict = {} 76 num_atoms = 0 77 78 for record in self.alignment._records: 79 # make sure we haven't run past the end of any sequences 80 # if they are of different lengths 81 if n < len(record.seq): 82 if record.seq[n] != '-' and record.seq[n] != '.': 83 if record.seq[n] not in atom_dict.keys(): 84 atom_dict[record.seq[n]] = 1 85 else: 86 atom_dict[record.seq[n]] = \ 87 atom_dict[record.seq[n]] + 1 88 89 num_atoms = num_atoms + 1 90 91 max_atoms = [] 92 max_size = 0 93 94 for atom in atom_dict.keys(): 95 if atom_dict[atom] > max_size: 96 max_atoms = [atom] 97 max_size = atom_dict[atom] 98 elif atom_dict[atom] == max_size: 99 max_atoms.append(atom) 100 101 if require_multiple and num_atoms == 1: 102 consensus = consensus + ambiguous 103 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 104 >= threshold): 105 consensus = consensus + max_atoms[0] 106 else: 107 consensus = consensus + ambiguous 108 109 # we need to guess a consensus alphabet if one isn't specified 110 if consensus_alpha is None: 111 consensus_alpha = self._guess_consensus_alphabet() 112 113 return Seq(consensus, consensus_alpha)
114
115 - def gap_consensus(self, threshold = .7, ambiguous = "X", 116 consensus_alpha = None, require_multiple = 0):
117 """Same as dumb_consensus(), but allows gap on the output. 118 119 Things to do: Let the user define that with only one gap, the result 120 character in consensus is gap. Let the user select gap character, now 121 it takes the same is input. 122 """ 123 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 124 consensus = '' 125 126 # find the length of the consensus we are creating 127 con_len = self.alignment.get_alignment_length() 128 129 # go through each seq item 130 for n in range(con_len): 131 # keep track of the counts of the different atoms we get 132 atom_dict = {} 133 num_atoms = 0 134 135 for record in self.alignment._records: 136 # make sure we haven't run past the end of any sequences 137 # if they are of different lengths 138 if n < len(record.seq): 139 if record.seq[n] not in atom_dict.keys(): 140 atom_dict[record.seq[n]] = 1 141 else: 142 atom_dict[record.seq[n]] = \ 143 atom_dict[record.seq[n]] + 1 144 145 num_atoms = num_atoms + 1 146 147 max_atoms = [] 148 max_size = 0 149 150 for atom in atom_dict.keys(): 151 if atom_dict[atom] > max_size: 152 max_atoms = [atom] 153 max_size = atom_dict[atom] 154 elif atom_dict[atom] == max_size: 155 max_atoms.append(atom) 156 157 if require_multiple and num_atoms == 1: 158 consensus = consensus + ambiguous 159 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 160 >= threshold): 161 consensus = consensus + max_atoms[0] 162 else: 163 consensus = consensus + ambiguous 164 165 # we need to guess a consensus alphabet if one isn't specified 166 if consensus_alpha is None: 167 consensus_alpha = self._guess_consensus_alphabet() 168 169 return Seq(consensus, consensus_alpha)
170
172 """Pick an alphabet for a consesus sequence of an alignment. 173 174 This just looks at the sequences we have, checks their type, and 175 returns as appropriate type which seems to make sense with the 176 sequences we've got. 177 """ 178 # check to be sure our alphabet is a gapped alphabet, otherwise 179 # we have a weird bunch of sequences -- this would make it harder 180 # to find an alphabet (since we'd have to check non-gapped types 181 # as well). 182 # Right now we'll raise an error for non-gapped alphabets, and 183 # if anyone complains, we can figure out what the use is for 184 # non-gapped alphabets. 185 186 if not(isinstance(self.alignment._records[0].seq.alphabet, 187 Alphabet.Gapped)): 188 raise ValueError \ 189 ("Non-gapped alphabet found in alignment object.") 190 191 # now check the types, and get an ambiguous alphabet based on the 192 # type of alphabet found. 193 if isinstance(self.alignment._records[0].seq.alphabet.alphabet, 194 Alphabet.ProteinAlphabet): 195 alpha = IUPAC.protein 196 elif isinstance(self.alignment._records[0].seq.alphabet.alphabet, 197 Alphabet.DNAAlphabet): 198 alpha = IUPAC.ambiguous_dna 199 elif isinstance(self.alignment._records[0].seq.alphabet.alphabet, 200 Alphabet.RNAAlphabet): 201 alpha = IUPAC.ambiguous_rna 202 else: 203 raise ValueError("Could not determine the type of alphabet.") 204 205 return alpha
206
207 - def replacement_dictionary(self, skip_chars = []):
208 """Generate a replacement dictionary to plug into a substitution matrix 209 210 This should look at an alignment, and be able to generate the number 211 of substitutions of different residues for each other in the 212 aligned object. 213 214 Will then return a dictionary with this information: 215 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 216 217 This also treats weighted sequences. The following example shows how 218 we calculate the replacement dictionary. Given the following 219 multiple sequence alignments: 220 221 GTATC 0.5 222 AT--C 0.8 223 CTGTC 1.0 224 225 For the first column we have: 226 ('A', 'G') : 0.5 * 0.8 = 0.4 227 ('C', 'G') : 0.5 * 1.0 = 0.5 228 ('A', 'C') : 0.8 * 1.0 = 0.8 229 230 We then continue this for all of the columns in the alignment, summing 231 the information for each substitution in each column, until we end 232 up with the replacement dictionary. 233 234 Arguments: 235 o skip_chars - A list of characters to skip when creating the dictionary. 236 For instance, you might have Xs (screened stuff) or Ns, and not want 237 to include the ambiguity characters in the dictionary. 238 """ 239 # get a starting dictionary based on the alphabet of the alignment 240 rep_dict, skip_items = self._get_base_replacements(skip_chars) 241 242 # iterate through each record 243 for rec_num1 in range(len(self.alignment._records)): 244 # iterate through each record from one beyond the current record 245 # to the end of the list of records 246 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)): 247 # for each pair of records, compare the sequences and add 248 # the pertinent info to the dictionary 249 rep_dict = self._pair_replacement( 250 self.alignment._records[rec_num1].seq, 251 self.alignment._records[rec_num2].seq, 252 self.alignment._records[rec_num1].annotations['weight'], 253 self.alignment._records[rec_num2].annotations['weight'], 254 rep_dict, skip_items) 255 256 return rep_dict
257
258 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 259 start_dict, ignore_chars):
260 """Compare two sequences and generate info on the replacements seen. 261 262 Arguments: 263 o seq1, seq2 - The two sequences to compare. 264 o weight1, weight2 - The relative weights of seq1 and seq2. 265 o start_dict - The dictionary containing the starting replacement 266 info that we will modify. 267 o ignore_chars - A list of characters to ignore when calculating 268 replacements (ie. '-'). 269 270 Returns: 271 o A replacment dictionary which is modified from initial_dict with 272 the information from the sequence comparison. 273 """ 274 # loop through each residue in the sequences 275 for residue_num in range(len(seq1)): 276 residue1 = seq1[residue_num] 277 try: 278 residue2 = seq2[residue_num] 279 # if seq2 is shorter, then we just stop looking at replacements 280 # and return the information 281 except IndexError: 282 return start_dict 283 284 # if the two residues are characters we want to count 285 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 286 try: 287 # add info about the replacement to the dictionary, 288 # modified by the sequence weights 289 start_dict[(residue1, residue2)] = \ 290 start_dict[(residue1, residue2)] + \ 291 weight1 * weight2 292 293 # if we get a key error, then we've got a problem with alphabets 294 except KeyError: 295 raise ValueError("Residues %s, %s not found in alphabet %s" 296 % (residue1, residue2, 297 self.alignment._alphabet)) 298 299 return start_dict
300 301
302 - def _get_base_replacements(self, skip_items = []):
303 """Get a zeroed dictonary of all possible letter combinations. 304 305 This looks at the type of alphabet and gets the letters for it. 306 It then creates a dictionary with all possible combinations of these 307 letters as keys (ie. ('A', 'G')) and sets the values as zero. 308 309 Returns: 310 o The base dictionary created 311 o A list of alphabet items to skip when filling the dictionary.Right 312 now the only thing I can imagine in this list is gap characters, but 313 maybe X's or something else might be useful later. This will also 314 include any characters that are specified to be skipped. 315 """ 316 base_dictionary = {} 317 all_letters = self.alignment._alphabet.letters 318 319 # if we have a gapped alphabet we need to find the gap character 320 # and drop it out 321 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 322 skip_items.append(self.alignment._alphabet.gap_char) 323 all_letters = string.replace(all_letters, 324 self.alignment._alphabet.gap_char, 325 '') 326 327 # now create the dictionary 328 for first_letter in all_letters: 329 for second_letter in all_letters: 330 if (first_letter not in skip_items and 331 second_letter not in skip_items): 332 base_dictionary[(first_letter, second_letter)] = 0 333 334 return base_dictionary, skip_items
335 336
337 - def pos_specific_score_matrix(self, axis_seq = None, 338 chars_to_ignore = []):
339 """Create a position specific score matrix object for the alignment. 340 341 This creates a position specific score matrix (pssm) which is an 342 alternative method to look at a consensus sequence. 343 344 Arguments: 345 o chars_to_ignore - A listing of all characters not to include in 346 the pssm. By default, gap characters will be excluded. 347 o axis_seq - An optional argument specifying the sequence to 348 put on the axis of the PSSM. This should be a Seq object. If nothing 349 is specified, the consensus sequence, calculated with default 350 parameters, will be used. 351 352 Returns: 353 o A PSSM (position specific score matrix) object. 354 """ 355 # determine all of the letters we have to deal with 356 all_letters = self.alignment._alphabet.letters 357 358 # if we have a gap char, add it to stuff to ignore 359 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 360 chars_to_ignore.append(self.alignment._alphabet.gap_char) 361 362 for char in chars_to_ignore: 363 all_letters = string.replace(all_letters, char, '') 364 365 if axis_seq: 366 left_seq = axis_seq 367 else: 368 left_seq = self.dumb_consensus() 369 370 pssm_info = [] 371 # now start looping through all of the sequences and getting info 372 for residue_num in range(len(left_seq)): 373 score_dict = self._get_base_letters(all_letters) 374 for record in self.alignment._records: 375 try: 376 this_residue = record.seq[residue_num] 377 # if we hit an index error we've run out of sequence and 378 # should not add new residues 379 except IndexError: 380 this_residue = None 381 382 if this_residue and this_residue not in chars_to_ignore: 383 try: 384 score_dict[this_residue] = score_dict[this_residue] + \ 385 record.annotations['weight'] 386 387 # if we get a KeyError then we have an alphabet problem 388 except KeyError: 389 raise ValueError("Residue %s not found in alphabet %s" 390 % (this_residue, 391 self.alignment._alphabet)) 392 393 pssm_info.append((left_seq[residue_num], 394 score_dict)) 395 396 397 return PSSM(pssm_info)
398
399 - def _get_base_letters(self, letters):
400 """Create a zeroed dictionary with all of the specified letters. 401 """ 402 base_info = {} 403 for letter in letters: 404 base_info[letter] = 0 405 406 return base_info
407
408 - def information_content(self, start = 0, 409 end = None, 410 e_freq_table = None, log_base = 2, 411 chars_to_ignore = []):
412 """Calculate the information content for each residue along an alignment. 413 414 Arguments: 415 o start, end - The starting an ending points to calculate the 416 information content. These points should be relative to the first 417 sequence in the alignment, starting at zero (ie. even if the 'real' 418 first position in the seq is 203 in the initial sequence, for 419 the info content, we need to use zero). This defaults to the entire 420 length of the first sequence. 421 o e_freq_table - A FreqTable object specifying the expected frequencies 422 for each letter in the alphabet we are using (ie. {'G' : 0.4, 423 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 424 included, since these should not have expected frequencies. 425 o log_base - The base of the logathrim to use in calculating the 426 information content. This defaults to 2 so the info is in bits. 427 o chars_to_ignore - A listing of characterw which should be ignored 428 in calculating the info content. 429 430 Returns: 431 o A number representing the info content for the specified region. 432 433 Please see the Biopython manual for more information on how information 434 content is calculated. 435 """ 436 # if no end was specified, then we default to the end of the sequence 437 if end is None: 438 end = len(self.alignment._records[0].seq) 439 440 if start < 0 or end > len(self.alignment._records[0].seq): 441 raise ValueError \ 442 ("Start (%s) and end (%s) are not in the range %s to %s" 443 % (start, end, 0, len(self.alignment._records[0].seq))) 444 # determine random expected frequencies, if necessary 445 # Iddo Friedberg, 1-JUL-2004: fixed self.alignment._alphabet.alphabet to 446 # self.alignment._alphabet 447 if not e_freq_table: 448 if isinstance(self.alignment._alphabet, 449 Alphabet.ProteinAlphabet): 450 random_expected = Protein20Random 451 elif isinstance(self.alignment._alphabet, Alphabet.RNAAlphabet) or \ 452 isinstance(self.alignment._alphabet, Alphabet.DNAAlphabet): 453 random_expected = Nucleotide4Random 454 # Iddo, 15-FEB-2005: added the following for encoded Alpahbets (e.g. Gapped()) 455 elif isinstance(self.alignment._alphabet, Alphabet.AlphabetEncoder): 456 if isinstance(self.alignment._alphabet.alphabet, Alphabet.ProteinAlphabet): 457 random_expected = Protein20Random 458 elif isinstance(self.alignment._alphabet.alphabet, Alphabet.RNAAlphabet) or \ 459 isinstance(self.alignment._alphabet.alphabet, Alphabet.DNAAlphabet): 460 random_expected = Nucleotide4Random 461 else: 462 errstr = "Error in alphabet: not Nucleotide or Protein, " 463 errstr += "supply expected frequencies" 464 raise ValueError, errstr 465 else: 466 random_expected = None 467 # determine all of the letters we have to deal with 468 all_letters = self.alignment._alphabet.letters 469 for char in chars_to_ignore: 470 all_letters = string.replace(all_letters, char, '') 471 472 info_content = {} 473 for residue_num in range(start, end): 474 freq_dict = self._get_letter_freqs(residue_num, 475 self.alignment._records, 476 all_letters, chars_to_ignore) 477 # print freq_dict, 478 column_score = self._get_column_info_content(freq_dict, 479 e_freq_table, 480 log_base, 481 random_expected) 482 483 info_content[residue_num] = column_score 484 # sum up the score 485 total_info = 0 486 for column_info in info_content.values(): 487 total_info = total_info + column_info 488 # fill in the ic_vector member: holds IC for each column 489 for i in info_content.keys(): 490 self.ic_vector[i] = info_content[i] 491 return total_info
492
493 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
494 """Determine the frequency of specific letters in the alignment. 495 496 Arguments: 497 o residue_num - The number of the column we are getting frequencies 498 from. 499 o all_records - All of the SeqRecords in the alignment. 500 o letters - The letters we are interested in getting the frequency 501 for. 502 o to_ignore - Letters we are specifically supposed to ignore. 503 504 This will calculate the frequencies of each of the specified letters 505 in the alignment at the given frequency, and return this as a 506 dictionary where the keys are the letters and the values are the 507 frequencies. 508 """ 509 freq_info = self._get_base_letters(letters) 510 511 total_count = 0 512 # collect the count info into the dictionary for all the records 513 for record in all_records: 514 try: 515 if record.seq[residue_num] not in to_ignore: 516 freq_info[record.seq[residue_num]] = \ 517 freq_info[record.seq[residue_num]] + \ 518 record.annotations['weight'] 519 520 total_count = total_count + record.annotations['weight'] 521 # getting a key error means we've got a problem with the alphabet 522 except KeyError: 523 raise ValueError("Residue %s not found in alphabet %s" 524 % (record.seq[residue_num], 525 self.alignment._alphabet)) 526 527 # now convert the counts into frequencies 528 for letter in freq_info.keys(): 529 freq_info[letter] = freq_info[letter] / total_count 530 531 return freq_info
532
533 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 534 random_expected):
535 """Calculate the information content for a column. 536 537 Arguments: 538 o obs_freq - The frequencies observed for each letter in the column. 539 o e_freq_table - An optional argument specifying the expected 540 frequencies for each letter. This is a SubsMat.FreqTable instance. 541 o log_base - The base of the logathrim to use in calculating the 542 info content. 543 """ 544 if e_freq_table: 545 # check the expected freq information to make sure it is good 546 for key in obs_freq.keys(): 547 if (key != self.alignment._alphabet.gap_char and 548 key not in e_freq_table): 549 raise ValueError("Expected frequency letters %s" + 550 " do not match observed %s" 551 % (e_freq_table.keys(), obs_freq.keys() - 552 [self.alignment._alphabet.gap_char])) 553 554 total_info = 0 555 556 for letter in obs_freq.keys(): 557 inner_log = 0. 558 # if we have expected frequencies, modify the log value by them 559 # gap characters do not have expected frequencies, so they 560 # should just be the observed frequency. 561 if letter != self.alignment._alphabet.gap_char: 562 if e_freq_table: 563 inner_log = obs_freq[letter] / e_freq_table[letter] 564 else: 565 inner_log = obs_freq[letter] / random_expected 566 # if the observed frequency is zero, we don't add any info to the 567 # total information content 568 if inner_log > 0: 569 letter_info = (obs_freq[letter] * 570 math.log(inner_log) / math.log(log_base)) 571 total_info = total_info + letter_info 572 return total_info
573
574 - def get_column(self,col):
575 return self.alignment.get_column(col)
576
577 -class PSSM:
578 """Represent a position specific score matrix. 579 580 This class is meant to make it easy to access the info within a PSSM 581 and also make it easy to print out the information in a nice table. 582 583 Let's say you had an alignment like this: 584 GTATC 585 AT--C 586 CTGTC 587 588 The position specific score matrix (when printed) looks like: 589 590 G A T C 591 G 1 1 0 1 592 T 0 0 3 0 593 A 1 1 0 0 594 T 0 0 2 0 595 C 0 0 0 3 596 597 You can access a single element of the PSSM using the following: 598 599 your_pssm[sequence_number][residue_count_name] 600 601 For instance, to get the 'T' residue for the second element in the 602 above alignment you would need to do: 603 604 your_pssm[1]['T'] 605 """
606 - def __init__(self, pssm):
607 """Initialize with pssm data to represent. 608 609 The pssm passed should be a list with the following structure: 610 611 list[0] - The letter of the residue being represented (for instance, 612 from the example above, the first few list[0]s would be GTAT... 613 list[1] - A dictionary with the letter substitutions and counts. 614 """ 615 self.pssm = pssm
616
617 - def __getitem__(self, pos):
618 return self.pssm[pos][1]
619
620 - def __str__(self):
621 out = " " 622 all_residues = self.pssm[0][1].keys() 623 all_residues.sort() 624 625 # first print out the top header 626 for res in all_residues: 627 out = out + " %s" % res 628 out = out + "\n" 629 630 # for each item, write out the substitutions 631 for item in self.pssm: 632 out = out + "%s " % item[0] 633 for res in all_residues: 634 out = out + " %.1f" % item[1][res] 635 636 out = out + "\n" 637 return out
638
639 - def get_residue(self, pos):
640 """Return the residue letter at the specified position. 641 """ 642 return self.pssm[pos][0]
643 644 657