1 """Substitution matrices, log odds matrices, and operations on them.
2 """
3 import re
4 import string
5 import sys
6 import copy
7 import math
8
9 from Bio import Alphabet
10 from Bio.SubsMat import FreqTable
11
12
13 log = math.log
14
15 NOTYPE = 0
16 ACCREP = 1
17 OBSFREQ = 2
18 SUBS = 3
19 EXPFREQ = 4
20 LO = 5
21 EPSILON = 0.00000000000001
23 """Exception raised when verifying a matrix"""
26 BadMatrixError = BadMatrix()
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
130 """A Generic sequence matrix class
131 The key is a 2-tuple containing the letter indices of the matrix. Those
132 should be sorted in the tuple (low, high). Because each matrix is dealt
133 with as a half-matrix."""
134
136 ab_dict = {}
137 s = ''
138 for i in self.keys():
139 ab_dict[i[0]] = 1
140 ab_dict[i[1]] = 1
141 letters_list = ab_dict.keys()
142 letters_list.sort()
143 for i in letters_list:
144 s = s + i
145 self.alphabet.letters = s
146
147 - def __init__(self,data=None, alphabet=None,
148 mat_type=NOTYPE,mat_name='',build_later=0):
202
204 keylist = self.keys()
205 for key in keylist:
206 if key[0] > key[1]:
207 self[(key[1],key[0])] = self[key]
208 del self[key]
209
211 """
212 Convert a full-matrix to a half-matrix
213 """
214
215
216
217
218 N = len(self.alphabet.letters)
219
220 if len(self) == N*(N+1)/2:
221 return
222 for i in self.ab_list:
223 for j in self.ab_list[:self.ab_list.index(i)+1]:
224 if i <> j:
225 self[j,i] = self[j,i] + self[i,j]
226 del self[i,j]
227
229 for i in self.ab_list:
230 for j in self.ab_list[:self.ab_list.index(i)+1]:
231 self[j,i] = 0.
232
234 """if this matrix is a log-odds matrix, return its entropy
235 Needs the observed frequency matrix for that"""
236 ent = 0.
237 if self.mat_type == LO:
238 for i in self.keys():
239 ent += obs_freq_mat[i]*self[i]/log(2)
240 elif self.mat_type == SUBS:
241 for i in self.keys():
242 if self[i] > EPSILON:
243 ent += obs_freq_mat[i]*log(self[i])/log(2)
244 else:
245 raise TypeError,"entropy: substitution or log-odds matrices only"
246 self.relative_entropy = ent
247
249 self.entropy = 0
250 for i in self.keys():
251 if self[i] > EPSILON:
252 self.entropy += self[i]*log(self[i])/log(2)
253 self.entropy = -self.entropy
255 assert letter in self.alphabet.letters
256 sum = 0.
257 for i in self.keys():
258 if letter in i:
259 if i[0] == i[1]:
260 sum += self[i]
261 else:
262 sum += (self[i] / 2.)
263 return sum
264
268 - def print_full_mat(self,f=None,format="%4d",topformat="%4s",
269 alphabet=None,factor=1,non_sym=None):
270 f = f or sys.stdout
271
272
273 assert non_sym == None or type(non_sym) == type(1.) or \
274 type(non_sym) == type(1)
275 full_mat = copy.copy(self)
276 for i in self:
277 if i[0] <> i[1]:
278 full_mat[(i[1],i[0])] = full_mat[i]
279 if not alphabet:
280 alphabet = self.ab_list
281 topline = ''
282 for i in alphabet:
283 topline = topline + topformat % i
284 topline = topline + '\n'
285 f.write(topline)
286 for i in alphabet:
287 outline = i
288 for j in alphabet:
289 if alphabet.index(j) > alphabet.index(i) and non_sym <> None:
290 val = non_sym
291 else:
292 val = full_mat[i,j]
293 val *= factor
294 if val <= -999:
295 cur_str = ' ND'
296 else:
297 cur_str = format % val
298
299 outline = outline+cur_str
300 outline = outline+'\n'
301 f.write(outline)
302
303 - def print_mat(self,f=None,format="%4d",bottomformat="%4s",
304 alphabet=None,factor=1):
305 """Print a nice half-matrix. f=sys.stdout to see on the screen
306 User may pass own alphabet, which should contain all letters in the
307 alphabet of the matrix, but may be in a different order. This
308 order will be the order of the letters on the axes"""
309
310 f = f or sys.stdout
311 if not alphabet:
312 alphabet = self.ab_list
313 bottomline = ''
314 for i in alphabet:
315 bottomline = bottomline + bottomformat % i
316 bottomline = bottomline + '\n'
317 for i in alphabet:
318 outline = i
319 for j in alphabet[:alphabet.index(i)+1]:
320 try:
321 val = self[j,i]
322 except KeyError:
323 val = self[i,j]
324 val *= factor
325 if val == -999:
326 cur_str = ' ND'
327 else:
328 cur_str = format % val
329
330 outline = outline+cur_str
331 outline = outline+'\n'
332 f.write(outline)
333 f.write(bottomline)
335 """ returns a number which is the subtraction product of the two matrices"""
336 mat_diff = 0
337 for i in self.keys():
338 mat_diff += (self[i] - other[i])
339 return mat_diff
341 """ returns a matrix for which each entry is the multiplication product of the
342 two matrices passed"""
343 new_mat = copy.copy(self)
344 for i in self.keys():
345 new_mat[i] *= other[i]
346 return new_mat
348 new_mat = copy.copy(self)
349 for i in self.keys():
350 new_mat[i] += other[i]
351 return new_mat
352
354 """
355 build_obs_freq_mat(acc_rep_mat):
356 Build the observed frequency matrix, from an accepted replacements matrix
357 The accRep matrix should be generated by the user.
358 """
359
360 sum = 0.
361 for i in acc_rep_mat.values():
362 sum += i
363 obs_freq_mat = SeqMat(alphabet=acc_rep_mat.alphabet,build_later=1)
364 for i in acc_rep_mat.keys():
365 obs_freq_mat[i] = acc_rep_mat[i]/sum
366 obs_freq_mat.mat_type = OBSFREQ
367 return obs_freq_mat
368
370 exp_freq_table = {}
371 for i in obs_freq_mat.alphabet.letters:
372 exp_freq_table[i] = 0.
373 for i in obs_freq_mat.keys():
374 if i[0] == i[1]:
375 exp_freq_table[i[0]] += obs_freq_mat[i]
376 else:
377 exp_freq_table[i[0]] += obs_freq_mat[i] / 2.
378 exp_freq_table[i[1]] += obs_freq_mat[i] / 2.
379 return FreqTable.FreqTable(exp_freq_table,FreqTable.FREQ)
380
382 """Build an expected frequency matrix
383 exp_freq_table: should be a FreqTable instance
384 """
385 exp_freq_mat = SeqMat(alphabet=exp_freq_table.alphabet,build_later=1)
386 for i in exp_freq_mat.keys():
387 if i[0] == i[1]:
388 exp_freq_mat[i] = exp_freq_table[i[0]]**2
389 else:
390 exp_freq_mat[i] = 2.0*exp_freq_table[i[0]]*exp_freq_table[i[1]]
391 exp_freq_mat.mat_type = EXPFREQ
392 return exp_freq_mat
393
394
395
397 """ Build the substitution matrix """
398 if obs_freq_mat.ab_list <> exp_freq_mat.ab_list:
399 raise ValueError, "Alphabet mismatch in passed matrices"
400 subs_mat = SeqMat(obs_freq_mat)
401 for i in obs_freq_mat.keys():
402 subs_mat[i] = obs_freq_mat[i]/exp_freq_mat[i]
403
404 subs_mat.mat_type = SUBS
405 return subs_mat
406
407
408
409
411 """_build_log_odds_mat(subs_mat,logbase=10,factor=10.0,round_digit=1):
412 Build a log-odds matrix
413 logbase=2: base of logarithm used to build (default 2)
414 factor=10.: a factor by which each matrix entry is multiplied
415 round_digit: roundoff place after decimal point
416 keep_nd: if true, keeps the -999 value for non-determined values (for which there
417 are no substitutions in the frequency substitutions matrix). If false, plants the
418 minimum log-odds value of the matrix in entries containing -999
419 """
420 lo_mat = SeqMat(subs_mat)
421 for i in subs_mat.keys():
422 if subs_mat[i] < EPSILON:
423 lo_mat[i] = -999
424 else:
425 lo_mat[i] = round(factor*log(subs_mat[i])/log(logbase),round_digit)
426 lo_mat.mat_type = LO
427 mat_min = min(lo_mat.values())
428 if not keep_nd:
429 for i in lo_mat.keys():
430 if lo_mat[i] <= -999:
431 lo_mat[i] = mat_min
432 return lo_mat
433
434
435
436
437
438
439
440 -def make_log_odds_matrix(acc_rep_mat,exp_freq_table=None,logbase=2,
441 factor=1.,round_digit=9,keep_nd=0):
455 -def read_text_matrix(data_file,mat_type=NOTYPE):
456 matrix = {}
457 tmp = string.split(data_file.read(),"\n")
458 table=[]
459 for i in tmp:
460 table.append(string.split(i))
461
462 for rec in table[:]:
463 if (rec.count('#') > 0):
464 table.remove(rec)
465
466
467 while (table.count([]) > 0):
468 table.remove([])
469
470 alphabet = table[0]
471 j = 0
472 for rec in table[1:]:
473
474 row = alphabet[j]
475
476 if re.compile('[A-z\*]').match(rec[0]):
477 first_col = 1
478 else:
479 first_col = 0
480 i = 0
481 for field in rec[first_col:]:
482 col = alphabet[i]
483 matrix[(row,col)] = string.atof(field)
484 i += 1
485 j += 1
486
487 for i in matrix.keys():
488 if '*' in i: del(matrix[i])
489 ret_mat = SeqMat(matrix,mat_type=mat_type)
490 return ret_mat
491
492 diagNO = 1
493 diagONLY = 2
494 diagALL = 3
495
497 rel_ent = 0.
498 key_list_1 = mat_1.keys(); key_list_2 = mat_2.keys()
499 key_list_1.sort(); key_list_2.sort()
500 key_list = []
501 sum_ent_1 = 0.; sum_ent_2 = 0.
502 for i in key_list_1:
503 if i in key_list_2:
504 key_list.append(i)
505 if len(key_list_1) <> len(key_list_2):
506
507 sys.stderr.write("Warning:first matrix has more entries than the second\n")
508 if key_list_1 <> key_list_2:
509 sys.stderr.write("Warning: indices not the same between matrices\n")
510 for key in key_list:
511 if diag == diagNO and key[0] == key[1]:
512 continue
513 if diag == diagONLY and key[0] <> key[1]:
514 continue
515 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
516 sum_ent_1 += mat_1[key]
517 sum_ent_2 += mat_2[key]
518
519 for key in key_list:
520 if diag == diagNO and key[0] == key[1]:
521 continue
522 if diag == diagONLY and key[0] <> key[1]:
523 continue
524 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
525 val_1 = mat_1[key] / sum_ent_1
526 val_2 = mat_2[key] / sum_ent_2
527
528 rel_ent += val_1 * log(val_1/val_2)/log(logbase)
529 return rel_ent
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
548 assert mat_1.ab_list == mat_2.ab_list
549 assert pi_1 > 0 and pi_2 > 0 and pi_1< 1 and pi_2 <1
550 assert not (pi_1 + pi_2 - 1.0 > EPSILON)
551 sum_mat = SeqMat(build_later=1)
552 sum_mat.ab_list = mat_1.ab_list
553 for i in mat_1.keys():
554 sum_mat[i] = pi_1 * mat_1[i] + pi_2 * mat_2[i]
555 sum_mat.make_entropy()
556 mat_1.make_entropy()
557 mat_2.make_entropy()
558
559 dJS = sum_mat.entropy - pi_1 * mat_1.entropy - pi_2 *mat_2.entropy
560 return dJS
561
562 """
563 This isn't working yet. Boo hoo!
564 def two_mat_print(mat_1, mat_2, f=None,alphabet=None,factor_1=1, factor_2=1,
565 format="%4d",bottomformat="%4s",topformat="%4s",
566 topindent=7*" ", bottomindent=1*" "):
567 f = f or sys.stdout
568 if not alphabet:
569 assert mat_1.ab_list == mat_2.ab_list
570 alphabet = mat_1.ab_list
571 len_alphabet = len(alphabet)
572 print_mat = {}
573 topline = topindent
574 bottomline = bottomindent
575 for i in alphabet:
576 bottomline += bottomformat % i
577 topline += topformat % alphabet[len_alphabet-alphabet.index(i)-1]
578 topline += '\n'
579 bottomline += '\n'
580 f.write(topline)
581 for i in alphabet:
582 for j in alphabet:
583 print_mat[i,j] = -999
584 diag_1 = {}; diag_2 = {}
585 for i in alphabet:
586 for j in alphabet[:alphabet.index(i)+1]:
587 if i == j:
588 diag_1[i] = mat_1[(i,i)]
589 diag_2[i] = mat_2[(alphabet[len_alphabet-alphabet.index(i)-1],
590 alphabet[len_alphabet-alphabet.index(i)-1])]
591 else:
592 if i > j:
593 key = (j,i)
594 else:
595 key = (i,j)
596 mat_2_key = [alphabet[len_alphabet-alphabet.index(key[0])-1],
597 alphabet[len_alphabet-alphabet.index(key[1])-1]]
598 # print mat_2_key
599 mat_2_key.sort(); mat_2_key = tuple(mat_2_key)
600 # print key ,"||", mat_2_key
601 print_mat[key] = mat_2[mat_2_key]
602 print_mat[(key[1],key[0])] = mat_1[key]
603 for i in alphabet:
604 outline = i
605 for j in alphabet:
606 if i == j:
607 if diag_1[i] == -999:
608 val_1 = ' ND'
609 else:
610 val_1 = format % (diag_1[i]*factor_1)
611 if diag_2[i] == -999:
612 val_2 = ' ND'
613 else:
614 val_2 = format % (diag_2[i]*factor_2)
615 cur_str = val_1 + " " + val_2
616 else:
617 if print_mat[(i,j)] == -999:
618 val = ' ND'
619 elif alphabet.index(i) > alphabet.index(j):
620 val = format % (print_mat[(i,j)]*factor_1)
621 else:
622 val = format % (print_mat[(i,j)]*factor_2)
623 cur_str = val
624 outline += cur_str
625 outline += bottomformat % (alphabet[len_alphabet-alphabet.index(i)-1] +
626 '\n')
627 f.write(outline)
628 f.write(bottomline)
629 """
630