1 """Deal with representations of Markov Models.
2 """
3
4 import copy
5 import math
6 import random
7
8
9 from Bio.Seq import MutableSeq
10
12 """Interface to build up a Markov Model.
13
14 This class is designed to try to separate the task of specifying the
15 Markov Model from the actual model itself. This is in hopes of making
16 the actual Markov Model classes smaller.
17
18 So, this builder class should be used to create Markov models instead
19 of trying to initiate a Markov Model directly.
20 """
21
22 DEFAULT_PSEUDO = 1
23
24 - def __init__(self, state_alphabet, emission_alphabet):
25 """Initialize a builder to create Markov Models.
26
27 Arguments:
28
29 o state_alphabet -- An alphabet containing all of the letters that
30 can appear in the states
31
32 o emission_alphabet -- An alphabet containing all of the letters for
33 states that can be emitted by the HMM.
34 """
35 self._state_alphabet = state_alphabet
36 self._emission_alphabet = emission_alphabet
37
38
39
40 self.transition_prob = {}
41 self.emission_prob = self._all_blank(state_alphabet,
42 emission_alphabet)
43
44
45 self.transition_pseudo = {}
46 self.emission_pseudo = self._all_pseudo(state_alphabet,
47 emission_alphabet)
48
49 - def _all_blank(self, first_alphabet, second_alphabet):
50 """Return a dictionary with all counts set to zero.
51
52 This uses the letters in the first and second alphabet to create
53 a dictionary with keys of two tuples organized as
54 (letter of first alphabet, letter of second alphabet). The values
55 are all set to 0.
56 """
57 all_blank = {}
58 for first_state in first_alphabet.letters:
59 for second_state in second_alphabet.letters:
60 all_blank[(first_state, second_state)] = 0
61
62 return all_blank
63
64 - def _all_pseudo(self, first_alphabet, second_alphabet):
65 """Return a dictionary with all counts set to a default value.
66
67 This takes the letters in first alphabet and second alphabet and
68 creates a dictionary with keys of two tuples organized as:
69 (letter of first alphabet, letter of second alphabet). The values
70 are all set to the value of the class attribute DEFAULT_PSEUDO.
71 """
72 all_counts = {}
73 for first_state in first_alphabet.letters:
74 for second_state in second_alphabet.letters:
75 all_counts[(first_state, second_state)] = self.DEFAULT_PSEUDO
76
77 return all_counts
78
80 """Return the markov model corresponding with the current parameters.
81
82 Each markov model returned by a call to this function is unique
83 (ie. they don't influence each other).
84 """
85 transition_prob = copy.deepcopy(self.transition_prob)
86 emission_prob = copy.deepcopy(self.emission_prob)
87 transition_pseudo = copy.deepcopy(self.transition_pseudo)
88 emission_pseudo = copy.deepcopy(self.emission_pseudo)
89
90 return HiddenMarkovModel(transition_prob, emission_prob,
91 transition_pseudo, emission_pseudo)
92
94 """Reset all probabilities to be an average value.
95
96 This resets the values of all allowed transitions and all allowed
97 emissions to be equal to 1 divided by the number of possible elements.
98
99 This is useful if you just want to initialize a Markov Model to
100 starting values (ie. if you have no prior notions of what the
101 probabilities should be -- or if you are just feeling too lazy
102 to calculate them :-).
103
104 Warning 1 -- this will reset all currently set probabilities.
105
106 Warning 2 -- This just sets all probabilities for transitions and
107 emissions to total up to 1, so it doesn't ensure that the sum of
108 each set of transitions adds up to 1.
109 """
110
111 new_trans_prob = float(1) / float(len(self.transition_prob.keys()))
112 for key in self.transition_prob.keys():
113 self.transition_prob[key] = new_trans_prob
114
115
116 new_emission_prob = float(1) / float(len(self.emission_prob.keys()))
117 for key in self.emission_prob.keys():
118 self.emission_prob[key] = new_emission_prob
119
120
122 """Set all probabilities to randomly generated numbers.
123
124 This will reset the value of all allowed transitions and emissions
125 to random values.
126
127 Warning 1 -- This will reset any currently set probabibilities.
128
129 Warning 2 -- This does not check to ensure that the sum of
130 all of the probabilities is less then 1. It just randomly assigns
131 a probability to each
132 """
133 for key in self.transition_prob.keys():
134 self.transition_prob[key] = random.random()
135
136 for key in self.emission_prob.keys():
137 self.emission_prob[key] = random.random()
138
139
140
142 """A convenience function to create transitions between all states.
143
144 By default all transitions within the alphabet are disallowed; this
145 is a way to change this to allow all possible transitions.
146 """
147
148
149 all_probs = self._all_blank(self._state_alphabet,
150 self._state_alphabet)
151
152 all_pseudo = self._all_pseudo(self._state_alphabet,
153 self._state_alphabet)
154
155
156
157 for set_key in self.transition_prob.keys():
158 all_probs[set_key] = self.transition_prob[set_key]
159
160 for set_key in self.transition_pseudo.keys():
161 all_pseudo[set_key] = self.transition_pseudo[set_key]
162
163
164 self.transition_prob = all_probs
165 self.transition_pseudo = all_pseudo
166
167 - def allow_transition(self, from_state, to_state, probability = None,
168 pseudocount = None):
169 """Set a transition as being possible between the two states.
170
171 probability and pseudocount are optional arguments
172 specifying the probabilities and pseudo counts for the transition.
173 If these are not supplied, then the values are set to the
174 default values.
175
176 Raises:
177 KeyError -- if the two states already have an allowed transition.
178 """
179
180 for state in [from_state, to_state]:
181 assert state in self._state_alphabet, \
182 "State %s was not found in the sequence alphabet" % state
183
184
185 if ((from_state, to_state) not in self.transition_prob.keys() and
186 (from_state, to_state) not in self.transition_pseudo.keys()):
187
188 if probability is None:
189 probability = 0
190 self.transition_prob[(from_state, to_state)] = probability
191
192
193 if pseudocount is None:
194 pseudcount = self.DEFAULT_PSEUDO
195 self.transition_pseudo[(from_state, to_state)] = pseudocount
196 else:
197 raise KeyError("Transtion from %s to %s is already allowed."
198 % (from_state, to_state))
199
201 """Restrict transitions between the two states.
202
203 Raises:
204 KeyError if the transition is not currently allowed.
205 """
206 try:
207 del self.transition_prob[(from_state, to_state)]
208 del self.transition_pseudo[(from_state, to_state)]
209 except KeyError:
210 raise KeyError("Transition from %s to %s is already disallowed."
211 % (from_state, to_state))
212
214 """Set the probability of a transition between two states.
215
216 Raises:
217 KeyError if the transition is not allowed.
218 """
219 if self.transition_prob.has_key((from_state, to_state)):
220 self.transition_prob[(from_state, to_state)] = probability
221 else:
222 raise KeyError("Transition from %s to %s is not allowed."
223 % (from_state, to_state))
224
226 """Set the default pseudocount for a transition.
227
228 To avoid computational problems, it is helpful to be able to
229 set a 'default' pseudocount to start with for estimating
230 transition and emission probabilities (see p62 in Durbin et al
231 for more discussion on this. By default, all transitions have
232 a pseudocount of 1.
233
234 Raises:
235 KeyError if the transition is not allowed.
236 """
237 if self.transition_pseudo.has_key((from_state, to_state)):
238 self.transition_pseudo[(from_state, to_state)] = count
239 else:
240 raise KeyError("Transition from %s to %s is not allowed."
241 % (from_state, to_state))
242
243
244
246 """Set the probability of a emission from a particular state.
247
248 Raises:
249 KeyError if the emission from the given state is not allowed.
250 """
251 if self.emission_prob.has_key((seq_state, emission_state)):
252 self.emission_prob[(seq_state, emission_state)] = probability
253 else:
254 raise KeyError("Emission of %s from %s is not allowed."
255 % (emission_state, seq_state))
256
258 """Set the default pseudocount for an emission.
259
260 To avoid computational problems, it is helpful to be able to
261 set a 'default' pseudocount to start with for estimating
262 transition and emission probabilities (see p62 in Durbin et al
263 for more discussion on this. By default, all emissions have
264 a pseudocount of 1.
265
266 Raises:
267 KeyError if the emission from the given state is not allowed.
268 """
269 if self.emission_pseudo.has_key((seq_state, emission_state)):
270 self.emission_pseudo[(seq_state, emission_state)] = count
271 else:
272 raise KeyError("Emission of %s from %s is not allowed."
273 % (emission_state, seq_state))
274
276 """Represent a hidden markov model that can be used for state estimation.
277 """
278 - def __init__(self, transition_prob, emission_prob, transition_pseudo,
279 emission_pseudo):
280 """Initialize a Markov Model.
281
282 Note: You should use the MarkovModelBuilder class instead of
283 initiating this class directly.
284
285 Arguments:
286
287 o transition_prob -- A dictionary of transition probabilities for all
288 possible transitions in the sequence.
289
290 o emission_prob -- A dictionary of emissions probabilities for all
291 possible emissions from the sequence states.
292
293 o transition_pseudo -- Pseudo-counts to be used for the transitions,
294 when counting for purposes of estimating transition probabilities.
295
296 o emission_pseduo -- Pseudo-counts fo tbe used for the emissions,
297 when counting for purposes of estimating emission probabilities.
298 """
299 self._transition_pseudo = transition_pseudo
300 self._emission_pseudo = emission_pseudo
301
302 self.transition_prob = transition_prob
303 self.emission_prob = emission_prob
304
305
306
307
308 self._transitions_from = \
309 self._calculate_from_transitions(self.transition_prob)
310
312 """Calculate which 'from transitions' are allowed for each letter.
313
314 This looks through all of the trans_probs, and uses this dictionary
315 to determine allowed transitions. It converts this information into
316 a dictionary, whose keys are the transition letters and whose
317 values are a list of allowed letters to transition to.
318 """
319 from_transitions = {}
320
321
322 for trans_key in trans_probs.keys():
323
324
325 try:
326 from_transitions[trans_key[0]].append(trans_key[1])
327
328 except KeyError:
329 from_transitions[trans_key[0]] = []
330 from_transitions[trans_key[0]].append(trans_key[1])
331
332 return from_transitions
333
335 """Get the default transitions for the model.
336
337 Returns a dictionary of all of the default transitions between any
338 two letters in the sequence alphabet. The dictionary is structured
339 with keys as (letter1, letter2) and values as the starting number
340 of transitions.
341 """
342 return self._transition_pseudo
343
345 """Get the starting default emmissions for each sequence.
346
347 This returns a dictionary of the default emmissions for each
348 letter. The dictionary is structured with keys as
349 (seq_letter, emmission_letter) and values as the starting number
350 of emmissions.
351 """
352 return self._emission_pseudo
353
355 """Get all transitions which can happen from the given state.
356
357 This returns all letters which the given state_letter is allowed
358 to transition to. An empty list is returned if no letters are possible.
359 """
360 try:
361 return self._transitions_from[state_letter]
362 except KeyError:
363 return []
364
365 - def viterbi(self, sequence, state_alphabet):
366 """Calculate the most probable state path using the Viterbi algorithm.
367
368 This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
369 al for a full explanation -- this is where I took my implementation
370 ideas from), to allow decoding of the state path, given a sequence
371 of emissions.
372
373 Arguments:
374
375 o sequence -- A Seq object with the emission sequence that we
376 want to decode.
377
378 o state_alphabet -- The alphabet of the possible state sequences
379 that can be generated.
380 """
381
382 log_trans = self._log_transform(self.transition_prob)
383 log_emission = self._log_transform(self.emission_prob)
384
385 viterbi_probs = {}
386 pred_state_seq = {}
387 state_letters = state_alphabet.letters
388
389
390
391
392
393
394
395 viterbi_probs[(state_letters[0], -1)] = 1
396
397 for state_letter in state_letters[1:]:
398 viterbi_probs[(state_letter, -1)] = 0
399
400
401
402 for i in range(0, len(sequence)):
403
404 for main_state in state_letters:
405
406 emission_part = log_emission[(main_state, sequence[i])]
407
408
409 possible_state_probs = {}
410 for cur_state in self.transitions_from(main_state):
411
412 trans_part = log_trans[(cur_state, main_state)]
413
414
415 viterbi_part = viterbi_probs[(cur_state, i - 1)]
416 cur_prob = viterbi_part + trans_part
417
418 possible_state_probs[cur_state] = cur_prob
419
420
421 max_prob = max(possible_state_probs.values())
422 viterbi_probs[(main_state, i)] = (emission_part + max_prob)
423
424
425 for state in possible_state_probs.keys():
426 if possible_state_probs[state] == max_prob:
427 pred_state_seq[(i - 1, main_state)] = state
428 break
429
430
431
432
433 all_probs = {}
434 for state in state_letters:
435
436 viterbi_part = viterbi_probs[(state, len(sequence) - 1)]
437
438 transition_part = log_trans[(state, state_letters[0])]
439
440 all_probs[state] = viterbi_part * transition_part
441
442 state_path_prob = max(all_probs.values())
443
444
445 last_state = ''
446 for state in all_probs.keys():
447 if all_probs[state] == state_path_prob:
448 last_state = state
449
450 assert last_state != '', "Didn't find the last state to trace from!"
451
452
453 traceback_seq = MutableSeq('', state_alphabet)
454
455 loop_seq = range(0, len(sequence))
456 loop_seq.reverse()
457
458 cur_state = last_state
459 for i in loop_seq:
460 traceback_seq.append(cur_state)
461
462 cur_state = pred_state_seq[(i - 1, cur_state)]
463
464
465 traceback_seq.reverse()
466
467 return traceback_seq.toseq(), state_path_prob
468
484