1 """
2 This is an implementation of a state-emitting MarkovModel. I am using
3 terminology similar to Manning and Schutze.
4
5
6
7 Functions:
8 train_bw Train a markov model using the Baum-Welch algorithm.
9 train_visible Train a visible markov model using MLE.
10 find_states Find the a state sequence that explains some observations.
11
12 load Load a MarkovModel.
13 save Save a MarkovModel.
14
15 Classes:
16 MarkovModel Holds the description of a markov model
17 """
import math

from Numeric import *
import RandomArray

import StringIO    # used by MarkovModel.__str__

from Bio import listfns


RandomArray.seed()   # seed the RNG used for the random restarts in training

VERY_SMALL_NUMBER = 1E-300
LOG0 = log(VERY_SMALL_NUMBER)   # stand-in for log(0) in log space

MATCODE = Float64   # typecode used for all the matrices

35
class MarkovModel:
    def __init__(self, states, alphabet,
                 p_initial=None, p_transition=None, p_emission=None):
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission
    def __str__(self):
        # Write the model out in the save() format.
        handle = StringIO.StringIO()
        save(self, handle)
        handle.seek(0)
        return handle.read()

def _readline_and_check_start(handle, start):
    # Read a line and make sure it begins with start.
    line = handle.readline()
    if not line.startswith(start):
        raise ValueError, "I expected %r but got %r" % (start, line)
    return line

57 """load(handle) -> MarkovModel()"""
58
59 line = _readline_and_check_start(handle, "STATES:")
60 states = line.split()[1:]
61
62
63 line = _readline_and_check_start(handle, "ALPHABET:")
64 alphabet = line.split()[1:]
65
66 mm = MarkovModel(states, alphabet)
67 N, M = len(states), len(alphabet)
68
69
70 mm.p_initial = zeros(N, MATCODE)
71 line = _readline_and_check_start(handle, "INITIAL:")
72 for i in range(len(states)):
73 line = _readline_and_check_start(handle, " %s:" % states[i])
74 mm.p_initial[i] = float(line.split()[-1])
75
76
77 mm.p_transition = zeros((N, N), MATCODE)
78 line = _readline_and_check_start(handle, "TRANSITION:")
79 for i in range(len(states)):
80 line = _readline_and_check_start(handle, " %s:" % states[i])
81 mm.p_transition[i,:] = map(float, line.split()[1:])
82
83
84 mm.p_emission = zeros((N, M), MATCODE)
85 line = _readline_and_check_start(handle, "EMISSION:")
86 for i in range(len(states)):
87 line = _readline_and_check_start(handle, " %s:" % states[i])
88 mm.p_emission[i,:] = map(float, line.split()[1:])
89
90 return mm
91
def save(mm, handle):
    """save(mm, handle)"""
    # Save the model in the same format that load() reads.
    w = handle.write
    w("STATES: %s\n" % ' '.join(mm.states))
    w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
    w("INITIAL:\n")
    for i in range(len(mm.p_initial)):
        w(" %s: %g\n" % (mm.states[i], mm.p_initial[i]))
    w("TRANSITION:\n")
    for i in range(len(mm.p_transition)):
        x = map(str, mm.p_transition[i])
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
    w("EMISSION:\n")
    for i in range(len(mm.p_emission)):
        x = map(str, mm.p_emission[i])
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))


def train_bw(states, alphabet, training_data,
             pseudo_initial=None, pseudo_transition=None, pseudo_emission=None,
             update_fn=None,
             ):
    """train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a
    list of strings that describe the names of each state.  alphabet
    is a list of objects that indicate the allowed outputs.
    training_data is a list of observations.  Each observation is a
    list of objects from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    the different matrices.  They should be matrices of the
    appropriate size that contain numbers to add to each parameter
    matrix, before normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError, "No training data given."
    pseudo_initial, pseudo_emission, pseudo_transition = map(
        _safe_asarray, (pseudo_initial, pseudo_emission, pseudo_transition))
    if pseudo_initial is not None and shape(pseudo_initial) != (N,):
        raise ValueError, "pseudo_initial not shape len(states)"
    if pseudo_transition is not None and shape(pseudo_transition) != (N,N):
        raise ValueError, "pseudo_transition not shape " + \
              "len(states) X len(states)"
    if pseudo_emission is not None and shape(pseudo_emission) != (N,M):
        raise ValueError, "pseudo_emission not shape " + \
              "len(states) X len(alphabet)"

    # Convert the training data to indexes into the alphabet.
    training_outputs = []
    indexes = listfns.itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])

    # Baum-Welch cannot handle zero-length observations.
    lengths = map(len, training_outputs)
    if min(lengths) == 0:
        raise ValueError, "I got training data with outputs of length 0"

    # Do the training with Baum-Welch.
    x = _baum_welch(N, M, training_outputs,
                    pseudo_initial=pseudo_initial,
                    pseudo_transition=pseudo_transition,
                    pseudo_emission=pseudo_emission,
                    update_fn=update_fn)
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
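
# A sketch of the update_fn callback accepted by train_bw (the
# function name here is made up for illustration):
#
#     def show_progress(iteration, log_likelihood):
#         print "iteration %d: log likelihood %f" % (iteration, log_likelihood)
#
#     mm = train_bw(states, alphabet, observations, update_fn=show_progress)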

MAX_ITERATIONS = 1000
def _baum_welch(N, M, training_outputs,
                p_initial=None, p_transition=None, p_emission=None,
                pseudo_initial=None, pseudo_transition=None,
                pseudo_emission=None, update_fn=None):
    # Start with random parameters, unless the client supplied
    # starting matrices.  Check for None explicitly, since truth
    # testing a Numeric array is unreliable.
    p_initial = _safe_copy_and_check(p_initial, (N,))
    if p_initial is None:
        p_initial = _random_norm(N)
    p_transition = _safe_copy_and_check(p_transition, (N,N))
    if p_transition is None:
        p_transition = _random_norm((N,N))
    p_emission = _safe_copy_and_check(p_emission, (N,M))
    if p_emission is None:
        p_emission = _random_norm((N,M))

    # Do all the calculations in log space to avoid underflows.
    lp_initial, lp_transition, lp_emission = map(
        log, (p_initial, p_transition, p_emission))
    lpseudo_initial, lpseudo_transition, lpseudo_emission = map(
        _safe_log, (pseudo_initial, pseudo_transition, pseudo_emission))

    # Iterate through the training data until the log likelihood of
    # the outputs converges, or MAX_ITERATIONS is reached.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            x = _baum_welch_one(
                N, M, outputs,
                lp_initial, lp_transition, lp_emission,
                lpseudo_initial, lpseudo_transition, lpseudo_emission,)
            llik += x
        if update_fn is not None:
            update_fn(i, llik)
        if prev_llik is not None and fabs(prev_llik-llik) < 0.1:
            break
        prev_llik = llik
    else:
        raise RuntimeError, \
              "HMM did not converge in %d iterations" % MAX_ITERATIONS

    # Return everything back in normal space.
    return map(exp, (lp_initial, lp_transition, lp_emission))
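
# Note that the convergence test above compares total log likelihoods
# across whole Baum-Welch passes: training stops once a full pass over
# the training data improves the summed log likelihood by less than 0.1.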


def _baum_welch_one(N, M, outputs,
                    lp_initial, lp_transition, lp_emission,
                    lpseudo_initial, lpseudo_transition, lpseudo_emission):
    # Do one iteration of Baum-Welch based on a sequence of outputs.
    # The lp_initial, lp_transition, and lp_emission matrices are
    # updated in place.  Returns the log likelihood of the outputs.
    T = len(outputs)
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)

    # Calculate the probability of traversing each arc for any given
    # transition.
    lp_arc = zeros((N, N, T), MATCODE)
    for t in range(T):
        k = outputs[t]
        lp_traverse = zeros((N, N), MATCODE)
        for i in range(N):
            for j in range(N):
                # P(getting to this arc) *
                # P(making this transition) *
                # P(emitting this character) *
                # P(going to the end)
                lp = fmat[i][t] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k] + \
                     bmat[j][t+1]
                lp_traverse[i][j] = lp
        # Normalize the probabilities for this time step.
        lp_arc[:,:,t] = lp_traverse - _logsum(lp_traverse)

    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = zeros((N, T), MATCODE)
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i,:,t])

    # Sum of all the transitions out of state i.
    lp_arcout = zeros(N, MATCODE)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i,:])

    # Calculate the initial probabilities.  Use slice assignment so
    # that the caller sees the updates.
    lp_initial[:] = lp_arcout_t[:,0]
    if lpseudo_initial is not None:
        lp_initial[:] = _logvecadd(lp_initial, lpseudo_initial)
        lp_initial[:] = lp_initial - _logsum(lp_initial)

    # Calculate the transition probabilities, one row at a time.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i,j,:]) - lp_arcout[i]
        if lpseudo_transition is not None:
            # Add the matching row of the pseudo-counts, then
            # renormalize the row.
            lp_transition[i] = _logvecadd(lp_transition[i],
                                          lpseudo_transition[i])
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])

    # Calculate the emission probabilities.  For each state, sum the
    # probabilities over every character emitted, then normalize.
    for i in range(N):
        ksum = zeros(M, MATCODE)+LOG0    # ksum[k] = log P(emitting k)
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = _logadd(ksum[k], lp_arc[i,j,t])
        ksum = ksum - _logsum(ksum)      # normalize
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)
        lp_emission[i,:] = ksum

    # Calculate the log likelihood of the output from the forward
    # matrix.  Since the parameters have just been updated, this is a
    # step behind, but it is good enough for detecting convergence.
    return _logsum(fmat[:,T])

def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    # Implement the forward algorithm and return a (N, T+1) matrix.
    # The t'th column holds the log probability of each state after
    # the first t outputs have been seen.
    matrix = zeros((N, T+1), MATCODE)

    # Initialize the first column to the initial probabilities.
    matrix[:,0] = lp_initial
    for t in range(1, T+1):
        k = outputs[t-1]
        for j in range(N):
            # The probability of the state is the sum of the
            # transitions from every state at time t-1.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t-1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[j][t] = lprob
    return matrix
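
# For reference, the recurrence computed by _forward, written in
# probability space (a sketch; note that this state-emitting model
# charges the emission of the t'th output k to the *source* state i):
#
#     f[j, 0] = p_initial[j]
#     f[j, t] = sum over i of f[i, t-1] * p_transition[i, j] * p_emission[i, k]
#
# The code stores logs and replaces the sum with _logadd to avoid
# underflow on long output sequences.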

def _backward(N, T, lp_transition, lp_emission, outputs):
    # The backward matrix is the mirror image of the forward matrix.
    # It is initialized to zeros, which in log space is log(1), the
    # correct base case for column T.
    matrix = zeros((N, T+1), MATCODE)
    for t in range(T-1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # The probability of the state is the sum of the
            # transitions into every state at time t+1.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t+1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[i][t] = lprob
    return matrix
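
# The matching backward recurrence, again sketched in probability
# space (emission charged to the source state, as in _forward):
#
#     b[i, T] = 1
#     b[i, t] = sum over j of p_transition[i, j] * p_emission[i, k] * b[j, t+1]
#
# where k is the output at position t.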

def train_visible(states, alphabet, training_data,
                  pseudo_initial=None, pseudo_transition=None,
                  pseudo_emission=None):
    """train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed_states) tuples, where outputs is a list of
    emissions from the alphabet and observed_states is a list of
    states from states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    the different matrices.  They should be matrices of the
    appropriate size that contain numbers to add to each parameter
    matrix, before normalization.

    """
    N, M = len(states), len(alphabet)
    pseudo_initial, pseudo_emission, pseudo_transition = map(
        _safe_asarray, (pseudo_initial, pseudo_emission, pseudo_transition))
    if pseudo_initial is not None and shape(pseudo_initial) != (N,):
        raise ValueError, "pseudo_initial not shape len(states)"
    if pseudo_transition is not None and shape(pseudo_transition) != (N,N):
        raise ValueError, "pseudo_transition not shape " + \
              "len(states) X len(states)"
    if pseudo_emission is not None and shape(pseudo_emission) != (N,M):
        raise ValueError, "pseudo_emission not shape " + \
              "len(states) X len(alphabet)"

    # Convert the training data to indexes into the states and the
    # alphabet.
    training_states, training_outputs = [], []
    states_indexes = listfns.itemindex(states)
    outputs_indexes = listfns.itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError, "states and outputs not aligned"
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])

    x = _mle(N, M, training_outputs, training_states,
             pseudo_initial, pseudo_transition, pseudo_emission)
    p_initial, p_transition, p_emission = x

    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
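
# A sketch of training a visible model (all names below are made up;
# each training example pairs the outputs with the states that
# produced them):
#
#     states = ["rainy", "sunny"]
#     alphabet = ["walk", "shop"]
#     training_data = [
#         (["walk", "shop", "walk"], ["sunny", "rainy", "sunny"]),
#         (["shop", "shop"], ["rainy", "rainy"]),
#     ]
#     mm = train_visible(states, alphabet, training_data)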

def _mle(N, M, training_outputs, training_states, pseudo_initial,
         pseudo_transition, pseudo_emission):
    # p_initial is the probability that a sequence starts in each
    # state.  Count the start states, then normalize.
    p_initial = zeros(N, MATCODE)
    if pseudo_initial is not None:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition is the probability that one state leads to the
    # next.  Count the transitions, then normalize each row.
    p_transition = zeros((N,N), MATCODE)
    if pseudo_transition is not None:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states)-1):
            i, j = states[n], states[n+1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:])

    # p_emission is the probability of each output given a state.
    # Start from add-one (Laplace) counts so that no emission ends up
    # with probability 0, add any pseudo-counts, count the observed
    # emissions, then normalize each row.
    p_emission = ones((N,M), MATCODE)
    if pseudo_emission is not None:
        p_emission = p_emission + pseudo_emission
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:])

    return p_initial, p_transition, p_emission

def _argmaxes(vector):
    # Return a list of the indexes with the maximum value in the
    # vector.  For now, just the single index found by argmax.
    return [argmax(vector)]

424 """find_states(markov_model, output) -> list of (states, score)"""
425 mm = markov_model
426 N = len(mm.states)
427
428
429
430 x = mm.p_initial + VERY_SMALL_NUMBER
431 y = mm.p_transition + VERY_SMALL_NUMBER
432 z = mm.p_emission + VERY_SMALL_NUMBER
433 lp_initial, lp_transition, lp_emission = map(log, (x, y, z))
434
435 indexes = listfns.itemindex(mm.alphabet)
436 output = [indexes[x] for x in output]
437
438
439 results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)
440
441 for i in range(len(results)):
442 states, score = results[i]
443 results[i] = [mm.states[x] for x in states], exp(score)
444 return results
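
# Continuing the made-up names from the train_visible sketch above,
# decoding an observation returns (state path, probability) pairs:
#
#     >>> find_states(mm, ["walk", "shop"])
#     [(['sunny', 'rainy'], 0.028)]        # illustrative values only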

def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    # The Viterbi algorithm.  Returns a list of (state path, score)
    # tuples, where each path is a list of indexes into the states.
    T = len(output)

    # backtrace[j][t] holds the indexes of the best previous states.
    backtrace = []
    for i in range(N):
        backtrace.append([None] * T)

    # Store the best score for each state at each time step.
    scores = zeros((N, T), MATCODE)
    scores[:,0] = lp_initial + lp_emission[:,output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:,t-1] + \
                       lp_transition[:,j] + \
                       lp_emission[j,k]
            indexes = _argmaxes(i_scores)
            scores[j,t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes

    # Do the backtrace.  Start from the highest-scoring states at the
    # last time step and walk the backtrace pointers back to the
    # beginning, branching whenever there are ties.
    in_process = []    # list of (t, states, score) still being traced
    results = []       # list of (states, score) to return
    indexes = _argmaxes(scores[:,T-1])
    for i in indexes:
        in_process.append((T-1, [i], scores[i][T-1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            results.append((states, score))
        else:
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t-1, [i]+states, score))
    return results
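
# _viterbi is written to recover every optimal path when there are
# ties (the backtrace keeps a list of indexes at each step), but since
# _argmaxes currently returns only the single best index, in practice
# the result list holds one path, e.g. [([1, 0], -4.6)] (made-up
# numbers).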

def _normalize(matrix):
    # Make sure the probabilities add up to 1.0.  For a 2-dimensional
    # matrix, normalize each row.
    if len(shape(matrix)) == 1:
        matrix = matrix / float(sum(matrix))
    elif len(shape(matrix)) == 2:
        for i in range(len(matrix)):
            matrix[i,:] = matrix[i,:] / sum(matrix[i,:])
    else:
        raise ValueError, "I cannot handle matrices of that shape"
    return matrix

def _random_norm(shape):
    # Return a normalized matrix of the given shape, filled with
    # random numbers.
    matrix = RandomArray.random(shape)
    return _normalize(matrix)

def _copy_and_check(matrix, desired_shape):
    # Copy the matrix.
    matrix = array(matrix, MATCODE, copy=1)
    # Check the dimensions.
    if shape(matrix) != desired_shape:
        raise ValueError, "Incorrect dimension"
    # Make sure it's normalized.
    if len(shape(matrix)) == 1:
        if fabs(sum(matrix)-1.0) > 0.01:
            raise ValueError, "matrix not normalized to 1.0"
    elif len(shape(matrix)) == 2:
        for i in range(len(matrix)):
            if fabs(sum(matrix[i])-1.0) > 0.01:
                raise ValueError, "matrix %d not normalized to 1.0" % i
    else:
        raise ValueError, "I don't handle matrices > 2 dimensions"
    return matrix

def _safe_copy_and_check(matrix, desired_shape):
    # Like _copy_and_check, but passes None through untouched.
    if matrix is None:
        return None
    return _copy_and_check(matrix, desired_shape)

def _safe_log(n):
    # log(n), but passes None through untouched.
    if n is None:
        return None
    return log(n)

def _safe_asarray(a, typecode=MATCODE):
    # asarray(a), but passes None through untouched.
    if a is None:
        return None
    return asarray(a, typecode=typecode)

def _logadd(logx, logy):
    # Return log(x+y), given log(x) and log(y).  If one term is many
    # orders of magnitude bigger than the other, just return it.
    if logy - logx > 100:
        return logy
    elif logx - logy > 100:
        return logx
    minxy = min(logx, logy)
    return minxy + log(exp(logx-minxy) + exp(logy-minxy))
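
# A quick worked example of _logadd: adding two probabilities in log
# space should match the log of their sum.
#
#     >>> _logadd(log(0.001), log(0.002))    # expect log(0.003)
#     -5.8091...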

def _logsum(matrix):
    # Return the log of the sum of a matrix of log probabilities,
    # i.e. log(sum(exp(matrix))), accumulated with _logadd.
    if len(shape(matrix)) > 1:
        vec = ravel(matrix)
    else:
        vec = matrix
    s = LOG0
    for num in vec:
        s = _logadd(s, num)
    return s

def _logvecadd(logvec1, logvec2):
    # Elementwise _logadd of two vectors of log probabilities.
    assert len(logvec1) == len(logvec2), "vectors aren't the same length"
    sumvec = zeros(len(logvec1), MATCODE)
    for i in range(len(logvec1)):
        sumvec[i] = _logadd(logvec1[i], logvec2[i])
    return sumvec


# Try to load C implementations of the functions above.  If the
# cMarkovModel extension is not available, silently fall back on the
# pure Python implementations.
try:
    import cMarkovModel
except ImportError, x:
    pass
else:
    import sys
    this_module = sys.modules[__name__]
    for name in cMarkovModel.__dict__.keys():
        if not name.startswith("__"):
            this_module.__dict__[name] = cMarkovModel.__dict__[name]