1
2
3
4
5
6 """Provide trainers which estimate parameters based on training sequences.
7
8 These should be used to 'train' a Markov Model prior to actually using
9 it to decode state paths. When supplied training sequences and a model
10 to work from, these classes will estimate parameters of the model.
11
12 This aims to estimate two parameters:
13
14 - a_{kl} -- the number of times there is a transition from k to l in the
15 training data.
16
17 - e_{k}(b) -- the number of emissions of the state b from the letter k
18 in the training data.
19 """
20
21 import math
22
23
24 from .DynamicProgramming import ScaledDPAlgorithms
25
26 __docformat__ = "restructuredtext en"
27
28
30 """Hold a training sequence with emissions and optionally, a state path.
31 """
32 - def __init__(self, emissions, state_path):
33 """Initialize a training sequence.
34
35 Arguments:
36
37 o emissions - A Seq object containing the sequence of emissions in
38 the training sequence, and the alphabet of the sequence.
39
40 o state_path - A Seq object containing the sequence of states and
41 the alphabet of the states. If there is no known state path, then
42 the sequence of states should be an empty string.
43 """
44 if len(state_path) > 0:
45 assert len(emissions) == len(state_path), \
46 "State path does not match associated emissions."
47 self.emissions = emissions
48 self.states = state_path
49
50
52 """Provide generic functionality needed in all trainers.
53 """
55 self._markov_model = markov_model
56
58 """Calculate the log likelihood of the training seqs.
59
60 Arguments:
61
62 o probabilities -- A list of the probabilities of each training
63 sequence under the current parameters, calculated using the forward
64 algorithm.
65 """
66 total_likelihood = 0
67 for probability in probabilities:
68 total_likelihood += math.log(probability)
69
70 return total_likelihood
71
73 """Get a maximum likelihood estimation of transition and emmission.
74
75 Arguments:
76
77 o transition_counts -- A dictionary with the total number of counts
78 of transitions between two states.
79
80 o emissions_counts -- A dictionary with the total number of counts
81 of emmissions of a particular emission letter by a state letter.
82
83 This then returns the maximum likelihood estimators for the
84 transitions and emissions, estimated by formulas 3.18 in
85 Durbin et al:
86
87 a_{kl} = A_{kl} / sum(A_{kl'})
88 e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))
89
90 Returns:
91 Transition and emission dictionaries containing the maximum
92 likelihood estimators.
93 """
94
95 ml_transitions = self.ml_estimator(transition_counts)
96 ml_emissions = self.ml_estimator(emission_counts)
97
98 return ml_transitions, ml_emissions
99
101 """Calculate the maximum likelihood estimator.
102
103 This can calculate maximum likelihoods for both transitions
104 and emissions.
105
106 Arguments:
107
108 o counts -- A dictionary of the counts for each item.
109
110 See estimate_params for a description of the formula used for
111 calculation.
112 """
113
114 all_ordered = sorted(counts)
115
116 ml_estimation = {}
117
118
119 cur_letter = None
120 cur_letter_counts = 0
121
122 for cur_item in all_ordered:
123
124 if cur_item[0] != cur_letter:
125
126 cur_letter = cur_item[0]
127
128
129 cur_letter_counts = counts[cur_item]
130
131
132 cur_position = all_ordered.index(cur_item) + 1
133
134
135
136 while (cur_position < len(all_ordered) and
137 all_ordered[cur_position][0] == cur_item[0]):
138 cur_letter_counts += counts[all_ordered[cur_position]]
139 cur_position += 1
140
141 else:
142 pass
143
144
145 cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
146 ml_estimation[cur_item] = cur_ml
147
148 return ml_estimation
149
150
152 """Trainer that uses the Baum-Welch algorithm to estimate parameters.
153
154 These should be used when a training sequence for an HMM has unknown
155 paths for the actual states, and you need to make an estimation of the
156 model parameters from the observed emissions.
157
158 This uses the Baum-Welch algorithm, first described in
159 Baum, L.E. 1972. Inequalities. 3:1-8
160 This is based on the description in 'Biological Sequence Analysis' by
161 Durbin et al. in section 3.3
162
163 This algorithm is guaranteed to converge to a local maximum, but not
164 necessarily to the global maxima, so use with care!
165 """
167 """Initialize the trainer.
168
169 Arguments:
170
171 o markov_model - The model we are going to estimate parameters for.
172 This should have the parameters with some initial estimates, that
173 we can build from.
174 """
175 AbstractTrainer.__init__(self, markov_model)
176
179 """Estimate the parameters using training sequences.
180
181 The algorithm for this is taken from Durbin et al. p64, so this
182 is a good place to go for a reference on what is going on.
183
184 Arguments:
185
186 o training_seqs -- A list of TrainingSequence objects to be used
187 for estimating the parameters.
188
189 o stopping_criteria -- A function, that when passed the change
190 in log likelihood and threshold, will indicate if we should stop
191 the estimation iterations.
192
193 o dp_method -- A class instance specifying the dynamic programming
194 implementation we should use to calculate the forward and
195 backward variables. By default, we use the scaling method.
196 """
197 prev_log_likelihood = None
198 num_iterations = 1
199
200 while True:
201 transition_count = self._markov_model.get_blank_transitions()
202 emission_count = self._markov_model.get_blank_emissions()
203
204
205 all_probabilities = []
206
207 for training_seq in training_seqs:
208
209 DP = dp_method(self._markov_model, training_seq)
210 forward_var, seq_prob = DP.forward_algorithm()
211 backward_var = DP.backward_algorithm()
212
213 all_probabilities.append(seq_prob)
214
215
216 transition_count = self.update_transitions(transition_count,
217 training_seq,
218 forward_var,
219 backward_var,
220 seq_prob)
221 emission_count = self.update_emissions(emission_count,
222 training_seq,
223 forward_var,
224 backward_var,
225 seq_prob)
226
227
228 ml_transitions, ml_emissions = \
229 self.estimate_params(transition_count, emission_count)
230 self._markov_model.transition_prob = ml_transitions
231 self._markov_model.emission_prob = ml_emissions
232
233 cur_log_likelihood = self.log_likelihood(all_probabilities)
234
235
236
237 if prev_log_likelihood is not None:
238
239
240
241 log_likelihood_change = abs(abs(cur_log_likelihood) -
242 abs(prev_log_likelihood))
243
244
245
246 if stopping_criteria(log_likelihood_change, num_iterations):
247 break
248
249
250 prev_log_likelihood = cur_log_likelihood
251 num_iterations += 1
252
253 return self._markov_model
254
255 - def update_transitions(self, transition_counts, training_seq,
256 forward_vars, backward_vars, training_seq_prob):
257 """Add the contribution of a new training sequence to the transitions.
258
259 Arguments:
260
261 o transition_counts -- A dictionary of the current counts for the
262 transitions
263
264 o training_seq -- The training sequence we are working with
265
266 o forward_vars -- Probabilities calculated using the forward
267 algorithm.
268
269 o backward_vars -- Probabilities calculated using the backwards
270 algorithm.
271
272 o training_seq_prob - The probability of the current sequence.
273
274 This calculates A_{kl} (the estimated transition counts from state
275 k to state l) using formula 3.20 in Durbin et al.
276 """
277
278 transitions = self._markov_model.transition_prob
279 emissions = self._markov_model.emission_prob
280
281
282 for k in training_seq.states.alphabet.letters:
283 for l in self._markov_model.transitions_from(k):
284 estimated_counts = 0
285
286 for i in range(len(training_seq.emissions) - 1):
287
288 forward_value = forward_vars[(k, i)]
289
290
291 backward_value = backward_vars[(l, i + 1)]
292
293
294 trans_value = transitions[(k, l)]
295
296
297 emm_value = emissions[(l, training_seq.emissions[i + 1])]
298
299 estimated_counts += (forward_value * trans_value *
300 emm_value * backward_value)
301
302
303 transition_counts[(k, l)] += (float(estimated_counts) /
304 training_seq_prob)
305
306 return transition_counts
307
308 - def update_emissions(self, emission_counts, training_seq,
309 forward_vars, backward_vars, training_seq_prob):
310 """Add the contribution of a new training sequence to the emissions
311
312 Arguments:
313
314 o emission_counts -- A dictionary of the current counts for the
315 emissions
316
317 o training_seq -- The training sequence we are working with
318
319 o forward_vars -- Probabilities calculated using the forward
320 algorithm.
321
322 o backward_vars -- Probabilities calculated using the backwards
323 algorithm.
324
325 o training_seq_prob - The probability of the current sequence.
326
327 This calculates E_{k}(b) (the estimated emission probability for
328 emission letter b from state k) using formula 3.21 in Durbin et al.
329 """
330
331 for k in training_seq.states.alphabet.letters:
332
333 for b in training_seq.emissions.alphabet.letters:
334 expected_times = 0
335
336 for i in range(len(training_seq.emissions)):
337
338
339 if training_seq.emissions[i] == b:
340
341 expected_times += (forward_vars[(k, i)] *
342 backward_vars[(k, i)])
343
344
345 emission_counts[(k, b)] += (float(expected_times) /
346 training_seq_prob)
347
348 return emission_counts
349
350
352 """Estimate probabilities with known state sequences.
353
354 This should be used for direct estimation of emission and transition
355 probabilities when both the state path and emission sequence are
356 known for the training examples.
357 """
360
361 - def train(self, training_seqs):
362 """Estimate the Markov Model parameters with known state paths.
363
364 This trainer requires that both the state and the emissions are
365 known for all of the training sequences in the list of
366 TrainingSequence objects.
367 This training will then count all of the transitions and emissions,
368 and use this to estimate the parameters of the model.
369 """
370
371 transition_counts = self._markov_model.get_blank_transitions()
372 emission_counts = self._markov_model.get_blank_emissions()
373
374 for training_seq in training_seqs:
375 emission_counts = self._count_emissions(training_seq,
376 emission_counts)
377 transition_counts = self._count_transitions(training_seq.states,
378 transition_counts)
379
380
381 ml_transitions, ml_emissions = \
382 self.estimate_params(transition_counts,
383 emission_counts)
384 self._markov_model.transition_prob = ml_transitions
385 self._markov_model.emission_prob = ml_emissions
386
387 return self._markov_model
388
390 """Add emissions from the training sequence to the current counts.
391
392 Arguments:
393
394 o training_seq -- A TrainingSequence with states and emissions
395 to get the counts from
396
397 o emission_counts -- The current emission counts to add to.
398 """
399 for index in range(len(training_seq.emissions)):
400 cur_state = training_seq.states[index]
401 cur_emission = training_seq.emissions[index]
402
403 try:
404 emission_counts[(cur_state, cur_emission)] += 1
405 except KeyError:
406 raise KeyError("Unexpected emission (%s, %s)"
407 % (cur_state, cur_emission))
408 return emission_counts
409
411 """Add transitions from the training sequence to the current counts.
412
413 Arguments:
414
415 o state_seq -- A Seq object with the states of the current training
416 sequence.
417
418 o transition_counts -- The current transition counts to add to.
419 """
420 for cur_pos in range(len(state_seq) - 1):
421 cur_state = state_seq[cur_pos]
422 next_state = state_seq[cur_pos + 1]
423
424 try:
425 transition_counts[(cur_state, next_state)] += 1
426 except KeyError:
427 raise KeyError("Unexpected transition (%s, %s)" %
428 (cur_state, next_state))
429
430 return transition_counts
431