1
2
3
4
5
6 """Provide trainers which estimate parameters based on training sequences.
7
8 These should be used to 'train' a Markov Model prior to actually using
9 it to decode state paths. When supplied training sequences and a model
10 to work from, these classes will estimate parameters of the model.
11
12 This aims to estimate two parameters:
13
14 * a_{kl} -- the number of times there is a transition from k to l in the
15 training data.
16
17 * e_{k}(b) -- the number of emissions of the letter b from the state k
18 in the training data.
19 """
20
21 import math
22
23
24 from .DynamicProgramming import ScaledDPAlgorithms
25
26
28 """Hold a training sequence with emissions and optionally, a state path.
29 """
def __init__(self, emissions, state_path):
    """Initialize a training sequence.

    Arguments:

    o emissions - A Seq object containing the sequence of emissions in
    the training sequence, and the alphabet of the sequence.

    o state_path - A Seq object containing the sequence of states and
    the alphabet of the states. If there is no known state path, then
    the sequence of states should be an empty string.
    """
    # A non-empty state path must line up one-to-one with the emissions.
    if state_path:
        assert len(emissions) == len(state_path), \
            "State path does not match associated emissions."
    self.emissions = emissions
    self.states = state_path
47
48
50 """Provide generic functionality needed in all trainers.
51 """
def __init__(self, markov_model):
    """Initialize a trainer with the Markov Model whose parameters we estimate."""
    self._markov_model = markov_model
54
56 """Calculate the log likelihood of the training seqs.
57
58 Arguments:
59
60 o probabilities -- A list of the probabilities of each training
61 sequence under the current parameters, calculated using the forward
62 algorithm.
63 """
64 total_likelihood = 0
65 for probability in probabilities:
66 total_likelihood += math.log(probability)
67
68 return total_likelihood
69
71 """Get a maximum likelihood estimation of transition and emmission.
72
73 Arguments:
74
75 o transition_counts -- A dictionary with the total number of counts
76 of transitions between two states.
77
78 o emissions_counts -- A dictionary with the total number of counts
79 of emmissions of a particular emission letter by a state letter.
80
81 This then returns the maximum likelihood estimators for the
82 transitions and emissions, estimated by formulas 3.18 in
83 Durbin et al:
84
85 a_{kl} = A_{kl} / sum(A_{kl'})
86 e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))
87
88 Returns:
89 Transition and emission dictionaries containing the maximum
90 likelihood estimators.
91 """
92
93 ml_transitions = self.ml_estimator(transition_counts)
94 ml_emissions = self.ml_estimator(emission_counts)
95
96 return ml_transitions, ml_emissions
97
99 """Calculate the maximum likelihood estimator.
100
101 This can calculate maximum likelihoods for both transitions
102 and emissions.
103
104 Arguments:
105
106 o counts -- A dictionary of the counts for each item.
107
108 See estimate_params for a description of the formula used for
109 calculation.
110 """
111
112 all_ordered = sorted(counts)
113
114 ml_estimation = {}
115
116
117 cur_letter = None
118 cur_letter_counts = 0
119
120 for cur_item in all_ordered:
121
122 if cur_item[0] != cur_letter:
123
124 cur_letter = cur_item[0]
125
126
127 cur_letter_counts = counts[cur_item]
128
129
130 cur_position = all_ordered.index(cur_item) + 1
131
132
133
134 while (cur_position < len(all_ordered) and
135 all_ordered[cur_position][0] == cur_item[0]):
136 cur_letter_counts += counts[all_ordered[cur_position]]
137 cur_position += 1
138
139 else:
140 pass
141
142
143 cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
144 ml_estimation[cur_item] = cur_ml
145
146 return ml_estimation
147
148
150 """Trainer that uses the Baum-Welch algorithm to estimate parameters.
151
152 These should be used when a training sequence for an HMM has unknown
153 paths for the actual states, and you need to make an estimation of the
154 model parameters from the observed emissions.
155
156 This uses the Baum-Welch algorithm, first described in
157 Baum, L.E. 1972. Inequalities. 3:1-8
158 This is based on the description in 'Biological Sequence Analysis' by
159 Durbin et al. in section 3.3
160
161 This algorithm is guaranteed to converge to a local maximum, but not
162 necessarily to the global maximum, so use with care!
163 """
165 """Initialize the trainer.
166
167 Arguments:
168
169 o markov_model - The model we are going to estimate parameters for.
170 This should have the parameters with some initial estimates, that
171 we can build from.
172 """
173 AbstractTrainer.__init__(self, markov_model)
174
177 """Estimate the parameters using training sequences.
178
179 The algorithm for this is taken from Durbin et al. p64, so this
180 is a good place to go for a reference on what is going on.
181
182 Arguments:
183
184 o training_seqs -- A list of TrainingSequence objects to be used
185 for estimating the parameters.
186
187 o stopping_criteria -- A function, that when passed the change
188 in log likelihood and threshold, will indicate if we should stop
189 the estimation iterations.
190
191 o dp_method -- A class instance specifying the dynamic programming
192 implementation we should use to calculate the forward and
193 backward variables. By default, we use the scaling method.
194 """
195 prev_log_likelihood = None
196 num_iterations = 1
197
198 while True:
199 transition_count = self._markov_model.get_blank_transitions()
200 emission_count = self._markov_model.get_blank_emissions()
201
202
203 all_probabilities = []
204
205 for training_seq in training_seqs:
206
207 DP = dp_method(self._markov_model, training_seq)
208 forward_var, seq_prob = DP.forward_algorithm()
209 backward_var = DP.backward_algorithm()
210
211 all_probabilities.append(seq_prob)
212
213
214 transition_count = self.update_transitions(transition_count,
215 training_seq,
216 forward_var,
217 backward_var,
218 seq_prob)
219 emission_count = self.update_emissions(emission_count,
220 training_seq,
221 forward_var,
222 backward_var,
223 seq_prob)
224
225
226 ml_transitions, ml_emissions = \
227 self.estimate_params(transition_count, emission_count)
228 self._markov_model.transition_prob = ml_transitions
229 self._markov_model.emission_prob = ml_emissions
230
231 cur_log_likelihood = self.log_likelihood(all_probabilities)
232
233
234
235 if prev_log_likelihood is not None:
236
237
238
239 log_likelihood_change = abs(abs(cur_log_likelihood) -
240 abs(prev_log_likelihood))
241
242
243
244 if stopping_criteria(log_likelihood_change, num_iterations):
245 break
246
247
248 prev_log_likelihood = cur_log_likelihood
249 num_iterations += 1
250
251 return self._markov_model
252
def update_transitions(self, transition_counts, training_seq,
                       forward_vars, backward_vars, training_seq_prob):
    """Add the contribution of a new training sequence to the transitions.

    Arguments:

    o transition_counts -- A dictionary of the current counts for the
    transitions

    o training_seq -- The training sequence we are working with

    o forward_vars -- Probabilities calculated using the forward
    algorithm.

    o backward_vars -- Probabilities calculated using the backwards
    algorithm.

    o training_seq_prob - The probability of the current sequence.

    This calculates A_{kl} (the estimated transition counts from state
    k to state l) using formula 3.20 in Durbin et al.
    """
    # Current model parameters used inside the expectation.
    transitions = self._markov_model.transition_prob
    emissions = self._markov_model.emission_prob

    for k in training_seq.states.alphabet.letters:
        # Only transitions the model actually allows out of k.
        for l in self._markov_model.transitions_from(k):
            expected = 0

            # Formula 3.20: sum over positions of
            # f_k(i) * a_kl * e_l(x_{i+1}) * b_l(i+1)
            for i in range(len(training_seq.emissions) - 1):
                expected += (forward_vars[(k, i)] *
                             transitions[(k, l)] *
                             emissions[(l, training_seq.emissions[i + 1])] *
                             backward_vars[(l, i + 1)])

            # Normalize by the sequence probability before accumulating.
            transition_counts[(k, l)] += (float(expected) /
                                          training_seq_prob)

    return transition_counts
305
def update_emissions(self, emission_counts, training_seq,
                     forward_vars, backward_vars, training_seq_prob):
    """Add the contribution of a new training sequence to the emissions.

    Arguments:

    o emission_counts -- A dictionary of the current counts for the
    emissions

    o training_seq -- The training sequence we are working with

    o forward_vars -- Probabilities calculated using the forward
    algorithm.

    o backward_vars -- Probabilities calculated using the backwards
    algorithm.

    o training_seq_prob - The probability of the current sequence.

    This calculates E_{k}(b) (the estimated emission probability for
    emission letter b from state k) using formula 3.21 in Durbin et al.
    """
    for k in training_seq.states.alphabet.letters:
        for b in training_seq.emissions.alphabet.letters:
            # Formula 3.21: sum f_k(i) * b_k(i) over every position i
            # where the sequence actually emitted letter b.
            expected = sum(
                forward_vars[(k, i)] * backward_vars[(k, i)]
                for i in range(len(training_seq.emissions))
                if training_seq.emissions[i] == b)

            # Normalize by the sequence probability before accumulating.
            emission_counts[(k, b)] += (float(expected) /
                                        training_seq_prob)

    return emission_counts
347
348
350 """Estimate probabilities with known state sequences.
351
352 This should be used for direct estimation of emission and transition
353 probabilities when both the state path and emission sequence are
354 known for the training examples.
355 """
358
def train(self, training_seqs):
    """Estimate the Markov Model parameters with known state paths.

    This trainer requires that both the state and the emissions are
    known for all of the training sequences in the list of
    TrainingSequence objects.
    This training will then count all of the transitions and emissions,
    and use this to estimate the parameters of the model.
    """
    # Start from zeroed count dictionaries supplied by the model.
    transition_counts = self._markov_model.get_blank_transitions()
    emission_counts = self._markov_model.get_blank_emissions()

    # Tally the observed emissions and state-to-state transitions.
    for training_seq in training_seqs:
        emission_counts = self._count_emissions(training_seq,
                                                emission_counts)
        transition_counts = self._count_transitions(training_seq.states,
                                                    transition_counts)

    # Convert the raw counts into maximum likelihood probabilities and
    # install them on the model.
    ml_transitions, ml_emissions = self.estimate_params(transition_counts,
                                                        emission_counts)
    self._markov_model.transition_prob = ml_transitions
    self._markov_model.emission_prob = ml_emissions

    return self._markov_model
386
388 """Add emissions from the training sequence to the current counts.
389
390 Arguments:
391
392 o training_seq -- A TrainingSequence with states and emissions
393 to get the counts from
394
395 o emission_counts -- The current emission counts to add to.
396 """
397 for index in range(len(training_seq.emissions)):
398 cur_state = training_seq.states[index]
399 cur_emission = training_seq.emissions[index]
400
401 try:
402 emission_counts[(cur_state, cur_emission)] += 1
403 except KeyError:
404 raise KeyError("Unexpected emission (%s, %s)"
405 % (cur_state, cur_emission))
406 return emission_counts
407
409 """Add transitions from the training sequence to the current counts.
410
411 Arguments:
412
413 o state_seq -- A Seq object with the states of the current training
414 sequence.
415
416 o transition_counts -- The current transition counts to add to.
417 """
418 for cur_pos in range(len(state_seq) - 1):
419 cur_state = state_seq[cur_pos]
420 next_state = state_seq[cur_pos + 1]
421
422 try:
423 transition_counts[(cur_state, next_state)] += 1
424 except KeyError:
425 raise KeyError("Unexpected transition (%s, %s)" %
426 (cur_state, next_state))
427
428 return transition_counts
429