1
2
3
4
5
6 """Provide trainers which estimate parameters based on training sequences.
7
8 These should be used to 'train' a Markov Model prior to actually using
9 it to decode state paths. When supplied training sequences and a model
10 to work from, these classes will estimate parameters of the model.
11
This aims to estimate two parameters:

- a_{kl} -- the probability of a transition from state k to state l,
estimated from the counts of k -> l transitions in the training data.

- e_{k}(b) -- the probability that state k emits the letter b,
estimated from the counts of emissions of b while in state k in the
training data.
19 """
20
21 import math
22
23
24 from .DynamicProgramming import ScaledDPAlgorithms
25
26 __docformat__ = "restructuredtext en"
27
29 """Hold a training sequence with emissions and optionally, a state path.
30 """
def __init__(self, emissions, state_path):
    """Initialize a training sequence.

    Arguments:
    - emissions -- A Seq object containing the sequence of emissions in
      the training sequence, and the alphabet of the sequence.
    - state_path -- A Seq object containing the sequence of states and
      the alphabet of the states. If there is no known state path, then
      the sequence of states should be an empty string.

    Raises:
    - ValueError -- if a non-empty state path does not have the same
      length as the emission sequence.
    """
    # Use a real exception instead of an assert: assert statements are
    # stripped when Python runs with -O, which would silently skip
    # this validation.
    if len(state_path) > 0 and len(emissions) != len(state_path):
        raise ValueError("State path does not match associated emissions.")
    self.emissions = emissions
    self.states = state_path
48
49
51 """Provide generic functionality needed in all trainers.
52 """
def __init__(self, markov_model):
    """Initialize the trainer.

    Arguments:
    - markov_model -- The Markov model whose parameters will be
      estimated; stored for use by the estimation methods.
    """
    self._markov_model = markov_model
55
57 """Calculate the log likelihood of the training seqs.
58
59 Arguments:
60
61 o probabilities -- A list of the probabilities of each training
62 sequence under the current parameters, calculated using the forward
63 algorithm.
64 """
65 total_likelihood = 0
66 for probability in probabilities:
67 total_likelihood += math.log(probability)
68
69 return total_likelihood
70
72 """Get a maximum likelihood estimation of transition and emmission.
73
74 Arguments:
75
76 o transition_counts -- A dictionary with the total number of counts
77 of transitions between two states.
78
79 o emissions_counts -- A dictionary with the total number of counts
80 of emmissions of a particular emission letter by a state letter.
81
82 This then returns the maximum likelihood estimators for the
83 transitions and emissions, estimated by formulas 3.18 in
84 Durbin et al:
85
86 a_{kl} = A_{kl} / sum(A_{kl'})
87 e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))
88
89 Returns:
90 Transition and emission dictionaries containing the maximum
91 likelihood estimators.
92 """
93
94 ml_transitions = self.ml_estimator(transition_counts)
95 ml_emissions = self.ml_estimator(emission_counts)
96
97 return ml_transitions, ml_emissions
98
100 """Calculate the maximum likelihood estimator.
101
102 This can calculate maximum likelihoods for both transitions
103 and emissions.
104
105 Arguments:
106
107 o counts -- A dictionary of the counts for each item.
108
109 See estimate_params for a description of the formula used for
110 calculation.
111 """
112
113 all_ordered = sorted(counts)
114
115 ml_estimation = {}
116
117
118 cur_letter = None
119 cur_letter_counts = 0
120
121 for cur_item in all_ordered:
122
123 if cur_item[0] != cur_letter:
124
125 cur_letter = cur_item[0]
126
127
128 cur_letter_counts = counts[cur_item]
129
130
131 cur_position = all_ordered.index(cur_item) + 1
132
133
134
135 while (cur_position < len(all_ordered) and
136 all_ordered[cur_position][0] == cur_item[0]):
137 cur_letter_counts += counts[all_ordered[cur_position]]
138 cur_position += 1
139
140 else:
141 pass
142
143
144 cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
145 ml_estimation[cur_item] = cur_ml
146
147 return ml_estimation
148
149
151 """Trainer that uses the Baum-Welch algorithm to estimate parameters.
152
153 These should be used when a training sequence for an HMM has unknown
154 paths for the actual states, and you need to make an estimation of the
155 model parameters from the observed emissions.
156
157 This uses the Baum-Welch algorithm, first described in
158 Baum, L.E. 1972. Inequalities. 3:1-8
159 This is based on the description in 'Biological Sequence Analysis' by
160 Durbin et al. in section 3.3
161
162 This algorithm is guaranteed to converge to a local maximum, but not
163 necessarily to the global maxima, so use with care!
164 """
166 """Initialize the trainer.
167
168 Arguments:
169
170 o markov_model - The model we are going to estimate parameters for.
171 This should have the parameters with some initial estimates, that
172 we can build from.
173 """
174 AbstractTrainer.__init__(self, markov_model)
175
178 """Estimate the parameters using training sequences.
179
180 The algorithm for this is taken from Durbin et al. p64, so this
181 is a good place to go for a reference on what is going on.
182
183 Arguments:
184
185 o training_seqs -- A list of TrainingSequence objects to be used
186 for estimating the parameters.
187
188 o stopping_criteria -- A function, that when passed the change
189 in log likelihood and threshold, will indicate if we should stop
190 the estimation iterations.
191
192 o dp_method -- A class instance specifying the dynamic programming
193 implementation we should use to calculate the forward and
194 backward variables. By default, we use the scaling method.
195 """
196 prev_log_likelihood = None
197 num_iterations = 1
198
199 while True:
200 transition_count = self._markov_model.get_blank_transitions()
201 emission_count = self._markov_model.get_blank_emissions()
202
203
204 all_probabilities = []
205
206 for training_seq in training_seqs:
207
208 DP = dp_method(self._markov_model, training_seq)
209 forward_var, seq_prob = DP.forward_algorithm()
210 backward_var = DP.backward_algorithm()
211
212 all_probabilities.append(seq_prob)
213
214
215 transition_count = self.update_transitions(transition_count,
216 training_seq,
217 forward_var,
218 backward_var,
219 seq_prob)
220 emission_count = self.update_emissions(emission_count,
221 training_seq,
222 forward_var,
223 backward_var,
224 seq_prob)
225
226
227 ml_transitions, ml_emissions = \
228 self.estimate_params(transition_count, emission_count)
229 self._markov_model.transition_prob = ml_transitions
230 self._markov_model.emission_prob = ml_emissions
231
232 cur_log_likelihood = self.log_likelihood(all_probabilities)
233
234
235
236 if prev_log_likelihood is not None:
237
238
239
240 log_likelihood_change = abs(abs(cur_log_likelihood) -
241 abs(prev_log_likelihood))
242
243
244
245 if stopping_criteria(log_likelihood_change, num_iterations):
246 break
247
248
249 prev_log_likelihood = cur_log_likelihood
250 num_iterations += 1
251
252 return self._markov_model
253
def update_transitions(self, transition_counts, training_seq,
                       forward_vars, backward_vars, training_seq_prob):
    """Add the contribution of a new training sequence to the transitions.

    Arguments:
    - transition_counts -- A dictionary of the current counts for the
      transitions.
    - training_seq -- The training sequence we are working with.
    - forward_vars -- Probabilities calculated using the forward
      algorithm.
    - backward_vars -- Probabilities calculated using the backwards
      algorithm.
    - training_seq_prob -- The probability of the current sequence.

    This calculates A_{kl} (the estimated transition counts from state
    k to state l) using formula 3.20 in Durbin et al.
    """
    transition_prob = self._markov_model.transition_prob
    emission_prob = self._markov_model.emission_prob

    seq_length = len(training_seq.emissions)
    for from_state in training_seq.states.alphabet.letters:
        for to_state in self._markov_model.transitions_from(from_state):
            # Expected number of from_state -> to_state transitions,
            # summed over every adjacent position pair in the sequence:
            # f_k(i) * a_kl * e_l(x_{i+1}) * b_l(i+1)  (formula 3.20).
            expected = sum(
                forward_vars[(from_state, pos)] *
                transition_prob[(from_state, to_state)] *
                emission_prob[(to_state, training_seq.emissions[pos + 1])] *
                backward_vars[(to_state, pos + 1)]
                for pos in range(seq_length - 1))

            # Normalize by the full probability of the sequence.
            transition_counts[(from_state, to_state)] += \
                float(expected) / training_seq_prob

    return transition_counts
306
def update_emissions(self, emission_counts, training_seq,
                     forward_vars, backward_vars, training_seq_prob):
    """Add the contribution of a new training sequence to the emissions.

    Arguments:
    - emission_counts -- A dictionary of the current counts for the
      emissions.
    - training_seq -- The training sequence we are working with.
    - forward_vars -- Probabilities calculated using the forward
      algorithm.
    - backward_vars -- Probabilities calculated using the backwards
      algorithm.
    - training_seq_prob -- The probability of the current sequence.

    This calculates E_{k}(b) (the estimated emission probability for
    emission letter b from state k) using formula 3.21 in Durbin et al.
    """
    state_letters = training_seq.states.alphabet.letters
    emission_letters = training_seq.emissions.alphabet.letters

    for state in state_letters:
        for letter in emission_letters:
            # Expected number of times `letter` is emitted from `state`:
            # sum f_k(i) * b_k(i) over the positions where the training
            # sequence actually shows this letter (formula 3.21).
            expected = sum(
                forward_vars[(state, pos)] * backward_vars[(state, pos)]
                for pos, emitted in enumerate(training_seq.emissions)
                if emitted == letter)

            # Normalize by the full probability of the sequence.
            emission_counts[(state, letter)] += \
                float(expected) / training_seq_prob

    return emission_counts
348
349
351 """Estimate probabilities with known state sequences.
352
353 This should be used for direct estimation of emission and transition
354 probabilities when both the state path and emission sequence are
355 known for the training examples.
356 """
359
def train(self, training_seqs):
    """Estimate the Markov Model parameters with known state paths.

    This trainer requires that both the state and the emissions are
    known for all of the training sequences in the list of
    TrainingSequence objects.
    This training will then count all of the transitions and emissions,
    and use these counts to estimate the parameters of the model.
    """
    # Accumulate raw counts over every training sequence, starting
    # from the zeroed count dictionaries supplied by the model.
    transition_counts = self._markov_model.get_blank_transitions()
    emission_counts = self._markov_model.get_blank_emissions()
    for seq in training_seqs:
        emission_counts = self._count_emissions(seq, emission_counts)
        transition_counts = self._count_transitions(seq.states,
                                                    transition_counts)

    # Convert the raw counts into maximum likelihood estimates and
    # install them on the model.
    ml_transitions, ml_emissions = self.estimate_params(transition_counts,
                                                        emission_counts)
    self._markov_model.transition_prob = ml_transitions
    self._markov_model.emission_prob = ml_emissions

    return self._markov_model
387
389 """Add emissions from the training sequence to the current counts.
390
391 Arguments:
392
393 o training_seq -- A TrainingSequence with states and emissions
394 to get the counts from
395
396 o emission_counts -- The current emission counts to add to.
397 """
398 for index in range(len(training_seq.emissions)):
399 cur_state = training_seq.states[index]
400 cur_emission = training_seq.emissions[index]
401
402 try:
403 emission_counts[(cur_state, cur_emission)] += 1
404 except KeyError:
405 raise KeyError("Unexpected emission (%s, %s)"
406 % (cur_state, cur_emission))
407 return emission_counts
408
410 """Add transitions from the training sequence to the current counts.
411
412 Arguments:
413
414 o state_seq -- A Seq object with the states of the current training
415 sequence.
416
417 o transition_counts -- The current transition counts to add to.
418 """
419 for cur_pos in range(len(state_seq) - 1):
420 cur_state = state_seq[cur_pos]
421 next_state = state_seq[cur_pos + 1]
422
423 try:
424 transition_counts[(cur_state, next_state)] += 1
425 except KeyError:
426 raise KeyError("Unexpected transition (%s, %s)" %
427 (cur_state, next_state))
428
429 return transition_counts
430