
Source Code for Module Bio.HMM.Trainer

# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""Provide trainers which estimate parameters based on training sequences.

These should be used to 'train' a Markov Model prior to actually using
it to decode state paths. When supplied training sequences and a model
to work from, these classes will estimate parameters of the model.

This aims to estimate two parameters:

    - a_{kl} -- the transition probability from state k to state l,
      estimated from the number of k to l transitions in the training data.

    - e_{k}(b) -- the emission probability of letter b from state k,
      estimated from the number of times state k emits letter b in the
      training data.
"""
# standard modules
import math

# local stuff
from .DynamicProgramming import ScaledDPAlgorithms

__docformat__ = "restructuredtext en"

class TrainingSequence(object):
    """Hold a training sequence with emissions and optionally, a state path.
    """
    def __init__(self, emissions, state_path):
        """Initialize a training sequence.

        Arguments:

        o emissions -- A Seq object containing the sequence of emissions in
        the training sequence, and the alphabet of the sequence.

        o state_path -- A Seq object containing the sequence of states and
        the alphabet of the states. If there is no known state path, then
        the sequence of states should be an empty string.
        """
        if len(state_path) > 0:
            assert len(emissions) == len(state_path), \
                "State path does not match associated emissions."
        self.emissions = emissions
        self.states = state_path
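A minimal sketch of how training data can be set up for the trainers below, assuming the legacy alphabet-based Seq API this module was written against; the two-state dice alphabets used here are hypothetical:

    from Bio.Seq import Seq
    from Bio.Alphabet import Alphabet

    class StateAlphabet(Alphabet):
        letters = ["F", "L"]  # hypothetical 'fair' and 'loaded' states

    class RollAlphabet(Alphabet):
        letters = ["1", "2", "3", "4", "5", "6"]  # hypothetical emission letters

    emissions = Seq("354261", RollAlphabet())

    # a sequence with a known state path, suitable for KnownStateTrainer
    known_training_seq = TrainingSequence(emissions, Seq("FFLLFF", StateAlphabet()))

    # a sequence with an unknown state path, suitable for BaumWelchTrainer
    unknown_training_seq = TrainingSequence(emissions, Seq("", StateAlphabet()))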
class AbstractTrainer(object):
    """Provide generic functionality needed in all trainers.
    """
    def __init__(self, markov_model):
        self._markov_model = markov_model

    def log_likelihood(self, probabilities):
        """Calculate the log likelihood of the training seqs.

        Arguments:

        o probabilities -- A list of the probabilities of each training
        sequence under the current parameters, calculated using the forward
        algorithm.
        """
        total_likelihood = 0
        for probability in probabilities:
            total_likelihood += math.log(probability)

        return total_likelihood
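For instance, with hypothetical forward-algorithm probabilities of 0.5 and 0.25 for two training sequences, the joint log likelihood is simply the sum of the individual logs:

    trainer = AbstractTrainer(None)  # log_likelihood does not use the model
    print(trainer.log_likelihood([0.5, 0.25]))  # math.log(0.5) + math.log(0.25), about -2.079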
    def estimate_params(self, transition_counts, emission_counts):
        """Get a maximum likelihood estimation of transition and emission.

        Arguments:

        o transition_counts -- A dictionary with the total number of counts
        of transitions between two states.

        o emission_counts -- A dictionary with the total number of counts
        of emissions of a particular emission letter by a state letter.

        This then returns the maximum likelihood estimators for the
        transitions and emissions, estimated by formulas 3.18 in
        Durbin et al:

        a_{kl} = A_{kl} / sum(A_{kl'})
        e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))

        Returns:
        Transition and emission dictionaries containing the maximum
        likelihood estimators.
        """
        # now calculate the information
        ml_transitions = self.ml_estimator(transition_counts)
        ml_emissions = self.ml_estimator(emission_counts)

        return ml_transitions, ml_emissions
    def ml_estimator(self, counts):
        """Calculate the maximum likelihood estimator.

        This can calculate maximum likelihoods for both transitions
        and emissions.

        Arguments:

        o counts -- A dictionary of the counts for each item.

        See estimate_params for a description of the formula used for
        calculation.
        """
        # get an ordered list of all items
        all_ordered = sorted(counts)

        ml_estimation = {}

        # the total counts for the current letter we are on
        cur_letter = None
        cur_letter_counts = 0

        for cur_item in all_ordered:
            # if we are on a new letter (ie. the first letter of the tuple)
            if cur_item[0] != cur_letter:
                # set the new letter we are working with
                cur_letter = cur_item[0]

                # count up the total counts for this letter
                cur_letter_counts = counts[cur_item]

                # add counts for all other items with the same first letter
                cur_position = all_ordered.index(cur_item) + 1

                # keep adding while we have the same first letter or until
                # we get to the end of the ordered list
                while (cur_position < len(all_ordered) and
                       all_ordered[cur_position][0] == cur_item[0]):
                    cur_letter_counts += counts[all_ordered[cur_position]]
                    cur_position += 1
            # otherwise we've already got the total counts for this letter
            else:
                pass

            # now calculate the ml and add it to the estimation
            cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
            ml_estimation[cur_item] = cur_ml

        return ml_estimation
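As an illustration of the formula above, ml_estimator simply normalizes each count by the total over all items sharing the same first tuple element. A hypothetical transition-count dictionary over two states "F" and "L" gives:

    trainer = AbstractTrainer(None)  # ml_estimator does not touch the model
    transition_counts = {("F", "F"): 8, ("F", "L"): 2,
                         ("L", "F"): 3, ("L", "L"): 3}
    print(trainer.ml_estimator(transition_counts))
    # {('F', 'F'): 0.8, ('F', 'L'): 0.2, ('L', 'F'): 0.5, ('L', 'L'): 0.5}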
class BaumWelchTrainer(AbstractTrainer):
    """Trainer that uses the Baum-Welch algorithm to estimate parameters.

    These should be used when a training sequence for an HMM has unknown
    paths for the actual states, and you need to make an estimation of the
    model parameters from the observed emissions.

    This uses the Baum-Welch algorithm, first described in
    Baum, L.E. 1972. Inequalities 3:1-8.
    This is based on the description in 'Biological Sequence Analysis' by
    Durbin et al. in section 3.3.

    This algorithm is guaranteed to converge to a local maximum, but not
    necessarily to the global maximum, so use with care!
    """
    def __init__(self, markov_model):
        """Initialize the trainer.

        Arguments:

        o markov_model -- The model we are going to estimate parameters for.
        This should have the parameters with some initial estimates, that
        we can build from.
        """
        AbstractTrainer.__init__(self, markov_model)
    def train(self, training_seqs, stopping_criteria,
              dp_method=ScaledDPAlgorithms):
        """Estimate the parameters using training sequences.

        The algorithm for this is taken from Durbin et al. p64, so this
        is a good place to go for a reference on what is going on.

        Arguments:

        o training_seqs -- A list of TrainingSequence objects to be used
        for estimating the parameters.

        o stopping_criteria -- A function that, when passed the change in
        log likelihood and the number of iterations, will indicate whether
        we should stop the estimation iterations.

        o dp_method -- A class specifying the dynamic programming
        implementation we should use to calculate the forward and
        backward variables. By default, we use the scaling method.
        """
        prev_log_likelihood = None
        num_iterations = 1

        while True:
            transition_count = self._markov_model.get_blank_transitions()
            emission_count = self._markov_model.get_blank_emissions()

            # remember all of the sequence probabilities
            all_probabilities = []

            for training_seq in training_seqs:
                # calculate the forward and backward variables
                DP = dp_method(self._markov_model, training_seq)
                forward_var, seq_prob = DP.forward_algorithm()
                backward_var = DP.backward_algorithm()

                all_probabilities.append(seq_prob)

                # update the counts for transitions and emissions
                transition_count = self.update_transitions(transition_count,
                                                           training_seq,
                                                           forward_var,
                                                           backward_var,
                                                           seq_prob)
                emission_count = self.update_emissions(emission_count,
                                                       training_seq,
                                                       forward_var,
                                                       backward_var,
                                                       seq_prob)

            # update the markov model with the new probabilities
            ml_transitions, ml_emissions = \
                self.estimate_params(transition_count, emission_count)
            self._markov_model.transition_prob = ml_transitions
            self._markov_model.emission_prob = ml_emissions

            cur_log_likelihood = self.log_likelihood(all_probabilities)

            # if we have previously calculated the log likelihood (ie.
            # not the first round), see if we can finish
            if prev_log_likelihood is not None:
                # XXX log likelihoods are negatives -- am I calculating
                # the change properly, or should I use the negatives...
                # I'm not sure at all if this is right.
                log_likelihood_change = abs(abs(cur_log_likelihood) -
                                            abs(prev_log_likelihood))

                # check whether we have completed enough iterations to have
                # a good estimation
                if stopping_criteria(log_likelihood_change, num_iterations):
                    break

            # set up for another round of iterations
            prev_log_likelihood = cur_log_likelihood
            num_iterations += 1

        return self._markov_model
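A sketch of a typical Baum-Welch run, assuming a model built with Bio.HMM.MarkovModel.MarkovModelBuilder and the TrainingSequence objects from the earlier example; the stopping threshold and iteration cap are illustrative choices, not prescribed by this module:

    from Bio.HMM.MarkovModel import MarkovModelBuilder

    def stop_training(log_likelihood_change, num_iterations):
        # stop once the improvement is small, or after ten rounds
        return log_likelihood_change < 0.01 or num_iterations >= 10

    builder = MarkovModelBuilder(StateAlphabet(), RollAlphabet())
    builder.allow_all_transitions()
    builder.set_random_probabilities()
    standard_model = builder.get_markov_model()

    trainer = BaumWelchTrainer(standard_model)
    trained_model = trainer.train([unknown_training_seq], stop_training)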
    def update_transitions(self, transition_counts, training_seq,
                           forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the transitions.

        Arguments:

        o transition_counts -- A dictionary of the current counts for the
        transitions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates A_{kl} (the estimated transition counts from state
        k to state l) using formula 3.20 in Durbin et al.
        """
        # set up the transition and emission probabilities we are using
        transitions = self._markov_model.transition_prob
        emissions = self._markov_model.emission_prob

        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            for l in self._markov_model.transitions_from(k):
                estimated_counts = 0
                # now loop over the entire training sequence
                for i in range(len(training_seq.emissions) - 1):
                    # the forward value of k at the current position
                    forward_value = forward_vars[(k, i)]

                    # the backward value of l in the next position
                    backward_value = backward_vars[(l, i + 1)]

                    # the probability of a transition from k to l
                    trans_value = transitions[(k, l)]

                    # the probability of getting the emission at the next pos
                    emm_value = emissions[(l, training_seq.emissions[i + 1])]

                    estimated_counts += (forward_value * trans_value *
                                         emm_value * backward_value)

                # update the transition approximation
                transition_counts[(k, l)] += (float(estimated_counts) /
                                              training_seq_prob)

        return transition_counts
    def update_emissions(self, emission_counts, training_seq,
                         forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the emissions.

        Arguments:

        o emission_counts -- A dictionary of the current counts for the
        emissions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates E_{k}(b) (the estimated emission probability for
        emission letter b from state k) using formula 3.21 in Durbin et al.
        """
        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            # now loop over all of the possible emissions
            for b in training_seq.emissions.alphabet.letters:
                expected_times = 0
                # finally loop over the entire training sequence
                for i in range(len(training_seq.emissions)):
                    # only count the forward and backward probability if the
                    # emission at the position is the same as b
                    if training_seq.emissions[i] == b:
                        # f_{k}(i) b_{k}(i)
                        expected_times += (forward_vars[(k, i)] *
                                           backward_vars[(k, i)])

                # add to E_{k}(b)
                emission_counts[(k, b)] += (float(expected_times) /
                                            training_seq_prob)

        return emission_counts
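For reference, these two methods implement the expected-count formulas 3.20 and 3.21 of Durbin et al., written here in the same plain notation as the docstrings above, with f and b the forward and backward variables and P(x) the training_seq_prob passed in:

    A_{kl}   = (1 / P(x)) * sum over i of [ f_{k}(i) * a_{kl} * e_{l}(x_{i+1}) * b_{l}(i+1) ]
    E_{k}(b) = (1 / P(x)) * sum over i with x_{i} = b of [ f_{k}(i) * b_{k}(i) ]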
class KnownStateTrainer(AbstractTrainer):
    """Estimate probabilities with known state sequences.

    This should be used for direct estimation of emission and transition
    probabilities when both the state path and emission sequence are
    known for the training examples.
    """
    def __init__(self, markov_model):
        AbstractTrainer.__init__(self, markov_model)
    def train(self, training_seqs):
        """Estimate the Markov Model parameters with known state paths.

        This trainer requires that both the state path and the emissions are
        known for all of the training sequences in the list of
        TrainingSequence objects.
        This training will then count all of the transitions and emissions,
        and use this to estimate the parameters of the model.
        """
        # count up all of the transitions and emissions
        transition_counts = self._markov_model.get_blank_transitions()
        emission_counts = self._markov_model.get_blank_emissions()

        for training_seq in training_seqs:
            emission_counts = self._count_emissions(training_seq,
                                                    emission_counts)
            transition_counts = self._count_transitions(training_seq.states,
                                                        transition_counts)

        # update the markov model from the counts
        ml_transitions, ml_emissions = \
            self.estimate_params(transition_counts,
                                 emission_counts)
        self._markov_model.transition_prob = ml_transitions
        self._markov_model.emission_prob = ml_emissions

        return self._markov_model
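A short hypothetical usage sketch, reusing the builder-produced model and the known-path TrainingSequence from the earlier examples:

    trainer = KnownStateTrainer(standard_model)
    trained_model = trainer.train([known_training_seq])

    # the estimated parameters end up as plain dictionaries keyed by
    # (from_state, to_state) and (state, emission) tuples
    print(trained_model.transition_prob)
    print(trained_model.emission_prob)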
    def _count_emissions(self, training_seq, emission_counts):
        """Add emissions from the training sequence to the current counts.

        Arguments:

        o training_seq -- A TrainingSequence with states and emissions
        to get the counts from.

        o emission_counts -- The current emission counts to add to.
        """
        for index in range(len(training_seq.emissions)):
            cur_state = training_seq.states[index]
            cur_emission = training_seq.emissions[index]

            try:
                emission_counts[(cur_state, cur_emission)] += 1
            except KeyError:
                raise KeyError("Unexpected emission (%s, %s)"
                               % (cur_state, cur_emission))
        return emission_counts
    def _count_transitions(self, state_seq, transition_counts):
        """Add transitions from the training sequence to the current counts.

        Arguments:

        o state_seq -- A Seq object with the states of the current training
        sequence.

        o transition_counts -- The current transition counts to add to.
        """
        for cur_pos in range(len(state_seq) - 1):
            cur_state = state_seq[cur_pos]
            next_state = state_seq[cur_pos + 1]

            try:
                transition_counts[(cur_state, next_state)] += 1
            except KeyError:
                raise KeyError("Unexpected transition (%s, %s)" %
                               (cur_state, next_state))

        return transition_counts