
Source Code for Module Bio.HMM.Trainer

# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""Provide trainers which estimate parameters based on training sequences.

These should be used to 'train' a Markov Model prior to actually using
it to decode state paths. When supplied training sequences and a model
to work from, these classes will estimate parameters of the model.

This aims to estimate two parameters:

    - a_{kl} -- the transition probability from state k to state l, as
      estimated from the number of k to l transitions in the training
      data.

    - e_{k}(b) -- the probability of state k emitting the letter b, as
      estimated from the number of such emissions in the training data.
"""
# standard modules
import math

# local stuff
from .DynamicProgramming import ScaledDPAlgorithms

__docformat__ = "restructuredtext en"

 28   
class TrainingSequence(object):
    """Hold a training sequence with emissions and, optionally, a state path."""

    def __init__(self, emissions, state_path):
        """Initialize a training sequence.

        Arguments:

        o emissions -- A Seq object containing the sequence of emissions in
        the training sequence, and the alphabet of the sequence.

        o state_path -- A Seq object containing the sequence of states and
        the alphabet of the states. If there is no known state path, then
        the sequence of states should be an empty string.
        """
        if len(state_path) > 0:
            assert len(emissions) == len(state_path), \
                "State path does not match associated emissions."
        self.emissions = emissions
        self.states = state_path

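
# A minimal usage sketch (not part of the original module), assuming the
# classic Bio.Seq/Bio.Alphabet API this module was written against; the
# two alphabet classes below are hypothetical stand-ins for whatever
# emission and state alphabets your model uses.
def _example_training_sequence():
    """Build a TrainingSequence for the occasionally dishonest casino."""
    from Bio.Seq import Seq
    from Bio.Alphabet import Alphabet

    class DiceRollAlphabet(Alphabet):
        letters = ['1', '2', '3', '4', '5', '6']

    class DiceTypeAlphabet(Alphabet):
        letters = ['F', 'L']  # fair or loaded die

    rolls = Seq("1245362", DiceRollAlphabet())
    dice = Seq("FFFFLLL", DiceTypeAlphabet())
    # this uses a known state path; pass Seq("", DiceTypeAlphabet())
    # instead if the path is unknown (e.g. for Baum-Welch training)
    return TrainingSequence(rolls, dice)
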
class AbstractTrainer(object):
    """Provide generic functionality needed in all trainers."""

    def __init__(self, markov_model):
        """Initialize the trainer with the markov model to estimate."""
        self._markov_model = markov_model

    def log_likelihood(self, probabilities):
        """Calculate the log likelihood of the training seqs.

        Arguments:

        o probabilities -- A list of the probabilities of each training
        sequence under the current parameters, calculated using the forward
        algorithm.
        """
        total_likelihood = 0
        for probability in probabilities:
            total_likelihood += math.log(probability)

        return total_likelihood

    def estimate_params(self, transition_counts, emission_counts):
        """Get a maximum likelihood estimation of transitions and emissions.

        Arguments:

        o transition_counts -- A dictionary with the total number of counts
        of transitions between two states.

        o emission_counts -- A dictionary with the total number of counts
        of emissions of a particular emission letter by a state letter.

        This then returns the maximum likelihood estimators for the
        transitions and emissions, estimated by formula 3.18 in
        Durbin et al:

        a_{kl} = A_{kl} / sum(A_{kl'})
        e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))

        Returns:
        Transition and emission dictionaries containing the maximum
        likelihood estimators.
        """
        ml_transitions = self.ml_estimator(transition_counts)
        ml_emissions = self.ml_estimator(emission_counts)

        return ml_transitions, ml_emissions

    def ml_estimator(self, counts):
        """Calculate the maximum likelihood estimator.

        This can calculate maximum likelihoods for both transitions
        and emissions.

        Arguments:

        o counts -- A dictionary of the counts for each item.

        See estimate_params for a description of the formula used for
        calculation.
        """
        # get an ordered list of all items; tuples which share a first
        # letter sort next to each other
        all_ordered = sorted(counts)

        ml_estimation = {}

        # the total counts for the current letter we are on
        cur_letter = None
        cur_letter_counts = 0

        for cur_position, cur_item in enumerate(all_ordered):
            # if we are on a new letter (ie. the first letter of the tuple)
            if cur_item[0] != cur_letter:
                # set the new letter we are working with
                cur_letter = cur_item[0]

                # sum the counts over every item sharing this first
                # letter; these are adjacent in the ordered list
                cur_letter_counts = counts[cur_item]
                next_position = cur_position + 1
                while (next_position < len(all_ordered) and
                       all_ordered[next_position][0] == cur_letter):
                    cur_letter_counts += counts[all_ordered[next_position]]
                    next_position += 1

            # now calculate the ml and add it to the estimation
            cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
            ml_estimation[cur_item] = cur_ml

        return ml_estimation

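
# A worked example (not part of the original module) of what
# ml_estimator computes: counts are normalized within each group that
# shares the same first tuple element, which is exactly
# a_{kl} = A_{kl} / sum(A_{kl'}) from formula 3.18.
def _example_ml_estimation():
    """Normalize a small two-state transition count table."""
    counts = {('k', 'k'): 2, ('k', 'l'): 6,
              ('l', 'k'): 1, ('l', 'l'): 3}
    # ml_estimator never touches the model, so None is safe here
    trainer = AbstractTrainer(None)
    return trainer.ml_estimator(counts)
    # -> {('k', 'k'): 0.25, ('k', 'l'): 0.75,
    #     ('l', 'k'): 0.25, ('l', 'l'): 0.75}
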
class BaumWelchTrainer(AbstractTrainer):
    """Trainer that uses the Baum-Welch algorithm to estimate parameters.

    These should be used when a training sequence for an HMM has unknown
    paths for the actual states, and you need to make an estimation of the
    model parameters from the observed emissions.

    This uses the Baum-Welch algorithm, first described in
    Baum, L.E. 1972. An inequality and associated maximization technique
    in statistical estimation for probabilistic functions of Markov
    processes. Inequalities 3:1-8.
    This is based on the description in 'Biological Sequence Analysis' by
    Durbin et al. in section 3.3.

    This algorithm is guaranteed to converge to a local maximum, but not
    necessarily to the global maximum, so use with care!
    """
    def __init__(self, markov_model):
        """Initialize the trainer.

        Arguments:

        o markov_model -- The model we are going to estimate parameters for.
        This should have the parameters with some initial estimates, that
        we can build from.
        """
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs, stopping_criteria,
              dp_method=ScaledDPAlgorithms):
        """Estimate the parameters using training sequences.

        The algorithm for this is taken from Durbin et al. p64, so this
        is a good place to go for a reference on what is going on.

        Arguments:

        o training_seqs -- A list of TrainingSequence objects to be used
        for estimating the parameters.

        o stopping_criteria -- A function, that when passed the change
        in log likelihood and the number of iterations performed so far,
        will indicate if we should stop the estimation iterations.

        o dp_method -- A class specifying the dynamic programming
        implementation we should use to calculate the forward and
        backward variables. By default, we use the scaling method.
        """
        prev_log_likelihood = None
        num_iterations = 1

        while True:
            transition_count = self._markov_model.get_blank_transitions()
            emission_count = self._markov_model.get_blank_emissions()

            # remember all of the sequence probabilities
            all_probabilities = []

            for training_seq in training_seqs:
                # calculate the forward and backward variables
                DP = dp_method(self._markov_model, training_seq)
                forward_var, seq_prob = DP.forward_algorithm()
                backward_var = DP.backward_algorithm()

                all_probabilities.append(seq_prob)

                # update the counts for transitions and emissions
                transition_count = self.update_transitions(transition_count,
                                                           training_seq,
                                                           forward_var,
                                                           backward_var,
                                                           seq_prob)
                emission_count = self.update_emissions(emission_count,
                                                       training_seq,
                                                       forward_var,
                                                       backward_var,
                                                       seq_prob)

            # update the markov model with the new probabilities
            ml_transitions, ml_emissions = \
                self.estimate_params(transition_count, emission_count)
            self._markov_model.transition_prob = ml_transitions
            self._markov_model.emission_prob = ml_emissions

            cur_log_likelihood = self.log_likelihood(all_probabilities)

            # if we have previously calculated the log likelihood (ie.
            # not the first round), see if we can finish
            if prev_log_likelihood is not None:
                # the change in log likelihood between iterations; the
                # absolute difference is correct regardless of the sign
                # of the log likelihoods
                log_likelihood_change = abs(cur_log_likelihood -
                                            prev_log_likelihood)

                # check whether we have completed enough iterations to have
                # a good estimation
                if stopping_criteria(log_likelihood_change, num_iterations):
                    break

            # set up for another round of iterations
            prev_log_likelihood = cur_log_likelihood
            num_iterations += 1

        return self._markov_model

    def update_transitions(self, transition_counts, training_seq,
                           forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the transitions.

        Arguments:

        o transition_counts -- A dictionary of the current counts for the
        transitions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates A_{kl} (the estimated transition counts from state
        k to state l) using formula 3.20 in Durbin et al.
        """
        # set up the transition and emission probabilities we are using
        transitions = self._markov_model.transition_prob
        emissions = self._markov_model.emission_prob

        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            for l in self._markov_model.transitions_from(k):
                estimated_counts = 0
                # now loop over the entire training sequence
                for i in range(len(training_seq.emissions) - 1):
                    # the forward value of k at the current position
                    forward_value = forward_vars[(k, i)]

                    # the backward value of l in the next position
                    backward_value = backward_vars[(l, i + 1)]

                    # the probability of a transition from k to l
                    trans_value = transitions[(k, l)]

                    # the probability of getting the emission at the next pos
                    emm_value = emissions[(l, training_seq.emissions[i + 1])]

                    estimated_counts += (forward_value * trans_value *
                                         emm_value * backward_value)

                # update the transition approximation
                transition_counts[(k, l)] += (float(estimated_counts) /
                                              training_seq_prob)

        return transition_counts

    def update_emissions(self, emission_counts, training_seq,
                         forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the emissions.

        Arguments:

        o emission_counts -- A dictionary of the current counts for the
        emissions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates E_{k}(b) (the estimated emission counts for
        emission letter b from state k) using formula 3.21 in Durbin et al.
        """
        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            # now loop over all of the possible emissions
            for b in training_seq.emissions.alphabet.letters:
                expected_times = 0
                # finally loop over the entire training sequence
                for i in range(len(training_seq.emissions)):
                    # only count the forward and backward probability if the
                    # emission at the position is the same as b
                    if training_seq.emissions[i] == b:
                        # f_{k}(i) b_{k}(i)
                        expected_times += (forward_vars[(k, i)] *
                                           backward_vars[(k, i)])

                # add to E_{k}(b)
                emission_counts[(k, b)] += (float(expected_times) /
                                            training_seq_prob)

        return emission_counts
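
# A minimal training sketch (not part of the original module).  It
# assumes a MarkovModel instance built elsewhere (e.g. with
# Bio.HMM.MarkovModel.MarkovModelBuilder) and training sequences with
# empty state paths; the threshold values below are arbitrary examples.
def _example_baum_welch(markov_model, training_seqs):
    """Run Baum-Welch until the log likelihood change is small."""
    def stop_training(log_likelihood_change, num_iterations):
        """Stop on a tiny likelihood change, or after 10 iterations."""
        return log_likelihood_change < 0.01 or num_iterations >= 10

    trainer = BaumWelchTrainer(markov_model)
    return trainer.train(training_seqs, stop_training)
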
class KnownStateTrainer(AbstractTrainer):
    """Estimate probabilities with known state sequences.

    This should be used for direct estimation of emission and transition
    probabilities when both the state path and emission sequence are
    known for the training examples.
    """
    def __init__(self, markov_model):
        """Initialize the trainer with the markov model to estimate."""
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs):
        """Estimate the Markov Model parameters with known state paths.

        This trainer requires that both the state and the emissions are
        known for all of the training sequences in the list of
        TrainingSequence objects.
        This training will then count all of the transitions and emissions,
        and use these counts to estimate the parameters of the model.
        """
        # count up all of the transitions and emissions
        transition_counts = self._markov_model.get_blank_transitions()
        emission_counts = self._markov_model.get_blank_emissions()

        for training_seq in training_seqs:
            emission_counts = self._count_emissions(training_seq,
                                                    emission_counts)
            transition_counts = self._count_transitions(training_seq.states,
                                                        transition_counts)

        # update the markov model from the counts
        ml_transitions, ml_emissions = \
            self.estimate_params(transition_counts,
                                 emission_counts)
        self._markov_model.transition_prob = ml_transitions
        self._markov_model.emission_prob = ml_emissions

        return self._markov_model

    def _count_emissions(self, training_seq, emission_counts):
        """Add emissions from the training sequence to the current counts.

        Arguments:

        o training_seq -- A TrainingSequence with states and emissions
        to get the counts from.

        o emission_counts -- The current emission counts to add to.
        """
        for index in range(len(training_seq.emissions)):
            cur_state = training_seq.states[index]
            cur_emission = training_seq.emissions[index]

            try:
                emission_counts[(cur_state, cur_emission)] += 1
            except KeyError:
                raise KeyError("Unexpected emission (%s, %s)"
                               % (cur_state, cur_emission))
        return emission_counts

    def _count_transitions(self, state_seq, transition_counts):
        """Add transitions from the training sequence to the current counts.

        Arguments:

        o state_seq -- A Seq object with the states of the current training
        sequence.

        o transition_counts -- The current transition counts to add to.
        """
        for cur_pos in range(len(state_seq) - 1):
            cur_state = state_seq[cur_pos]
            next_state = state_seq[cur_pos + 1]

            try:
                transition_counts[(cur_state, next_state)] += 1
            except KeyError:
                raise KeyError("Unexpected transition (%s, %s)" %
                               (cur_state, next_state))

        return transition_counts
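
# A minimal training sketch (not part of the original module), assuming
# a MarkovModel instance built elsewhere and TrainingSequence objects
# whose state paths are fully known (as in _example_training_sequence
# above).
def _example_known_state_training(markov_model, training_seqs):
    """Estimate parameters directly from labelled training data."""
    trainer = KnownStateTrainer(markov_model)
    return trainer.train(training_seqs)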