
Source Code for Module Bio.MarkovModel

# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""
This is an implementation of a state-emitting MarkovModel.  I am using
terminology similar to Manning and Schutze.

Functions:
train_bw        Train a Markov model using the Baum-Welch algorithm.
train_visible   Train a visible Markov model using MLE.
find_states     Find a state sequence that explains some observations.

load            Load a MarkovModel.
save            Save a MarkovModel.

Classes:
MarkovModel     Holds the description of a Markov model.
"""
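
# A minimal usage sketch (not part of the original module; the state names,
# alphabet and observation sequences below are made up for illustration):
#
#     from Bio import MarkovModel
#     states = ["state0", "state1"]
#     alphabet = ["a", "b"]
#     observations = [["a", "b", "b", "a"], ["b", "a", "a", "a", "b"]]
#     mm = MarkovModel.train_bw(states, alphabet, observations)
#     for path, score in MarkovModel.find_states(mm, ["a", "b", "a"]):
#         print(path, score)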

import numpy

try:
    logaddexp = numpy.logaddexp
except AttributeError:
    # Numpy versions older than 1.3 do not contain logaddexp.
    # Once we require Numpy version 1.3 or later, we should revisit this
    # module to see if we can simplify some of the other functions in
    # this module.
    import warnings
    warnings.warn("For optimal speed, please update to Numpy version 1.3 "
                  "or later (current version is %s)" % numpy.__version__)

    def logaddexp(logx, logy):
        if logy - logx > 100:
            return logy
        elif logx - logy > 100:
            return logx
        minxy = min(logx, logy)
        return minxy + numpy.log(numpy.exp(logx-minxy) + numpy.exp(logy-minxy))

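# As an illustration (numbers chosen arbitrarily), adding the probabilities
# 0.001 and 0.002 in log space:
#
#     logaddexp(numpy.log(0.001), numpy.log(0.002))   # approx. numpy.log(0.003)
#
# The early-return branches in the fall-back above simply return the larger
# argument when the two log values differ by more than 100, since the smaller
# term would underflow anyway.
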
def itemindex(values):
    # Map each value to the index of its first occurrence in values.
    d = {}
    entries = enumerate(values[::-1])
    n = len(values)-1
    for index, key in entries:
        d[key] = n-index
    return d

numpy.random.seed()

VERY_SMALL_NUMBER = 1E-300
LOG0 = numpy.log(VERY_SMALL_NUMBER)

class MarkovModel(object):
    def __init__(self, states, alphabet,
                 p_initial=None, p_transition=None, p_emission=None):
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission

    def __str__(self):
        from Bio._py3k import StringIO
        handle = StringIO()
        save(self, handle)
        handle.seek(0)
        return handle.read()

def _readline_and_check_start(handle, start):
    line = handle.readline()
    if not line.startswith(start):
        raise ValueError("I expected %r but got %r" % (start, line))
    return line

def load(handle):
    """load(handle) -> MarkovModel()"""
    # Load the states.
    line = _readline_and_check_start(handle, "STATES:")
    states = line.split()[1:]

    # Load the alphabet.
    line = _readline_and_check_start(handle, "ALPHABET:")
    alphabet = line.split()[1:]

    mm = MarkovModel(states, alphabet)
    N, M = len(states), len(alphabet)

    # Load the initial probabilities.
    mm.p_initial = numpy.zeros(N)
    line = _readline_and_check_start(handle, "INITIAL:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_initial[i] = float(line.split()[-1])

    # Load the transition.
    mm.p_transition = numpy.zeros((N, N))
    line = _readline_and_check_start(handle, "TRANSITION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_transition[i,:] = [float(v) for v in line.split()[1:]]

    # Load the emission.
    mm.p_emission = numpy.zeros((N, M))
    line = _readline_and_check_start(handle, "EMISSION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_emission[i,:] = [float(v) for v in line.split()[1:]]

    return mm

def save(mm, handle):
    """save(mm, handle)"""
    # This will fail if there are spaces in the states or alphabet.
    w = handle.write
    w("STATES: %s\n" % ' '.join(mm.states))
    w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
    w("INITIAL:\n")
    for i in range(len(mm.p_initial)):
        w(" %s: %g\n" % (mm.states[i], mm.p_initial[i]))
    w("TRANSITION:\n")
    for i in range(len(mm.p_transition)):
        w(" %s: %s\n" % (mm.states[i], ' '.join(str(x) for x in mm.p_transition[i])))
    w("EMISSION:\n")
    for i in range(len(mm.p_emission)):
        w(" %s: %s\n" % (mm.states[i], ' '.join(str(x) for x in mm.p_emission[i])))
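
# For a two-state model, save() writes a plain-text layout along these lines
# (the state names and values are illustrative only):
#
#     STATES: rainy sunny
#     ALPHABET: walk shop
#     INITIAL:
#      rainy: 0.5
#      sunny: 0.5
#     TRANSITION:
#      rainy: 0.7 0.3
#      sunny: 0.4 0.6
#     EMISSION:
#      rainy: 0.1 0.9
#      sunny: 0.8 0.2
#
# load() expects exactly this layout back, one row per state per section.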


# XXX allow them to specify starting points
def train_bw(states, alphabet, training_data,
             pseudo_initial=None, pseudo_transition=None, pseudo_emission=None,
             update_fn=None,
             ):
    """train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a list
    of strings that describe the names of each state.  alphabet is a
    list of objects that indicate the allowed outputs.  training_data
    is a list of observations.  Each observation is a list of objects
    from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix, before
    normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError("No training data given.")
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition not shape " +
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission not shape " +
                             "len(states) X len(alphabet)")

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_outputs = []
    indexes = itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])

    # Do some sanity checking on the outputs.
    lengths = [len(x) for x in training_outputs]
    if min(lengths) == 0:
        raise ValueError("I got training data with outputs of length 0")

    # Do the training with baum welch.
    x = _baum_welch(N, M, training_outputs,
                    pseudo_initial=pseudo_initial,
                    pseudo_transition=pseudo_transition,
                    pseudo_emission=pseudo_emission,
                    update_fn=update_fn)
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)

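# A hedged sketch of calling train_bw with pseudo-counts and a progress
# callback (the states, alphabet and data below are invented for
# illustration):
#
#     import numpy
#
#     def report(iteration, log_likelihood):
#         print("iteration %d: log likelihood %f" % (iteration, log_likelihood))
#
#     mm = train_bw(["s1", "s2"], ["x", "y"],
#                   [["x", "y", "y", "x"], ["y", "x", "x"]],
#                   pseudo_transition=numpy.ones((2, 2)),
#                   update_fn=report)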

MAX_ITERATIONS = 1000

def _baum_welch(N, M, training_outputs,
                p_initial=None, p_transition=None, p_emission=None,
                pseudo_initial=None, pseudo_transition=None,
                pseudo_emission=None, update_fn=None):
    # Returns (p_initial, p_transition, p_emission)
    if p_initial is None:
        p_initial = _random_norm(N)
    else:
        p_initial = _copy_and_check(p_initial, (N,))

    if p_transition is None:
        p_transition = _random_norm((N, N))
    else:
        p_transition = _copy_and_check(p_transition, (N, N))
    if p_emission is None:
        p_emission = _random_norm((N, M))
    else:
        p_emission = _copy_and_check(p_emission, (N, M))

    # Do all the calculations in log space to avoid underflows.
    lp_initial = numpy.log(p_initial)
    lp_transition = numpy.log(p_transition)
    lp_emission = numpy.log(p_emission)
    if pseudo_initial is not None:
        lpseudo_initial = numpy.log(pseudo_initial)
    else:
        lpseudo_initial = None
    if pseudo_transition is not None:
        lpseudo_transition = numpy.log(pseudo_transition)
    else:
        lpseudo_transition = None
    if pseudo_emission is not None:
        lpseudo_emission = numpy.log(pseudo_emission)
    else:
        lpseudo_emission = None

    # Iterate through each sequence of output, updating the parameters
    # to the HMM.  Stop when the log likelihoods of the sequences
    # stop varying.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            x = _baum_welch_one(
                N, M, outputs,
                lp_initial, lp_transition, lp_emission,
                lpseudo_initial, lpseudo_transition, lpseudo_emission,)
            llik += x
        if update_fn is not None:
            update_fn(i, llik)
        if prev_llik is not None and numpy.fabs(prev_llik-llik) < 0.1:
            break
        prev_llik = llik
    else:
        raise RuntimeError("HMM did not converge in %d iterations"
                           % MAX_ITERATIONS)

    # Return everything back in normal space.
    return [numpy.exp(x) for x in (lp_initial, lp_transition, lp_emission)]

def _baum_welch_one(N, M, outputs,
                    lp_initial, lp_transition, lp_emission,
                    lpseudo_initial, lpseudo_transition, lpseudo_emission):
    # Do one iteration of Baum-Welch based on a sequence of output.
    # NOTE: This will change the values of lp_initial, lp_transition,
    # and lp_emission in place.
    T = len(outputs)
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)

    # Calculate the probability of traversing each arc for any given
    # transition.
    lp_arc = numpy.zeros((N, N, T))
    for t in range(T):
        k = outputs[t]
        lp_traverse = numpy.zeros((N, N))   # P going over one arc.
        for i in range(N):
            for j in range(N):
                # P(getting to this arc)
                # P(making this transition)
                # P(emitting this character)
                # P(going to the end)
                lp = fmat[i][t] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k] + \
                     bmat[j][t+1]
                lp_traverse[i][j] = lp
        # Normalize the probability for this time step.
        lp_arc[:,:, t] = lp_traverse - _logsum(lp_traverse)

    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = numpy.zeros((N, T))
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i,:, t])

    # Sum of all the transitions out of state i.
    lp_arcout = numpy.zeros(N)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i,:])

    # UPDATE P_INITIAL.
    lp_initial = lp_arcout_t[:, 0]
    if lpseudo_initial is not None:
        lp_initial = _logvecadd(lp_initial, lpseudo_initial)
        lp_initial = lp_initial - _logsum(lp_initial)

    # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the
    # transitions from i to j, normalized by the sum of the
    # transitions out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i, j,:]) - lp_arcout[i]
        if lpseudo_transition is not None:
            lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition)
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])

    # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the
    # transitions out of i when k is observed, divided by the sum of
    # the transitions out of i.
    for i in range(N):
        ksum = numpy.zeros(M)+LOG0   # ksum[k] is the sum of all i with k.
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = logaddexp(ksum[k], lp_arc[i, j, t])
        ksum = ksum - _logsum(ksum)          # Normalize
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)      # Renormalize
        lp_emission[i,:] = ksum

    # Calculate the log likelihood of the output based on the forward
    # matrix.  Since the parameters of the HMM have changed, the log
    # likelihoods are going to be a step behind, and we might be doing
    # one extra iteration of training.  The alternative is to rerun
    # the _forward algorithm and calculate from the clean one, but
    # that may be more expensive than overshooting the training by one
    # step.
    return _logsum(fmat[:, T])

def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    # Implement the forward algorithm.  This actually calculates a
    # Nx(T+1) matrix, where the last column is the total probability
    # of the output.

    matrix = numpy.zeros((N, T+1))

    # Initialize the first column to be the initial values.
    matrix[:, 0] = lp_initial
    for t in range(1, T+1):
        k = outputs[t-1]
        for j in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t-1.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t-1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[j][t] = lprob
    return matrix

def _backward(N, T, lp_transition, lp_emission, outputs):
    matrix = numpy.zeros((N, T+1))
    for t in range(T-1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t+1.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t+1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[i][t] = lprob
    return matrix

def train_visible(states, alphabet, training_data,
                  pseudo_initial=None, pseudo_transition=None,
                  pseudo_emission=None):
    """train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) where outputs is a list of the emissions
    from the alphabet, and observed states is a list of states from
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix.

    """
    N, M = len(states), len(alphabet)
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition not shape " +
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission not shape " +
                             "len(states) X len(alphabet)")

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_states, training_outputs = [], []
    states_indexes = itemindex(states)
    outputs_indexes = itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError("states and outputs not aligned")
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])

    x = _mle(N, M, training_outputs, training_states,
             pseudo_initial, pseudo_transition, pseudo_emission)
    p_initial, p_transition, p_emission = x

    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)

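# A minimal sketch of training a visible model; each training item pairs an
# output sequence with its observed state sequence (names invented for
# illustration):
#
#     training_data = [(["x", "y", "y"], ["s1", "s1", "s2"]),
#                      (["y", "x"], ["s2", "s1"])]
#     mm = train_visible(["s1", "s2"], ["x", "y"], training_data)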
def _mle(N, M, training_outputs, training_states, pseudo_initial,
         pseudo_transition, pseudo_emission):
    # p_initial is the probability that a sequence of states starts
    # off with a particular one.
    p_initial = numpy.zeros(N)
    if pseudo_initial is not None:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition is the probability that a state leads to the next
    # one.  C(i,j)/C(i) where i and j are states.
    p_transition = numpy.zeros((N, N))
    if pseudo_transition is not None:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states)-1):
            i, j = states[n], states[n+1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:])

    # p_emission is the probability of an output given a state.
    # C(s,o)/C(s) where o is an output and s is a state.
    # Start from add-one counts so that unseen emissions keep a nonzero
    # probability, then add any pseudo-counts on top.
    p_emission = numpy.ones((N, M))
    if pseudo_emission is not None:
        p_emission = p_emission + pseudo_emission
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:])

    return p_initial, p_transition, p_emission

def _argmaxes(vector, allowance=None):
    # Return the indexes of the largest elements in the vector.  Only the
    # single best index is returned; allowance is accepted for interface
    # compatibility but is currently unused.
    return [numpy.argmax(vector)]

def find_states(markov_model, output):
    """find_states(markov_model, output) -> list of (states, score)"""
    mm = markov_model
    N = len(mm.states)

    # _viterbi does calculations in log space.  Add a tiny bit to the
    # matrices so that the logs will not break.
    lp_initial = numpy.log(mm.p_initial + VERY_SMALL_NUMBER)
    lp_transition = numpy.log(mm.p_transition + VERY_SMALL_NUMBER)
    lp_emission = numpy.log(mm.p_emission + VERY_SMALL_NUMBER)
    # Change output into a list of indexes into the alphabet.
    indexes = itemindex(mm.alphabet)
    output = [indexes[x] for x in output]

    # Run the viterbi algorithm.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)

    for i in range(len(results)):
        states, score = results[i]
        results[i] = [mm.states[x] for x in states], numpy.exp(score)
    return results

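# find_states returns a list of (state sequence, score) pairs.  A sketch of
# reading the best answer (the variable names are illustrative):
#
#     results = find_states(mm, ["x", "y", "y"])
#     best_path, best_score = results[0]
#     # best_path is a list of state names such as ["s1", "s1", "s2"];
#     # best_score is the exponentiated Viterbi log score for that path.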
def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    # The Viterbi algorithm finds the most likely set of states for a
    # given output.  Returns a list of states.

    T = len(output)
    # Store the backtrace in a NxT matrix.
    backtrace = []    # list of indexes of states in previous timestep.
    for i in range(N):
        backtrace.append([None] * T)

    # Store the best scores.
    scores = numpy.zeros((N, T))
    scores[:, 0] = lp_initial + lp_emission[:, output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:, t-1] + \
                       lp_transition[:, j] + \
                       lp_emission[j, k]
            indexes = _argmaxes(i_scores)
            scores[j, t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes

    # Do the backtrace.  First, find a good place to start.  Then,
    # we'll follow the backtrace matrix to find the list of states.
    # In the event of ties, there may be multiple paths back through
    # the matrix, which implies a recursive solution.  We'll simulate
    # it by keeping our own stack.
    in_process = []    # list of (t, states, score)
    results = []       # return values.  list of (states, score)
    indexes = _argmaxes(scores[:, T-1])    # pick the first place
    for i in indexes:
        in_process.append((T-1, [i], scores[i][T-1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            results.append((states, score))
        else:
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t-1, [i]+states, score))
    return results

def _normalize(matrix):
    # Make sure numbers add up to 1.0
    if len(matrix.shape) == 1:
        matrix = matrix / float(sum(matrix))
    elif len(matrix.shape) == 2:
        # Normalize by rows.
        for i in range(len(matrix)):
            matrix[i,:] = matrix[i,:] / sum(matrix[i,:])
    else:
        raise ValueError("I cannot handle matrices of that shape")
    return matrix

def _uniform_norm(shape):
    matrix = numpy.ones(shape)
    return _normalize(matrix)

def _random_norm(shape):
    matrix = numpy.random.random(shape)
    return _normalize(matrix)

def _copy_and_check(matrix, desired_shape):
    # Copy the matrix.
    matrix = numpy.array(matrix, copy=1)
    # Check the dimensions.
    if matrix.shape != desired_shape:
        raise ValueError("Incorrect dimension")
    # Make sure it's normalized.
    if len(matrix.shape) == 1:
        if numpy.fabs(sum(matrix)-1.0) > 0.01:
            raise ValueError("matrix not normalized to 1.0")
    elif len(matrix.shape) == 2:
        for i in range(len(matrix)):
            if numpy.fabs(sum(matrix[i])-1.0) > 0.01:
                raise ValueError("matrix %d not normalized to 1.0" % i)
    else:
        raise ValueError("I don't handle matrices > 2 dimensions")
    return matrix

def _logsum(matrix):
    if len(matrix.shape) > 1:
        vec = numpy.reshape(matrix, (numpy.product(matrix.shape),))
    else:
        vec = matrix
    sum = LOG0
    for num in vec:
        sum = logaddexp(sum, num)
    return sum

def _logvecadd(logvec1, logvec2):
    assert len(logvec1) == len(logvec2), "vectors aren't the same length"
    sumvec = numpy.zeros(len(logvec1))
    for i in range(len(logvec1)):
        sumvec[i] = logaddexp(logvec1[i], logvec2[i])
    return sumvec

def _exp_logsum(numbers):
    sum = _logsum(numbers)
    return numpy.exp(sum)