Package Bio :: Module LogisticRegression
[hide private]
[frames] | [no frames]

Source Code for Module Bio.LogisticRegression

  1  #!/usr/bin/env python 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Code for doing logistic regressions. 
  6   
  7   
  8  Classes: 
  9  LogisticRegression    Holds information for a LogisticRegression classifier. 
 10   
 11   
 12  Functions: 
 13  train        Train a new classifier. 
 14  calculate    Calculate the probabilities of each class, given an observation. 
 15  classify     Classify an observation into a class. 
 16  """ 
 17   
 18  from __future__ import print_function 
 19   
 20  import numpy 
 21  import numpy.linalg 
 22   
 23   
class LogisticRegression(object):
    """Holds information necessary to do logistic regression classification.

    Members:
    beta   List of the weights for each dimension.

    """

    def __init__(self):
        """Create an untrained classifier with an empty weight list."""
        # Weights are filled in by train(); one weight per dimension,
        # plus a leading constant (intercept) term.
        self.beta = []
35 36
def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn]) -> LogisticRegression

    Train a logistic regression classifier on a training set.  xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1.  xs and ys should contain the same number
    of elements.  update_fn is an optional callback function that
    takes as parameters the iteration number and log likelihood.

    Raises ValueError on mismatched or degenerate input, and
    RuntimeError if Newton-Raphson fails to converge within the
    iteration limit.

    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    if set(ys) != set([0, 1]):
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = 'd'

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant (intercept) dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = numpy.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = numpy.transpose(X)
    y = numpy.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = numpy.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X))
        ebetaX = numpy.exp(numpy.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * numpy.log(p) + (1 - y) * numpy.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            # Bug fix: pass the loop counter i, not the builtin iter.
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and half the step size.
            if llik < old_llik:
                stepsize = stepsize / 2.0
                beta = old_beta
            # If I've converged, then stop.
            if numpy.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        W = numpy.identity(N) * p
        Xtyp = numpy.dot(Xt, y - p)             # Calculate the first derivative.
        XtWX = numpy.dot(numpy.dot(Xt, W), X)   # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if numpy.fabs(stepsize - 1.0) > 0.001:
            delta = delta * stepsize
        beta = beta + delta                     # Update beta.
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = [float(x) for x in beta]   # Convert back to a regular list.
    return lr
115 116
def calculate(lr, x):
    """calculate(lr, x) -> list of probabilities

    Calculate the probability for each class.  lr is a
    LogisticRegression object.  x is the observed data.  Returns a
    list of the probability that it fits each class.

    """
    # Prepend the constant (intercept) term to the observation.
    obs = numpy.asarray([1.0] + x)
    # p = e^(beta X) / (1 + e^(beta X)), the probability of class 1.
    score = numpy.exp(numpy.dot(lr.beta, obs))
    prob_one = score / (1 + score)
    return [1 - prob_one, prob_one]
131 132
def classify(lr, x):
    """classify(lr, x) -> 1 or 0

    Classify an observation into a class.

    """
    prob_zero, prob_one = calculate(lr, x)
    return 0 if prob_zero > prob_one else 1
143