Package Bio :: Module LogisticRegression
[hide private]
[frames] | no frames]

Source Code for Module Bio.LogisticRegression

  1  #!/usr/bin/env python 
  2   
  3  """ 
  4  This module provides code for doing logistic regressions. 
  5   
  6   
  7  Classes: 
  8  LogisticRegression    Holds information for a LogisticRegression classifier. 
  9   
 10   
 11  Functions: 
 12  train        Train a new classifier. 
 13  calculate    Calculate the probabilities of each class, given an observation. 
 14  classify     Classify an observation into a class. 
 15  """ 
 16  try: 
 17      from Numeric import * 
 18      from LinearAlgebra import *  # inverse 
 19  except ImportError, x: 
 20      raise ImportError, "This module requires Numeric (precursor to NumPy) with the LinearAlgebra lib" 
 21   
 22  from Bio import listfns 
 23   
24 -class LogisticRegression:
25 """Holds information necessary to do logistic regression 26 classification. 27 28 Members: 29 beta List of the weights for each dimension. 30 31 """
32 - def __init__(self):
33 """LogisticRegression()""" 34 beta = []
35
36 -def train(xs, ys, update_fn=None, typecode=None):
37 """train(xs, ys[, update_fn]) -> LogisticRegression 38 39 Train a logistic regression classifier on a training set. xs is a 40 list of observations and ys is a list of the class assignments, 41 which should be 0 or 1. xs and ys should contain the same number 42 of elements. update_fn is an optional callback function that 43 takes as parameters that iteration number and log likelihood. 44 45 """ 46 if len(xs) != len(ys): 47 raise ValueError, "xs and ys should be the same length." 48 if not xs or not xs[0]: 49 raise ValueError, "No observations or observation of 0 dimension." 50 classes = listfns.items(ys) 51 classes.sort() 52 if classes != [0, 1]: 53 raise ValueError, "Classes should be 0's and 1's" 54 if typecode is None: 55 typecode = Float 56 57 # Dimensionality of the data is the dimensionality of the 58 # observations plus a constant dimension. 59 N, ndims = len(xs), len(xs[0]) + 1 60 61 # Make an X array, with a constant first dimension. 62 X = ones((N, ndims), typecode) 63 X[:, 1:] = xs 64 Xt = transpose(X) 65 y = asarray(ys, typecode) 66 67 # Initialize the beta parameter to 0. 68 beta = zeros(ndims, typecode) 69 70 MAX_ITERATIONS = 500 71 CONVERGE_THRESHOLD = 0.01 72 stepsize = 1.0 73 # Now iterate using Newton-Raphson until the log-likelihoods 74 # converge. 75 iter = 0 76 old_beta = old_llik = None 77 while iter < MAX_ITERATIONS: 78 # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X)) 79 ebetaX = exp(dot(beta, Xt)) 80 p = ebetaX / (1+ebetaX) 81 82 # Find the log likelihood score and see if I've converged. 83 logp = y*log(p) + (1-y)*log(1-p) 84 llik = sum(logp) 85 if update_fn is not None: 86 update_fn(iter, llik) 87 # Check to see if the likelihood decreased. If it did, then 88 # restore the old beta parameters and half the step size. 89 if llik < old_llik: 90 stepsize = stepsize / 2.0 91 beta = old_beta 92 # If I've converged, then stop. 93 if old_llik is not None and fabs(llik-old_llik) <= CONVERGE_THRESHOLD: 94 break 95 old_llik, old_beta = llik, beta 96 iter += 1 97 98 W = identity(N) * p 99 Xtyp = dot(Xt, y-p) # Calculate the first derivative. 100 XtWX = dot(dot(Xt, W), X) # Calculate the second derivative. 101 #u, s, vt = singular_value_decomposition(XtWX) 102 #print "U", u 103 #print "S", s 104 delta = dot(inverse(XtWX), Xtyp) 105 if fabs(stepsize-1.0) > 0.001: 106 delta = delta * stepsize 107 beta = beta + delta # Update beta. 108 else: 109 raise AssertionError, "Didn't converge." 110 111 lr = LogisticRegression() 112 lr.beta = map(float, beta) # Convert back to regular array. 113 return lr
114
115 -def calculate(lr, x):
116 """calculate(lr, x) -> list of probabilities 117 118 Calculate the probability for each class. lr is a 119 LogisticRegression object. x is the observed data. Returns a 120 list of the probability that it fits each class. 121 122 """ 123 # Insert a constant term for x. 124 x = asarray([1.0] + x) 125 # Calculate the probability. p = e^(beta X) / (1+e^(beta X)) 126 ebetaX = exp(dot(lr.beta, x)) 127 p = ebetaX / (1+ebetaX) 128 return [1-p, p]
129
130 -def classify(lr, x):
131 """classify(lr, x) -> 1 or 0 132 133 Classify an observation into a class. 134 135 """ 136 probs = calculate(lr, x) 137 if probs[0] > probs[1]: 138 return 0 139 return 1
140