1
2
3 """
4 This module provides code for doing logistic regressions.
5
6
7 Classes:
8 LogisticRegression Holds information for a LogisticRegression classifier.
9
10
11 Functions:
12 train Train a new classifier.
13 calculate Calculate the probabilities of each class, given an observation.
14 classify Classify an observation into a class.
15 """
16 try:
17 from Numeric import *
18 from LinearAlgebra import *
19 except ImportError, x:
20 raise ImportError, "This module requires Numeric (precursor to NumPy) with the LinearAlgebra lib"
21
22 from Bio import listfns
23
25 """Holds information necessary to do logistic regression
26 classification.
27
28 Members:
29 beta List of the weights for each dimension.
30
31 """
33 """LogisticRegression()"""
34 beta = []
35
36 -def train(xs, ys, update_fn=None, typecode=None):
37 """train(xs, ys[, update_fn]) -> LogisticRegression
38
39 Train a logistic regression classifier on a training set. xs is a
40 list of observations and ys is a list of the class assignments,
41 which should be 0 or 1. xs and ys should contain the same number
42 of elements. update_fn is an optional callback function that
43 takes as parameters that iteration number and log likelihood.
44
45 """
46 if len(xs) != len(ys):
47 raise ValueError, "xs and ys should be the same length."
48 if not xs or not xs[0]:
49 raise ValueError, "No observations or observation of 0 dimension."
50 classes = listfns.items(ys)
51 classes.sort()
52 if classes != [0, 1]:
53 raise ValueError, "Classes should be 0's and 1's"
54 if typecode is None:
55 typecode = Float
56
57
58
59 N, ndims = len(xs), len(xs[0]) + 1
60
61
62 X = ones((N, ndims), typecode)
63 X[:, 1:] = xs
64 Xt = transpose(X)
65 y = asarray(ys, typecode)
66
67
68 beta = zeros(ndims, typecode)
69
70 MAX_ITERATIONS = 500
71 CONVERGE_THRESHOLD = 0.01
72 stepsize = 1.0
73
74
75 iter = 0
76 old_beta = old_llik = None
77 while iter < MAX_ITERATIONS:
78
79 ebetaX = exp(dot(beta, Xt))
80 p = ebetaX / (1+ebetaX)
81
82
83 logp = y*log(p) + (1-y)*log(1-p)
84 llik = sum(logp)
85 if update_fn is not None:
86 update_fn(iter, llik)
87
88
89 if llik < old_llik:
90 stepsize = stepsize / 2.0
91 beta = old_beta
92
93 if old_llik is not None and fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
94 break
95 old_llik, old_beta = llik, beta
96 iter += 1
97
98 W = identity(N) * p
99 Xtyp = dot(Xt, y-p)
100 XtWX = dot(dot(Xt, W), X)
101
102
103
104 delta = dot(inverse(XtWX), Xtyp)
105 if fabs(stepsize-1.0) > 0.001:
106 delta = delta * stepsize
107 beta = beta + delta
108 else:
109 raise AssertionError, "Didn't converge."
110
111 lr = LogisticRegression()
112 lr.beta = map(float, beta)
113 return lr
114
116 """calculate(lr, x) -> list of probabilities
117
118 Calculate the probability for each class. lr is a
119 LogisticRegression object. x is the observed data. Returns a
120 list of the probability that it fits each class.
121
122 """
123
124 x = asarray([1.0] + x)
125
126 ebetaX = exp(dot(lr.beta, x))
127 p = ebetaX / (1+ebetaX)
128 return [1-p, p]
129
131 """classify(lr, x) -> 1 or 0
132
133 Classify an observation into a class.
134
135 """
136 probs = calculate(lr, x)
137 if probs[0] > probs[1]:
138 return 0
139 return 1
140