1
2
3
4
5
6 """This provides code for a general Naive Bayes learner.
7
8 Naive Bayes is a supervised classification algorithm that uses Bayes
9 rule to compute the fit between a new observation and some previously
10 observed data. The observations are discrete feature vectors, with
11 the Bayes assumption that the features are independent. Although this
12 is hardly ever true, the classifier works well enough in practice.
13
14 Glossary:
15 observation A feature vector of discrete data.
16 class A possible classification for an observation.
17
18
19 Classes:
20 NaiveBayes Holds information for a naive Bayes classifier.
21
22 Functions:
23 train Train a new naive Bayes classifier.
24 calculate Calculate the probabilities of each class, given an observation.
25 classify Classify an observation into a class.
26
27 """
28
29
30
31
32 try:
33 from Numeric import *
34 except ImportError, x:
35 raise ImportError, "This module requires Numeric (precursor to NumPy)"
36
37 from Bio import mathfns, listfns
38
40 """Holds information for a NaiveBayes classifier.
41
42 Members:
43 classes List of the possible classes of data.
44 p_conditional CLASS x DIM array of dicts of value -> P(value|class,dim)
45 p_prior List of the prior probabilities for every class.
46 dimensionality Dimensionality of the data.
47
48 """
50 self.classes = []
51 self.p_conditional = None
52 self.p_prior = []
53 self.dimensionality = None
54
56 """calculate(nb, observation[, scale]) -> probability dict
57
58 Calculate log P(class|observation) for each class. nb is a NaiveBayes
59 classifier that has been trained. observation is a list representing
60 the observed data. scale is whether the probability should be
61 scaled by P(observation). By default, no scaling is done. The return
62 value is a dictionary where the keys is the class and the value is the
63 log probability of the class.
64
65 """
66
67
68
69
70
71 if len(observation) != nb.dimensionality:
72 raise ValueError, "observation in %d dimension, but classifier in %d" \
73 % (len(observation), nb.dimensionality)
74
75
76 lp_observation_class = []
77 for i in range(len(nb.classes)):
78
79 probs = [None] * len(observation)
80 for j in range(len(observation)):
81 probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
82 lprobs = [mathfns.safe_log(x, -10000) for x in probs]
83 lprob = sum(lprobs)
84 lp_observation_class.append(lprob)
85
86
87 lp_prior = map(math.log, nb.p_prior)
88
89
90 lp_observation = 0.0
91 if scale:
92
93 obs = zeros(len(nb.classes), Float32)
94 for i in range(len(nb.classes)):
95 obs[i] = mathfns.safe_exp(lp_prior[i]+lp_observation_class[i],
96 under=1E-300)
97 lp_observation = math.log(sum(obs))
98
99
100 lp_class_observation = {}
101 for i in range(len(nb.classes)):
102 lp_class_observation[nb.classes[i]] = \
103 lp_observation_class[i] + lp_prior[i] - lp_observation
104
105 return lp_class_observation
106
108 """classify(nb, observation) -> class
109
110 Classify an observation into a class.
111
112 """
113
114 probs = calculate(nb, observation, scale=0)
115 max_prob = max_class = None
116 for klass in nb.classes:
117 if max_prob is None or probs[klass] > max_prob:
118 max_prob, max_class = probs[klass], klass
119 return max_class
120
121 -def train(training_set, results, priors=None, typecode=None):
122 """train(training_set, results[, priors]) -> NaiveBayes
123
124 Train a naive bayes classifier on a training set. training_set is a
125 list of observations. results is a list of the class assignments
126 for each observation. Thus, training_set and results must be the same
127 length. priors is an optional dictionary specifying the prior
128 probabilities for each type of result. If not specified, the priors
129 will be estimated from the training results.
130
131 """
132 if not len(training_set):
133 raise ValueError, "No data in the training set."
134 if len(training_set) != len(results):
135 raise ValueError, "training_set and results should be parallel lists."
136
137
138
139
140
141
142
143
144 dimensions = [len(x) for x in training_set]
145 if min(dimensions) != max(dimensions):
146 raise ValueError, "observations have different dimensionality"
147
148 nb = NaiveBayes()
149 nb.dimensionality = dimensions[0]
150
151
152 nb.classes = listfns.items(results)
153 nb.classes.sort()
154
155
156 if priors is not None:
157 percs = priors
158 else:
159 percs = listfns.contents(results)
160 nb.p_prior = zeros(len(nb.classes), Float16)
161 for i in range(len(nb.classes)):
162 nb.p_prior[i] = percs[nb.classes[i]]
163
164
165
166
167
168
169
170 c2i = listfns.itemindex(nb.classes)
171 observations = [[] for c in nb.classes]
172 for i in range(len(results)):
173 klass, obs = results[i], training_set[i]
174 observations[c2i[klass]].append(obs)
175
176 for i in range(len(observations)):
177
178 observations[i] = asarray(observations[i], typecode)
179
180
181
182 nb.p_conditional = []
183 for i in range(len(nb.classes)):
184 class_observations = observations[i]
185 nb.p_conditional.append([None] * nb.dimensionality)
186 for j in range(nb.dimensionality):
187
188 values = class_observations[:, j]
189
190
191
192
193
194 nb.p_conditional[i][j] = listfns.contents(values)
195 return nb
196