11 """
12 Naive Bayes Classifier -- Beta version
13 """
14
15 from operator import itemgetter
16 from nltk_lite.probability import *
17 from nltk_lite.contrib.classify import *
18
20 """
21 The Naive Bayes Classifier is a supervised classifier.
22 It needs to be trained with representative examples of
23 each class. From these examples the classifier
24 calculates the most probable classification of the sample.
25
26
27 P(class) * P(features|class)
28 P(class|features) = -------------------------
29 P(features)
30
31 Internal data structures:
32 _feature_dectector:
33 holds a feature detector function
34 _classes:
35 holds a list of classes supplied during training
36 _cls_prob_dist:
37 hols a Probability Distribution, namely GoodTuringProbDist
38 this structure is defined in probabilty.py in nltk_lite
39 this structure is indexed by classnames
40 _feat_prob_dist:
41 holds Conditional Probability Distribution, conditions are
42 class name, and feature type name
43 these probability distributions are indexed by feature values
44 this structure is defined in probabilty.py in nltk_lite
45 """

    def __init__(self, feature_detector):
        """
        @param feature_detector: a feature detector function (such as one
        produced by detect.feature), which takes a sample to be classified
        (e.g. a string or a list of words) and returns a list of tuples
        (feature_type_name, list of values of this feature type)
        """
        self._feature_detector = feature_detector
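        # Example of the expected detector contract (an illustration only;
        # the "1-tup" feature name is borrowed from demo() below):
        #
        #     fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))]})
        #     fd("ab")   ->   roughly [("1-tup", ['a', 'b'])]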

    def train(self, gold):
        """
        @param gold: dictionary mapping class names to representative examples

        The function takes representative examples of each class and builds
        frequency distributions from them; these freqdists are then used to
        create the probability distributions.
        """
        cls_freq_dist = FreqDist()
        feat_freq_dist = ConditionalFreqDist()
        self._classes = []
        feature_values = {}

        for cls in gold:
            self._classes.append(cls)
            for (fname, fvals) in self._feature_detector(gold[cls]):
                for fval in fvals:
                    # count the class once per observed feature value
                    cls_freq_dist.inc(cls)

                    # count the feature value under its (class, feature type) condition
                    feat_freq_dist[cls, fname].inc(fval)

                    # remember every value seen for this feature type;
                    # used below as the number of bins for smoothing
                    if fname not in feature_values: feature_values[fname] = set()
                    feature_values[fname].add(fval)

        # class probability distribution (Good-Turing smoothed)
        self._cls_prob_dist = GoodTuringProbDist(cls_freq_dist, cls_freq_dist.B())

        # one Good-Turing smoothed distribution over feature values
        # per (class name, feature type name) condition
        def make_probdist(freqdist, (cls, fname)):
            return GoodTuringProbDist(freqdist, len(feature_values[fname]))
        self._feat_prob_dist = ConditionalProbDist(feat_freq_dist, make_probdist, True)
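        # After training on the demo() data below ({"class a": "aaaaaab",
        # "class b": "bbbbbba"} with the "1-tup" detector), the structures
        # hold, roughly (ignoring Good-Turing smoothing):
        #
        #     self._cls_prob_dist.prob("class a")                 ~ 0.5
        #     self._feat_prob_dist["class a", "1-tup"].prob("a")  ~ 6/7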

    def get_class_dict(self, sample):
        """
        @param sample: sample to be classified
        @ret: dictionary mapping class names to probabilities
        """
        return self._naivebayes(sample)

97 """
98 @param sample: sample to be tested
99 @ret: Dictionary (class to probability)
100
101 naivebayes classifier:
102 creates a probability distribution based on sample string
103
104 sums the log probabilities of each feature value
105 for each class and feature type
106 and with the probability of the resepective class
107 """
        sample_feats = self._feature_detector(sample)

        logprob_dict = {}
        score = {}
        for cls in self._classes:
            # start from the log probability of the class itself
            logprob_dict[cls] = self._cls_prob_dist.logprob(cls)

        for fname, fvals in sample_feats:
            for cls in self._classes:
                probdist = self._feat_prob_dist[cls, fname]
                for fval in fvals:
                    # only feature values seen for this (class, feature type)
                    # during training contribute to the score
                    if fval in probdist.samples():
                        logprob_dict[cls] += probdist.logprob(fval)

        # normalize the summed log probabilities into a distribution
        # over classes and copy it into a plain dictionary
        dicttmp = DictionaryProbDist(logprob_dict, normalize=True, log=True)
        for cls in dicttmp.samples():
            score[cls] = dicttmp.prob(cls)

        return score

    def __repr__(self):
        return '<NaiveBayesClassifier: classes=%d>' % len(self._classes)


def demo():
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))],
                         "2-tup": lambda t: [t[n:n+2] for n in range(len(t))]})

    classifier = classify.NaiveBayes(fd)
    training_data = {"class a": "aaaaaab",
                     "class b": "bbbbbba"}
    classifier.train(training_data)

    result = classifier.get_class_dict("aaababb")

    for cls in result:
        print cls, ':', result[cls]

220 """
221 expected values:
222 class_probs a = 0.5
223 b = 0.5
224 class a: 'a' = 6/7
225 'b' = 1/7
226 'aa' = 5/6
227 'ab' = 1/6
228 b: 'a' = 1/7
229 'b' = 6/7
230 'bb' = 5/6
231 'ba' = 1/6
232 sample: 'a' = 4
233 'b' = 3
234 'aa' = 2
235 'ab' = 2
236 'ba' = 1
237 'bb' = 1
238
239 score a: 0.5 * 6/7^4 * 1/7^3 * 5/6^2 * 1/6^2 = 1.5 e-5
240 score b: 0.5 * 1/7^4 * 6/7^3 * 5/6 * 1/6 = 0.0014~
241 """

def demo2():
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    from nltk_lite.corpora import genesis
    from itertools import islice

    fd = detect.feature({"2-tup": lambda t: [' '.join(t)[n:n+2] for n in range(len(' '.join(t))-1)],
                         "words": lambda t: t})

    classifier = classify.NaiveBayes(fd)
    training_data = {}
    training_data["english-kjv"] = list(islice(genesis.raw("english-kjv"), 0, 400))
    training_data["french"] = list(islice(genesis.raw("french"), 0, 400))
    training_data["finnish"] = list(islice(genesis.raw("finnish"), 0, 400))

    classifier.train(training_data)

    result = classifier.get_class_probs(list(islice(genesis.raw("english-kjv"), 150, 200)))

    print 'english-kjv :', result.prob('english-kjv')
    print 'french :', result.prob('french')
    print 'finnish :', result.prob('finnish')

if __name__ == '__main__':
    demo2()