from nltk_lite.cluster import *

class EM(VectorSpace):
12 """
13 The Gaussian EM clusterer models the vectors as being produced by
14 a mixture of k Gaussian sources. The parameters of these sources
15 (prior probability, mean and covariance matrix) are then found to
16 maximise the likelihood of the given data. This is done with the
17 expectation maximisation algorithm. It starts with k arbitrarily
18 chosen means, priors and covariance matrices. It then calculates
19 the membership probabilities for each vector in each of the
20 clusters; this is the 'E' step. The cluster parameters are then
21 updated in the 'M' step using the maximum likelihood estimate from
22 the cluster membership probabilities. This process continues until
23 the likelihood of the data does not significantly increase.
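
    Example usage, mirroring the demo at the bottom of this module
    (no output is shown, as none of these statements prints anything):

        >>> vectors = [array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
        >>> clusterer = EM([[4, 2], [4, 2.01]], bias=0.1)
        >>> clusters = clusterer.cluster(vectors, True)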
24 """

    def __init__(self, initial_means, priors=None, covariance_matrices=None,
                 conv_threshold=1e-6, bias=0.1, normalise=False,
                 svd_dimensions=None):
29 """
30 Creates an EM clusterer with the given starting parameters,
31 convergence threshold and vector mangling parameters.
32
33 @param initial_means: the means of the gaussian cluster centers
34 @type initial_means: [seq of] numpy array or seq of SparseArray
35 @param priors: the prior probability for each cluster
36 @type priors: numpy array or seq of float
37 @param covariance_matrices: the covariance matrix for each cluster
38 @type covariance_matrices: [seq of] numpy array
39 @param conv_threshold: maximum change in likelihood before deemed
40 convergent
41 @type conv_threshold: int or float
42 @param bias: variance bias used to ensure non-singular covariance
43 matrices
44 @type bias: float
45 @param normalise: should vectors be normalised to length 1
46 @type normalise: boolean
47 @param svd_dimensions: number of dimensions to use in reducing vector
48 dimensionsionality with SVD
49 @type svd_dimensions: int
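
        Example with explicit priors and covariance matrices (illustrative
        values only; this assumes numpy is available in the calling scope):

            >>> clusterer = EM([[0, 0], [4, 4]],
            ...                priors=[0.5, 0.5],
            ...                covariance_matrices=[numpy.identity(2),
            ...                                     numpy.identity(2)])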
50 """
        VectorSpace.__init__(self, normalise, svd_dimensions)
        self._means = array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to their initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if priors is None:
            priors = self._priors = numpy.ones(self._num_clusters,
                                        numpy.float64) / self._num_clusters
        covariances = self._covariance_matrices
        if covariances is None:
            covariances = self._covariance_matrices = \
                [ numpy.identity(dimensions, numpy.float64)
                  for i in range(self._num_clusters) ]

        # run E and M steps until the log likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace: print 'iteration; loglikelihood', lastl

            # E-step: calculate the membership probability h[i,j] of
            # vector i in cluster j, normalised over the clusters
            h = numpy.zeros((len(vectors), self._num_clusters),
                            numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i,j] = priors[j] * self._gaussian(means[j],
                                               covariances[j], vectors[i])
                h[i,:] /= sum(h[i,:])

            # M-step: update the means, covariance matrices and priors
            # using the maximum likelihood estimates
            for j in range(self._num_clusters):
                new_covariance = numpy.zeros((dimensions, dimensions),
                                             numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i,j] * \
                        numpy.multiply.outer(delta, delta)
                    sum_hj += h[i,j]
                    new_mean += h[i,j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # add a bias term to stop the covariance matrix from
                # becoming singular
                covariances[j] += self._bias * \
                    numpy.identity(dimensions, numpy.float64)

            # recalculate the log likelihood of the data
            llh = self._loglikelihood(vectors, priors, means, covariances)

            # stop when the likelihood no longer improves significantly
            if abs(lastl - llh) < self._conv_threshold:
                converged = True
            lastl = llh

    def classify_vectorspace(self, vector):
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(self._means[j],
                                    self._covariance_matrices[j], vector)
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def _gaussian(self, mean, cvm, x):
        m = len(mean)
        assert cvm.shape == (m, m), \
            'bad sized covariance matrix, %s' % str(cvm.shape)
        try:
            # multivariate Gaussian density:
            # (2 pi)^(-m/2) |cvm|^(-1/2) exp(-0.5 (x-mean)' cvm^-1 (x-mean))
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # the normalisation constant or exponent overflowed, e.g.
            # because cvm is close to singular; treat the density as zero
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * \
                    self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return '<EM Clusterer means=%s>' % list(self._means)

def euclidean_distance(u, v):
    """
    Returns the Euclidean distance between vectors u and v. This is
    equivalent to the length of the vector (u - v).
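
    Example (a 3-4-5 right triangle):

        >>> euclidean_distance(array([0, 0]), array([3, 4]))
        5.0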
170 """
171 diff = u - v
172 return math.sqrt(numpy.dot(diff, diff))

def cosine_distance(u, v):
    """
    Returns the cosine of the angle between vectors u and v. This is equal to
    u.v / |u||v|.
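
    Example: orthogonal vectors have cosine 0, parallel vectors cosine 1:

        >>> cosine_distance(array([1.0, 0.0]), array([0.0, 1.0]))
        0.0
        >>> cosine_distance(array([2.0, 0.0]), array([1.0, 0.0]))
        1.0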
178 """
179 return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))

def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk_lite import cluster

    # cluster three 2-D vectors into two clusters, starting from two
    # nearly identical means
    vectors = [array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EM(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print 'Clustered:', vectors
    print 'As:       ', clusters
    print

    for c in range(2):
        print 'Cluster:', c
        print 'Prior:  ', clusterer._priors[c]
        print 'Mean:   ', clusterer._means[c]
        print 'Covar:  ', clusterer._covariance_matrices[c]
        print

    # classify a new vector
    vector = array([2, 2])
    print 'classify(%s):' % vector,
    print clusterer.classify(vector)

    # show the classification probabilities for a new vector
    vector = array([2, 2])
    print 'classification_probdist(%s):' % vector
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print '%s => %.0f%%' % (sample, pdist.prob(sample) * 100)

if __name__ == '__main__':
    demo()