1
2
3
4
5
6
7
8
9 from nltk_lite.cluster import *
10
12 """
13 The K-means clusterer starts with k arbitrary chosen means then allocates
14 each vector to the cluster with the closest mean. It then recalculates the
15 means of each cluster as the centroid of the vectors in the cluster. This
16 process repeats until the cluster memberships stabilise. This is a
17 hill-climbing algorithm which may converge to a local maximum. Hence the
18 clustering is often repeated with random initial means and the most
19 commonly occuring output means are chosen.
20 """
21
def __init__(self, num_means, distance, repeats=1,
             conv_test=1e-6, initial_means=None,
             normalise=False, svd_dimensions=None,
             rng=None):
    """
    Configure a K-means clusterer.

    @param num_means: the number of means to use (may use fewer)
    @type num_means: int
    @param distance: measure of distance between two vectors
    @type distance: function taking two vectors and returning a float
    @param repeats: number of randomised clustering trials to use
    @type repeats: int
    @param conv_test: maximum variation in mean differences before
                      deemed convergent
    @type conv_test: number
    @param initial_means: set of k initial means
    @type initial_means: sequence of vectors
    @param normalise: should vectors be normalised to length 1
    @type normalise: boolean
    @param svd_dimensions: number of dimensions to use in reducing vector
                           dimensionality with SVD
    @type svd_dimensions: int
    @param rng: random number generator (or None)
    @type rng: Random
    """
    VectorSpace.__init__(self, normalise, svd_dimensions)
    self._num_means = num_means
    self._distance = distance
    self._max_difference = conv_test
    # supplying initial means fixes them for a single trial only
    assert not initial_means or len(initial_means) == num_means
    self._means = initial_means
    assert repeats >= 1
    assert not (initial_means and repeats > 1)
    self._repeats = repeats
    # fall back to a fresh (unseeded) generator when none is supplied
    self._rng = rng if rng else random.Random()
57
def cluster_vectorspace(self, vectors, trace=False):
    """
    Run ``self._repeats`` independent k-means trials over *vectors* and
    keep the set of means that is closest, in aggregate, to the means
    found by all the other trials.

    @param vectors: the (possibly SVD-reduced) vectors to cluster
    @param trace: print progress information when true
    """
    if self._means and self._repeats > 1:
        print('Warning: means will be discarded for subsequent trials')

    meanss = []
    for trial in range(self._repeats):
        if trace:
            print('k-means trial %d' % trial)
        # Draw fresh random initial means for every trial after the
        # first.  (The original tested ``trial > 1``, so trial 1 reran
        # from trial 0's already-converged means -- a wasted duplicate.)
        if not self._means or trial > 0:
            self._means = self._rng.sample(list(vectors), self._num_means)
        self._cluster_vectorspace(vectors, trace)
        meanss.append(self._means)

    if len(meanss) > 1:
        # sort the means of each trial into a canonical order (by
        # component sum) so corresponding clusters line up across trials
        for means in meanss:
            means.sort(key=sum)

        # pick the means set with the smallest total distance to all
        # the other trials' means sets
        min_difference = min_means = None
        for i in range(len(meanss)):
            d = 0
            for j in range(len(meanss)):
                if i != j:
                    d += self._sum_distances(meanss[i], meanss[j])
            if min_difference is None or d < min_difference:
                min_difference, min_means = d, meanss[i]

        self._means = min_means
88
def _cluster_vectorspace(self, vectors, trace=False):
    """
    Perform one k-means run: repeatedly assign each vector to the
    cluster with the closest mean, then recompute each mean as its
    cluster's centroid, until the means move by less than the
    convergence threshold.  ``self._means`` holds the result.

    Does nothing when there are fewer vectors than requested means.
    """
    if self._num_means < len(vectors):
        converged = False
        while not converged:
            # assign the vectors to clusters by nearest mean
            clusters = [[] for _ in range(self._num_means)]
            for vector in vectors:
                index = self.classify_vectorspace(vector)
                clusters[index].append(vector)

            if trace:
                print('iteration')

            # recalculate cluster means by computing the centroid of
            # each cluster.  list() matters: under Python 3 ``map``
            # returns a one-shot iterator, which would be exhausted by
            # the distance measurement below, leaving empty means.
            new_means = list(map(self._centroid, clusters))

            # measure the degree of change from the previous step
            difference = self._sum_distances(self._means, new_means)
            if difference < self._max_difference:
                converged = True

            # remember the new means
            self._means = new_means
115
def classify_vectorspace(self, vector):
    """
    Return the index of the mean closest to *vector* under the
    clusterer's distance function (``None`` only if there are no means).
    """
    # find the closest cluster mean
    best_distance = best_index = None
    for index, mean in enumerate(self._means):
        dist = self._distance(vector, mean)
        # ``is None`` rather than ``== None``: correct identity test,
        # and safe should a distance ever be an array-like value
        if best_distance is None or dist < best_distance:
            best_index, best_distance = index, dist
    return best_index
126
def num_clusters(self):
    """
    Return the number of clusters: the count of the current means when
    they exist, otherwise the requested number of means.
    """
    return len(self._means) if self._means else self._num_means
132
134 """
135 The means used for clustering.
136 """
137 return self._means
138
def _sum_distances(self, vectors1, vectors2):
    """
    Return the total pairwise distance between two equal-length
    sequences of vectors, summed element-by-element.
    """
    # start from 0.0 so an empty input still yields a float
    return sum((self._distance(u, v)
                for u, v in zip(vectors1, vectors2)), 0.0)
144
def __repr__(self):
    """Return a debug representation showing the means and trial count."""
    return '<KMeans Clusterer means=%s repeats=%d>' % (
        self._means, self._repeats)
155
def _vector_compare(x, y):
    """
    Three-way comparison of two vectors by the sum of their components:
    -1, 0 or 1 as sum(x) is less than, equal to, or greater than sum(y).
    """
    xs, ys = sum(x), sum(y)
    # bool subtraction yields exactly -1, 0 or 1
    return (xs > ys) - (xs < ys)
161
162
163
def demo():
    """
    Non-interactive demonstration of the K-means clusterer on small
    2-dimensional examples.  Prints the clusters, the learned means and
    a sample classification.
    """
    # importing here avoids a circular import at module load time
    from nltk_lite import cluster

    vectors = [array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = cluster.KMeans(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    # single-argument %-formatted prints emit identical text under
    # both Python 2 (print statement) and Python 3 (print function)
    print('Clustered: %s' % (vectors,))
    print('As: %s' % (clusters,))
    print('Means: %s' % (clusterer.means(),))
    print('')

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # cluster with 2 means, repeating 10 times with random initial means
    clusterer = cluster.KMeans(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered: %s' % (vectors,))
    print('As: %s' % (clusters,))
    print('Means: %s' % (clusterer.means(),))
    print('')

    # classify a new vector against the learned means
    vector = array([3, 3])
    print('classify(%s): %s' % (vector, clusterer.classify(vector)))
    print('')

if __name__ == '__main__':
    demo()