1
2
3
4
5
6
7
8
9 from nltk_lite.cluster import *
10
12 """
13 The GAAC clusterer starts with each of the N vectors as singleton
14 clusters. It then iteratively merges pairs of clusters which have the
15 closest centroids. This continues until there is only one cluster. The
16 order of merges gives rise to a dendogram: a tree with the earlier merges
17 lower than later merges. The membership of a given number of clusters c, 1
18 <= c <= N, can be found by cutting the dendogram at depth c.
19
20 This clusterer uses the cosine similarity metric only, which allows for
21 efficient speed-up in the clustering process.
22 """
23
24 - def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
29
30 - def cluster(self, vectors, assign_clusters=False, trace=False):
35
37
38 clusters = [[vector] for vector in vectors]
39
40
41 vector_sum = copy.copy(vectors)
42
43 while len(clusters) > max(self._num_clusters, 1):
44
45
46 best = None
47 for i in range(len(clusters)):
48 for j in range(i + 1, len(clusters)):
49 sim = self._average_similarity(
50 vector_sum[i], len(clusters[i]),
51 vector_sum[j], len(clusters[j]))
52 if not best or sim > best[0]:
53 best = (sim, i, j)
54
55
56 i, j = best[1:]
57 sum = clusters[i] + clusters[j]
58 if trace: print 'merging %d and %d' % (i, j)
59
60 clusters[i] = sum
61 del clusters[j]
62 vector_sum[i] = vector_sum[i] + vector_sum[j]
63 del vector_sum[j]
64
65 self._dendogram.merge(i, j)
66
67 self.update_clusters(self._num_clusters)
68
86
88 best = None
89 for i in range(self._num_clusters):
90 centroid = self._centroids[i]
91 sim = self._average_similarity(vector, 1, centroid, 1)
92 if not best or sim > best[0]:
93 best = (sim, i)
94 return best[1]
95
97 """
98 @return: The dendogram representing the current clustering
99 @rtype: Dendogram
100 """
101 return self._dendogram
102
104 return self._num_clusters
105
110
112 return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
113
115 """
116 Non-interactive demonstration of the clusterers with simple 2-D data.
117 """
118
119 from nltk_lite import cluster
120
121
122 vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
123
124
125 clusterer = cluster.GroupAverageAgglomerative(4)
126 clusters = clusterer.cluster(vectors, True)
127
128 print 'Clusterer:', clusterer
129 print 'Clustered:', vectors
130 print 'As:', clusters
131 print
132
133
134 clusterer.dendogram().show()
135
136
137 vector = array([3, 3])
138 print 'classify(%s):' % vector,
139 print clusterer.classify(vector)
140 print
141
142
143 if __name__ == '__main__':
144 demo()
145