Package nltk_lite :: Package cluster :: Module gaac
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.cluster.gaac

  1  # Natural Language Toolkit: Group Average Agglomerative Clusterer 
  2  # 
  3  # Copyright (C) 2004-2007 University of Melbourne 
  4  # Author: Trevor Cohn <tacohn@cs.mu.oz.au> 
  5  # Porting: Steven Bird <sb@csse.unimelb.edu.au> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  from nltk_lite.cluster import * 
 10   
11 -class GroupAverageAgglomerative(VectorSpace):
12 """ 13 The GAAC clusterer starts with each of the N vectors as singleton 14 clusters. It then iteratively merges pairs of clusters which have the 15 closest centroids. This continues until there is only one cluster. The 16 order of merges gives rise to a dendogram: a tree with the earlier merges 17 lower than later merges. The membership of a given number of clusters c, 1 18 <= c <= N, can be found by cutting the dendogram at depth c. 19 20 This clusterer uses the cosine similarity metric only, which allows for 21 efficient speed-up in the clustering process. 22 """ 23
24 - def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
25 VectorSpace.__init__(self, normalise, svd_dimensions) 26 self._num_clusters = num_clusters 27 self._dendogram = None 28 self._groups_values = None
29
30 - def cluster(self, vectors, assign_clusters=False, trace=False):
31 # stores the merge order 32 self._dendogram = Dendogram( 33 [array(vector, numpy.float64) for vector in vectors]) 34 return VectorSpace.cluster(self, vectors, assign_clusters, trace)
35
36 - def cluster_vectorspace(self, vectors, trace=False):
37 # create a cluster for each vector 38 clusters = [[vector] for vector in vectors] 39 40 # the sum vectors 41 vector_sum = copy.copy(vectors) 42 43 while len(clusters) > max(self._num_clusters, 1): 44 # find the two best candidate clusters to merge, based on their 45 # S(union c_i, c_j) 46 best = None 47 for i in range(len(clusters)): 48 for j in range(i + 1, len(clusters)): 49 sim = self._average_similarity( 50 vector_sum[i], len(clusters[i]), 51 vector_sum[j], len(clusters[j])) 52 if not best or sim > best[0]: 53 best = (sim, i, j) 54 55 # merge them and replace in cluster list 56 i, j = best[1:] 57 sum = clusters[i] + clusters[j] 58 if trace: print 'merging %d and %d' % (i, j) 59 60 clusters[i] = sum 61 del clusters[j] 62 vector_sum[i] = vector_sum[i] + vector_sum[j] 63 del vector_sum[j] 64 65 self._dendogram.merge(i, j) 66 67 self.update_clusters(self._num_clusters)
68
69 - def update_clusters(self, num_clusters):
70 clusters = self._dendogram.groups(num_clusters) 71 self._centroids = [] 72 for cluster in clusters: 73 assert len(cluster) > 0 74 if self._should_normalise: 75 centroid = self._normalise(cluster[0]) 76 else: 77 centroid = array(cluster[0]) 78 for vector in cluster[1:]: 79 if self._should_normalise: 80 centroid += self._normalise(vector) 81 else: 82 centroid += vector 83 centroid /= float(len(cluster)) 84 self._centroids.append(centroid) 85 self._num_clusters = len(self._centroids)
86
87 - def classify_vectorspace(self, vector):
88 best = None 89 for i in range(self._num_clusters): 90 centroid = self._centroids[i] 91 sim = self._average_similarity(vector, 1, centroid, 1) 92 if not best or sim > best[0]: 93 best = (sim, i) 94 return best[1]
95
96 - def dendogram(self):
97 """ 98 @return: The dendogram representing the current clustering 99 @rtype: Dendogram 100 """ 101 return self._dendogram
102
103 - def num_clusters(self):
104 return self._num_clusters
105
106 - def _average_similarity(self, v1, l1, v2, l2):
107 sum = v1 + v2 108 length = l1 + l2 109 return (numpy.dot(sum, sum) - length) / (length * (length - 1))
110
111 - def __repr__(self):
112 return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
113
114 -def demo():
115 """ 116 Non-interactive demonstration of the clusterers with simple 2-D data. 117 """ 118 119 from nltk_lite import cluster 120 121 # use a set of tokens with 2D indices 122 vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] 123 124 # test the GAAC clusterer with 4 clusters 125 clusterer = cluster.GroupAverageAgglomerative(4) 126 clusters = clusterer.cluster(vectors, True) 127 128 print 'Clusterer:', clusterer 129 print 'Clustered:', vectors 130 print 'As:', clusters 131 print 132 133 # show the dendogram 134 clusterer.dendogram().show() 135 136 # classify a new vector 137 vector = array([3, 3]) 138 print 'classify(%s):' % vector, 139 print clusterer.classify(vector) 140 print
141 142 143 if __name__ == '__main__': 144 demo() 145