Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  from Numeric import * 
  2  from cluster import * 
  3   
  4   
5 -def _treesort(order, nodeorder, nodecounts, tree):
6 nNodes = len(tree) 7 nElements = nNodes + 1 8 neworder = zeros(nElements,'d') 9 clusterids = range(nElements) 10 for i in range(nNodes): 11 i1 = tree[i].left 12 i2 = tree[i].right 13 if i1 < 0: 14 order1 = nodeorder[-i1-1] 15 count1 = nodecounts[-i1-1] 16 else: 17 order1 = order[i1] 18 count1 = 1 19 if i2 < 0: 20 order2 = nodeorder[-i2-1] 21 count2 = nodecounts[-i2-1] 22 else: 23 order2 = order[i2] 24 count2 = 1 25 # If order1 and order2 are equal, their order is determined by the order in which they were clustered 26 if i1 < i2: 27 if order1 < order2: 28 increase = count1 29 else: 30 increase = count2 31 for j in range(nElements): 32 clusterid = clusterids[j] 33 if clusterid==i1 and order1>=order2: neworder[j] += increase 34 if clusterid==i2 and order1<order2: neworder[j] += increase 35 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1 36 else: 37 if order1<=order2: 38 increase = count1 39 else: 40 increase = count2 41 for j in range(nElements): 42 clusterid = clusterids[j] 43 if clusterid==i1 and order1>order2: neworder[j] += increase 44 if clusterid==i2 and order1<=order2: neworder[j] += increase 45 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1 46 return argsort(neworder)
47
48 -def _savetree(jobname, tree, order, transpose):
49 if transpose==0: 50 extension = ".gtr" 51 keyword = "GENE" 52 else: 53 extension = ".atr" 54 keyword = "ARRY" 55 nnodes = len(tree) 56 outputfile = open(jobname+extension, "w"); 57 nodeindex = 0 58 nodeID = [''] * (nnodes) 59 nodecounts = zeros(nnodes) 60 nodeorder = zeros(nnodes,'d') 61 nodedist = array([node.distance for node in tree]) 62 for nodeindex in range(nnodes): 63 min1 = tree[nodeindex].left 64 min2 = tree[nodeindex].right 65 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1) 66 outputfile.write(nodeID[nodeindex]) 67 outputfile.write("\t") 68 if min1 < 0: 69 index1 = -min1-1 70 order1 = nodeorder[index1] 71 counts1 = nodecounts[index1] 72 outputfile.write(nodeID[index1]+"\t") 73 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1]) 74 else: 75 order1 = order[min1] 76 counts1 = 1 77 outputfile.write("%s%dX\t" % (keyword, min1)) 78 if min2 < 0: 79 index2 = -min2-1 80 order2 = nodeorder[index2] 81 counts2 = nodecounts[index2] 82 outputfile.write(nodeID[index2]+"\t") 83 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2]) 84 else: 85 order2 = order[min2]; 86 counts2 = 1; 87 outputfile.write("%s%dX\t" % (keyword, min2)) 88 outputfile.write(str(1.0-nodedist[nodeindex])) 89 outputfile.write("\n") 90 nodecounts[nodeindex] = counts1 + counts2 91 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / (counts1+counts2) 92 outputfile.close() 93 # Now set up order based on the tree structure 94 index = _treesort(order, nodeorder, nodecounts, tree) 95 return index
96
97 -class DataFile:
98 """DataFile reads a file containing gene expression data following 99 Michael Eisen's format for Cluster/TreeView. A DataFile object 100 has the following members: 101 data: a matrix containing the gene expression data 102 mask: a matrix containing only 1's and 0's, denoting which values 103 are present (1) or missing (0). If all elements of mask are 104 one (no missing data), then mask is set to None. 105 geneid: a list containing a unique identifier for each gene 106 (e.g., ORF name) 107 genename: a list containing an additional description for each gene 108 (e.g., gene name) 109 gweight: the weight to be used for each gene when calculating the 110 distance 111 gorder: an array of real numbers indicating the preferred order of the 112 genes in the output file 113 expid: a list containing a unique identifier for each experimental 114 condition 115 eweight: the weight to be used for each experimental condition when 116 calculating the distance 117 eorder: an array of real numbers indication the preferred order in the 118 output file of the experimental conditions 119 uniqid: the string that was used instead of UNIQID in the input file."""
120 - def __init__(self, filename=None):
121 """Reads a data file in the format corresponding to Michael Eisen's 122 Cluster/TreeView program, and stores the data in a DataFile object""" 123 self.data = None 124 self.mask = None 125 self.geneid = None 126 self.genename = None 127 self.gweight = None 128 self.gorder = None 129 self.expid = None 130 self.eweight = None 131 self.eorder = None 132 self.uniqid = None 133 if not filename: return 134 inputfile = open(filename) 135 lines = inputfile.readlines() 136 inputfile.close() 137 lines = [line.strip("\r\n").split("\t") for line in lines] 138 line = lines[0] 139 n = len(line) 140 self.uniqid = line[0] 141 self.expid = [] 142 cols = {0: "GENEID"} 143 for word in line[1:]: 144 if word=="NAME": 145 cols[line.index(word)] = word 146 self.genename = [] 147 elif word=="GWEIGHT": 148 cols[line.index(word)] = word 149 self.gweight = [] 150 elif word=="GORDER": 151 cols[line.index(word)] = word 152 self.gorder = [] 153 else: self.expid.append(word) 154 self.geneid = [] 155 self.data = [] 156 self.mask = [] 157 needmask = 0 158 for line in lines[1:]: 159 assert len(line)==n, "Line with %d columns found (expected %d)" % (len(line), n) 160 if line[0]=="EWEIGHT": 161 i = max(cols) + 1 162 self.eweight = map(float, line[i:]) 163 continue 164 if line[0]=="EORDER": 165 i = max(cols) + 1 166 self.eorder = map(float, line[i:]) 167 continue 168 rowdata = [] 169 rowmask = [] 170 n = len(line) 171 for i in range(n): 172 word = line[i] 173 if i in cols: 174 if cols[i]=="GENEID": self.geneid.append(word) 175 if cols[i]=="NAME": self.genename.append(word) 176 if cols[i]=="GWEIGHT": self.gweight.append(float(word)) 177 if cols[i]=="GORDER": self.gorder.append(float(word)) 178 continue 179 if not word: 180 rowdata.append(0.0) 181 rowmask.append(0) 182 needmask = 1 183 else: 184 rowdata.append(float(word)) 185 rowmask.append(1) 186 self.data.append(rowdata) 187 self.mask.append(rowmask) 188 self.data = array(self.data) 189 if needmask: self.mask = array(self.mask) 190 else: self.mask = None 191 if self.gweight: self.gweight = array(self.gweight) 192 if self.gorder: self.gorder = array(self.gorder)
193
194 - def treecluster(self, transpose=0, method='m', dist='e'):
195 if transpose==0: weight = self.eweight 196 else: weight = self.gweight 197 return treecluster(self.data, self.mask, weight, transpose, method, dist)
198
199 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', initialid=None):
200 if transpose==0: weight = self.eweight 201 else: weight = self.gweight 202 clusterid, error, nfound = kcluster(self.data, nclusters, self.mask, weight, transpose, npass, method, dist, initialid) 203 return clusterid, error, nfound
204
205 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist='e'):
206 if transpose==0: weight = self.eweight 207 else: weight = self.gweight 208 clusterid, celldata = somcluster(self.data, self.mask, weight, transpose, nxgrid, nygrid, inittau, niter, dist) 209 return clusterid, celldata
210
211 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
212 cdata, cmask = clustercentroids(self.data, self.mask, clusterid, method, transpose) 213 return cdata, cmask
214
215 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 216 transpose=0):
217 if transpose==0: weight = self.eweight 218 else: weight = self.gweight 219 return clusterdistance(self.data, self.mask, weight, index1, index2, method, dist, transpose)
220
221 - def distancematrix(self, transpose=0, dist='e'):
222 if transpose==0: weight = self.eweight 223 else: weight = self.gweight 224 return distancematrix(self.data, self.mask, weight, transpose, dist)
225
226 - def save(self, jobname, geneclusters=None, expclusters=None):
227 """save(jobname, geneclusters=None, expclusters=None) 228 saves the clustering results. The saved files follow the convention 229 for Java TreeView program, which can therefore be used to view the 230 clustering result. 231 Arguments: 232 jobname: The base name of the files to be saved. The filenames are 233 jobname.cdt, jobname.gtr, and jobname.atr for 234 hierarchical clustering, and jobname-K*.cdt, 235 jobname-K*.kgg, jobname-K*.kag for k-means clustering 236 results. 237 geneclusters=None: For hierarchical clustering results, 238 geneclusters is an (ngenes-1 x 2) array that describes 239 the hierarchical clustering result for genes. This array 240 can be calculated by the hierarchical clustering methods 241 implemented in treecluster. 242 For k-means clustering results, geneclusters is a vector 243 containing ngenes integers, describing to which cluster a 244 given gene belongs. This vector can be calculated by 245 kcluster. 246 expclusters=None: For hierarchical clustering results, expclusters 247 is an (nexps-1 x 2) array that describes the hierarchical 248 clustering result for experimental conditions. This array 249 can be calculated by the hierarchical clustering methods 250 implemented in treecluster. 251 For k-means clustering results, expclusters is a vector 252 containing nexps integers, describing to which cluster a 253 given experimental condition belongs. This vector can be 254 calculated by kcluster. 255 """ 256 (ngenes,nexps) = shape(self.data) 257 if self.gorder==None: gorder = arange(ngenes) 258 else: gorder = self.gorder 259 if self.eorder==None: eorder = arange(nexps) 260 else: eorder = self.eorder 261 if geneclusters and expclusters: 262 assert type(geneclusters)==type(expclusters), "found one k-means and one hierarchical clustering solution in geneclusters and expclusters" 263 gid = 0 264 aid = 0 265 filename = jobname 266 postfix = "" 267 if type(geneclusters)==Tree: 268 # Hierarchical clustering result 269 geneindex = _savetree(jobname, geneclusters, gorder, 0) 270 gid = 1 271 elif geneclusters: 272 # k-means clustering result 273 filename = jobname + "_K" 274 k = max(geneclusters+1) 275 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 276 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 277 postfix = "_G%d" % k 278 else: 279 geneindex = argsort(gorder) 280 if type(expclusters)==Tree: 281 # Hierarchical clustering result 282 expindex = _savetree(jobname, expclusters, eorder, 1) 283 aid = 1 284 elif expclusters: 285 # k-means clustering result 286 filename = jobname + "_K" 287 k = max(expclusters+1) 288 kagfilename = "%s_K_A%d.kag" % (jobname, k) 289 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 290 postfix += "_A%d" % k 291 else: 292 expindex = argsort(eorder) 293 filename = filename + postfix 294 self._savedata(filename,gid,aid,geneindex,expindex)
295
296 - def _savekmeans(self, filename, clusterids, order, transpose):
297 if transpose==0: 298 label = self.uniqid 299 names = self.geneid 300 else: 301 label = "ARRAY" 302 names = self.expid 303 outputfile = open(filename, "w"); 304 if not outputfile: raise "Error: Unable to open output file" 305 outputfile.write(label + "\tGROUP\n") 306 index = argsort(order) 307 n = len(names) 308 sortedindex = zeros(n) 309 counter = 0 310 cluster = 0 311 while counter < n: 312 for j in index: 313 if clusterids[j]==cluster: 314 outputfile.write("%s\t%s\n" % (names[j], cluster)) 315 sortedindex[counter] = j 316 counter+=1 317 cluster+=1 318 outputfile.close(); 319 return index
320
321 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
322 if self.genename==None: genename = self.geneid 323 else: genename = self.genename 324 (ngenes, nexps) = shape(self.data) 325 outputfile = open(jobname+'.cdt', 'w') 326 if not outputfile: return "Error: Unable to open output file" 327 if self.mask: mask = self.mask 328 else: mask = ones((ngenes,nexps)) 329 if self.gweight: gweight = self.gweight 330 else: gweight = ones(ngenes) 331 if self.eweight: eweight = self.eweight 332 else: eweight = ones(nexps) 333 if gid: outputfile.write ('GID\t') 334 outputfile.write(self.uniqid) 335 outputfile.write('\tNAME\tGWEIGHT') 336 # Now add headers for data columns 337 for j in expindex: outputfile.write('\t%s' % self.expid[j]) 338 outputfile.write('\n') 339 if aid: 340 outputfile.write("AID") 341 if gid: outputfile.write('\t') 342 outputfile.write("\t\t") 343 for j in expindex: outputfile.write ('\tARRY%dX' % j) 344 outputfile.write('\n') 345 outputfile.write('EWEIGHT') 346 if gid: outputfile.write('\t') 347 outputfile.write('\t\t') 348 for j in expindex: outputfile.write('\t%f' % eweight[j]) 349 outputfile.write('\n') 350 for i in geneindex: 351 if gid: outputfile.write('GENE%dX\t' % i) 352 outputfile.write("%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i])) 353 for j in expindex: 354 outputfile.write('\t') 355 if mask[i][j]: outputfile.write(str(self.data[i][j])) 356 outputfile.write('\n') 357 outputfile.close()
358