1 from Numeric import *
2 from cluster import *
3
4
5 -def _treesort(order, nodeorder, nodecounts, tree):
6 nNodes = len(tree)
7 nElements = nNodes + 1
8 neworder = zeros(nElements,'d')
9 clusterids = range(nElements)
10 for i in range(nNodes):
11 i1 = tree[i].left
12 i2 = tree[i].right
13 if i1 < 0:
14 order1 = nodeorder[-i1-1]
15 count1 = nodecounts[-i1-1]
16 else:
17 order1 = order[i1]
18 count1 = 1
19 if i2 < 0:
20 order2 = nodeorder[-i2-1]
21 count2 = nodecounts[-i2-1]
22 else:
23 order2 = order[i2]
24 count2 = 1
25
26 if i1 < i2:
27 if order1 < order2:
28 increase = count1
29 else:
30 increase = count2
31 for j in range(nElements):
32 clusterid = clusterids[j]
33 if clusterid==i1 and order1>=order2: neworder[j] += increase
34 if clusterid==i2 and order1<order2: neworder[j] += increase
35 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
36 else:
37 if order1<=order2:
38 increase = count1
39 else:
40 increase = count2
41 for j in range(nElements):
42 clusterid = clusterids[j]
43 if clusterid==i1 and order1>order2: neworder[j] += increase
44 if clusterid==i2 and order1<=order2: neworder[j] += increase
45 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
46 return argsort(neworder)
47
48 -def _savetree(jobname, tree, order, transpose):
49 if transpose==0:
50 extension = ".gtr"
51 keyword = "GENE"
52 else:
53 extension = ".atr"
54 keyword = "ARRY"
55 nnodes = len(tree)
56 outputfile = open(jobname+extension, "w");
57 nodeindex = 0
58 nodeID = [''] * (nnodes)
59 nodecounts = zeros(nnodes)
60 nodeorder = zeros(nnodes,'d')
61 nodedist = array([node.distance for node in tree])
62 for nodeindex in range(nnodes):
63 min1 = tree[nodeindex].left
64 min2 = tree[nodeindex].right
65 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1)
66 outputfile.write(nodeID[nodeindex])
67 outputfile.write("\t")
68 if min1 < 0:
69 index1 = -min1-1
70 order1 = nodeorder[index1]
71 counts1 = nodecounts[index1]
72 outputfile.write(nodeID[index1]+"\t")
73 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1])
74 else:
75 order1 = order[min1]
76 counts1 = 1
77 outputfile.write("%s%dX\t" % (keyword, min1))
78 if min2 < 0:
79 index2 = -min2-1
80 order2 = nodeorder[index2]
81 counts2 = nodecounts[index2]
82 outputfile.write(nodeID[index2]+"\t")
83 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2])
84 else:
85 order2 = order[min2];
86 counts2 = 1;
87 outputfile.write("%s%dX\t" % (keyword, min2))
88 outputfile.write(str(1.0-nodedist[nodeindex]))
89 outputfile.write("\n")
90 nodecounts[nodeindex] = counts1 + counts2
91 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / (counts1+counts2)
92 outputfile.close()
93
94 index = _treesort(order, nodeorder, nodecounts, tree)
95 return index
96
98 """DataFile reads a file containing gene expression data following
99 Michael Eisen's format for Cluster/TreeView. A DataFile object
100 has the following members:
101 data: a matrix containing the gene expression data
102 mask: a matrix containing only 1's and 0's, denoting which values
103 are present (1) or missing (0). If all elements of mask are
104 one (no missing data), then mask is set to None.
105 geneid: a list containing a unique identifier for each gene
106 (e.g., ORF name)
107 genename: a list containing an additional description for each gene
108 (e.g., gene name)
109 gweight: the weight to be used for each gene when calculating the
110 distance
111 gorder: an array of real numbers indicating the preferred order of the
112 genes in the output file
113 expid: a list containing a unique identifier for each experimental
114 condition
115 eweight: the weight to be used for each experimental condition when
116 calculating the distance
117 eorder: an array of real numbers indication the preferred order in the
118 output file of the experimental conditions
119 uniqid: the string that was used instead of UNIQID in the input file."""
121 """Reads a data file in the format corresponding to Michael Eisen's
122 Cluster/TreeView program, and stores the data in a DataFile object"""
123 self.data = None
124 self.mask = None
125 self.geneid = None
126 self.genename = None
127 self.gweight = None
128 self.gorder = None
129 self.expid = None
130 self.eweight = None
131 self.eorder = None
132 self.uniqid = None
133 if not filename: return
134 inputfile = open(filename)
135 lines = inputfile.readlines()
136 inputfile.close()
137 lines = [line.strip("\r\n").split("\t") for line in lines]
138 line = lines[0]
139 n = len(line)
140 self.uniqid = line[0]
141 self.expid = []
142 cols = {0: "GENEID"}
143 for word in line[1:]:
144 if word=="NAME":
145 cols[line.index(word)] = word
146 self.genename = []
147 elif word=="GWEIGHT":
148 cols[line.index(word)] = word
149 self.gweight = []
150 elif word=="GORDER":
151 cols[line.index(word)] = word
152 self.gorder = []
153 else: self.expid.append(word)
154 self.geneid = []
155 self.data = []
156 self.mask = []
157 needmask = 0
158 for line in lines[1:]:
159 assert len(line)==n, "Line with %d columns found (expected %d)" % (len(line), n)
160 if line[0]=="EWEIGHT":
161 i = max(cols) + 1
162 self.eweight = map(float, line[i:])
163 continue
164 if line[0]=="EORDER":
165 i = max(cols) + 1
166 self.eorder = map(float, line[i:])
167 continue
168 rowdata = []
169 rowmask = []
170 n = len(line)
171 for i in range(n):
172 word = line[i]
173 if i in cols:
174 if cols[i]=="GENEID": self.geneid.append(word)
175 if cols[i]=="NAME": self.genename.append(word)
176 if cols[i]=="GWEIGHT": self.gweight.append(float(word))
177 if cols[i]=="GORDER": self.gorder.append(float(word))
178 continue
179 if not word:
180 rowdata.append(0.0)
181 rowmask.append(0)
182 needmask = 1
183 else:
184 rowdata.append(float(word))
185 rowmask.append(1)
186 self.data.append(rowdata)
187 self.mask.append(rowmask)
188 self.data = array(self.data)
189 if needmask: self.mask = array(self.mask)
190 else: self.mask = None
191 if self.gweight: self.gweight = array(self.gweight)
192 if self.gorder: self.gorder = array(self.gorder)
193
194 - def treecluster(self, transpose=0, method='m', dist='e'):
195 if transpose==0: weight = self.eweight
196 else: weight = self.gweight
197 return treecluster(self.data, self.mask, weight, transpose, method, dist)
198
199 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', initialid=None):
200 if transpose==0: weight = self.eweight
201 else: weight = self.gweight
202 clusterid, error, nfound = kcluster(self.data, nclusters, self.mask, weight, transpose, npass, method, dist, initialid)
203 return clusterid, error, nfound
204
205 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist='e'):
206 if transpose==0: weight = self.eweight
207 else: weight = self.gweight
208 clusterid, celldata = somcluster(self.data, self.mask, weight, transpose, nxgrid, nygrid, inittau, niter, dist)
209 return clusterid, celldata
210
212 cdata, cmask = clustercentroids(self.data, self.mask, clusterid, method, transpose)
213 return cdata, cmask
214
215 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e',
216 transpose=0):
217 if transpose==0: weight = self.eweight
218 else: weight = self.gweight
219 return clusterdistance(self.data, self.mask, weight, index1, index2, method, dist, transpose)
220
222 if transpose==0: weight = self.eweight
223 else: weight = self.gweight
224 return distancematrix(self.data, self.mask, weight, transpose, dist)
225
226 - def save(self, jobname, geneclusters=None, expclusters=None):
227 """save(jobname, geneclusters=None, expclusters=None)
228 saves the clustering results. The saved files follow the convention
229 for Java TreeView program, which can therefore be used to view the
230 clustering result.
231 Arguments:
232 jobname: The base name of the files to be saved. The filenames are
233 jobname.cdt, jobname.gtr, and jobname.atr for
234 hierarchical clustering, and jobname-K*.cdt,
235 jobname-K*.kgg, jobname-K*.kag for k-means clustering
236 results.
237 geneclusters=None: For hierarchical clustering results,
238 geneclusters is an (ngenes-1 x 2) array that describes
239 the hierarchical clustering result for genes. This array
240 can be calculated by the hierarchical clustering methods
241 implemented in treecluster.
242 For k-means clustering results, geneclusters is a vector
243 containing ngenes integers, describing to which cluster a
244 given gene belongs. This vector can be calculated by
245 kcluster.
246 expclusters=None: For hierarchical clustering results, expclusters
247 is an (nexps-1 x 2) array that describes the hierarchical
248 clustering result for experimental conditions. This array
249 can be calculated by the hierarchical clustering methods
250 implemented in treecluster.
251 For k-means clustering results, expclusters is a vector
252 containing nexps integers, describing to which cluster a
253 given experimental condition belongs. This vector can be
254 calculated by kcluster.
255 """
256 (ngenes,nexps) = shape(self.data)
257 if self.gorder==None: gorder = arange(ngenes)
258 else: gorder = self.gorder
259 if self.eorder==None: eorder = arange(nexps)
260 else: eorder = self.eorder
261 if geneclusters and expclusters:
262 assert type(geneclusters)==type(expclusters), "found one k-means and one hierarchical clustering solution in geneclusters and expclusters"
263 gid = 0
264 aid = 0
265 filename = jobname
266 postfix = ""
267 if type(geneclusters)==Tree:
268
269 geneindex = _savetree(jobname, geneclusters, gorder, 0)
270 gid = 1
271 elif geneclusters:
272
273 filename = jobname + "_K"
274 k = max(geneclusters+1)
275 kggfilename = "%s_K_G%d.kgg" % (jobname, k)
276 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
277 postfix = "_G%d" % k
278 else:
279 geneindex = argsort(gorder)
280 if type(expclusters)==Tree:
281
282 expindex = _savetree(jobname, expclusters, eorder, 1)
283 aid = 1
284 elif expclusters:
285
286 filename = jobname + "_K"
287 k = max(expclusters+1)
288 kagfilename = "%s_K_A%d.kag" % (jobname, k)
289 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
290 postfix += "_A%d" % k
291 else:
292 expindex = argsort(eorder)
293 filename = filename + postfix
294 self._savedata(filename,gid,aid,geneindex,expindex)
295
296 - def _savekmeans(self, filename, clusterids, order, transpose):
297 if transpose==0:
298 label = self.uniqid
299 names = self.geneid
300 else:
301 label = "ARRAY"
302 names = self.expid
303 outputfile = open(filename, "w");
304 if not outputfile: raise "Error: Unable to open output file"
305 outputfile.write(label + "\tGROUP\n")
306 index = argsort(order)
307 n = len(names)
308 sortedindex = zeros(n)
309 counter = 0
310 cluster = 0
311 while counter < n:
312 for j in index:
313 if clusterids[j]==cluster:
314 outputfile.write("%s\t%s\n" % (names[j], cluster))
315 sortedindex[counter] = j
316 counter+=1
317 cluster+=1
318 outputfile.close();
319 return index
320
321 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
322 if self.genename==None: genename = self.geneid
323 else: genename = self.genename
324 (ngenes, nexps) = shape(self.data)
325 outputfile = open(jobname+'.cdt', 'w')
326 if not outputfile: return "Error: Unable to open output file"
327 if self.mask: mask = self.mask
328 else: mask = ones((ngenes,nexps))
329 if self.gweight: gweight = self.gweight
330 else: gweight = ones(ngenes)
331 if self.eweight: eweight = self.eweight
332 else: eweight = ones(nexps)
333 if gid: outputfile.write ('GID\t')
334 outputfile.write(self.uniqid)
335 outputfile.write('\tNAME\tGWEIGHT')
336
337 for j in expindex: outputfile.write('\t%s' % self.expid[j])
338 outputfile.write('\n')
339 if aid:
340 outputfile.write("AID")
341 if gid: outputfile.write('\t')
342 outputfile.write("\t\t")
343 for j in expindex: outputfile.write ('\tARRY%dX' % j)
344 outputfile.write('\n')
345 outputfile.write('EWEIGHT')
346 if gid: outputfile.write('\t')
347 outputfile.write('\t\t')
348 for j in expindex: outputfile.write('\t%f' % eweight[j])
349 outputfile.write('\n')
350 for i in geneindex:
351 if gid: outputfile.write('GENE%dX\t' % i)
352 outputfile.write("%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i]))
353 for j in expindex:
354 outputfile.write('\t')
355 if mask[i][j]: outputfile.write(str(self.data[i][j]))
356 outputfile.write('\n')
357 outputfile.close()
358