1 import numpy
2
3 from Bio.Cluster.cluster import *
4
5
6 -def _treesort(order, nodeorder, nodecounts, tree):
7
8
9 nNodes = len(tree)
10 nElements = nNodes + 1
11 neworder = numpy.zeros(nElements)
12 clusterids = numpy.arange(nElements)
13 for i in range(nNodes):
14 i1 = tree[i].left
15 i2 = tree[i].right
16 if i1 < 0:
17 order1 = nodeorder[-i1-1]
18 count1 = nodecounts[-i1-1]
19 else:
20 order1 = order[i1]
21 count1 = 1
22 if i2 < 0:
23 order2 = nodeorder[-i2-1]
24 count2 = nodecounts[-i2-1]
25 else:
26 order2 = order[i2]
27 count2 = 1
28
29
30 if i1 < i2:
31 if order1 < order2:
32 increase = count1
33 else:
34 increase = count2
35 for j in range(nElements):
36 clusterid = clusterids[j]
37 if clusterid == i1 and order1 >= order2:
38 neworder[j] += increase
39 if clusterid == i2 and order1 < order2:
40 neworder[j] += increase
41 if clusterid == i1 or clusterid == i2:
42 clusterids[j] = -i-1
43 else:
44 if order1 <= order2:
45 increase = count1
46 else:
47 increase = count2
48 for j in range(nElements):
49 clusterid = clusterids[j]
50 if clusterid == i1 and order1 > order2:
51 neworder[j] += increase
52 if clusterid == i2 and order1 <= order2:
53 neworder[j] += increase
54 if clusterid == i1 or clusterid == i2:
55 clusterids[j] = -i-1
56 return numpy.argsort(neworder)
57
58
59 -def _savetree(jobname, tree, order, transpose):
60
61
62 if transpose == 0:
63 extension = ".gtr"
64 keyword = "GENE"
65 else:
66 extension = ".atr"
67 keyword = "ARRY"
68 nnodes = len(tree)
69 outputfile = open(jobname+extension, "w")
70 nodeindex = 0
71 nodeID = [''] * nnodes
72 nodecounts = numpy.zeros(nnodes, int)
73 nodeorder = numpy.zeros(nnodes)
74 nodedist = numpy.array([node.distance for node in tree])
75 for nodeindex in range(nnodes):
76 min1 = tree[nodeindex].left
77 min2 = tree[nodeindex].right
78 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1)
79 outputfile.write(nodeID[nodeindex])
80 outputfile.write("\t")
81 if min1 < 0:
82 index1 = -min1-1
83 order1 = nodeorder[index1]
84 counts1 = nodecounts[index1]
85 outputfile.write(nodeID[index1]+"\t")
86 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1])
87 else:
88 order1 = order[min1]
89 counts1 = 1
90 outputfile.write("%s%dX\t" % (keyword, min1))
91 if min2 < 0:
92 index2 = -min2-1
93 order2 = nodeorder[index2]
94 counts2 = nodecounts[index2]
95 outputfile.write(nodeID[index2]+"\t")
96 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2])
97 else:
98 order2 = order[min2]
99 counts2 = 1
100 outputfile.write("%s%dX\t" % (keyword, min2))
101 outputfile.write(str(1.0-nodedist[nodeindex]))
102 outputfile.write("\n")
103 counts = counts1 + counts2
104 nodecounts[nodeindex] = counts
105 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / counts
106 outputfile.close()
107
108 index = _treesort(order, nodeorder, nodecounts, tree)
109 return index
110
111
113 """Store gene expression data.
114
115 A Record stores the gene expression data and related information contained
116 in a data file following the file format defined for Michael Eisen's
117 Cluster/TreeView program. A Record has the following members:
118
119 data: a matrix containing the gene expression data
120 mask: a matrix containing only 1's and 0's, denoting which values
121 are present (1) or missing (0). If all elements of mask are
122 one (no missing data), then mask is set to None.
123 geneid: a list containing a unique identifier for each gene
124 (e.g., ORF name)
125 genename: a list containing an additional description for each gene
126 (e.g., gene name)
127 gweight: the weight to be used for each gene when calculating the
128 distance
129 gorder: an array of real numbers indicating the preferred order of the
130 genes in the output file
131 expid: a list containing a unique identifier for each experimental
132 condition
133 eweight: the weight to be used for each experimental condition when
134 calculating the distance
135 eorder: an array of real numbers indication the preferred order in the
136 output file of the experimental conditions
137 uniqid: the string that was used instead of UNIQID in the input file.
138
139 """
140
142 """Read gene expression data from the file handle and return a Record.
143
144 The file should be in the format defined for Michael Eisen's
145 Cluster/TreeView program.
146
147 """
148 self.data = None
149 self.mask = None
150 self.geneid = None
151 self.genename = None
152 self.gweight = None
153 self.gorder = None
154 self.expid = None
155 self.eweight = None
156 self.eorder = None
157 self.uniqid = None
158 if not handle:
159 return
160 line = handle.readline().strip("\r\n").split("\t")
161 n = len(line)
162 self.uniqid = line[0]
163 self.expid = []
164 cols = {0: "GENEID"}
165 for word in line[1:]:
166 if word == "NAME":
167 cols[line.index(word)] = word
168 self.genename = []
169 elif word == "GWEIGHT":
170 cols[line.index(word)] = word
171 self.gweight = []
172 elif word=="GORDER":
173 cols[line.index(word)] = word
174 self.gorder = []
175 else:
176 self.expid.append(word)
177 self.geneid = []
178 self.data = []
179 self.mask = []
180 needmask = 0
181 for line in handle:
182 line = line.strip("\r\n").split("\t")
183 if len(line) != n:
184 raise ValueError("Line with %d columns found (expected %d)" %
185 (len(line), n))
186 if line[0] == "EWEIGHT":
187 i = max(cols) + 1
188 self.eweight = numpy.array(line[i:], float)
189 continue
190 if line[0] == "EORDER":
191 i = max(cols) + 1
192 self.eorder = numpy.array(line[i:], float)
193 continue
194 rowdata = []
195 rowmask = []
196 n = len(line)
197 for i in range(n):
198 word = line[i]
199 if i in cols:
200 if cols[i] == "GENEID":
201 self.geneid.append(word)
202 if cols[i] == "NAME":
203 self.genename.append(word)
204 if cols[i] == "GWEIGHT":
205 self.gweight.append(float(word))
206 if cols[i] == "GORDER":
207 self.gorder.append(float(word))
208 continue
209 if not word:
210 rowdata.append(0.0)
211 rowmask.append(0)
212 needmask = 1
213 else:
214 rowdata.append(float(word))
215 rowmask.append(1)
216 self.data.append(rowdata)
217 self.mask.append(rowmask)
218 self.data = numpy.array(self.data)
219 if needmask:
220 self.mask = numpy.array(self.mask, int)
221 else:
222 self.mask = None
223 if self.gweight:
224 self.gweight = numpy.array(self.gweight)
225 if self.gorder:
226 self.gorder = numpy.array(self.gorder)
227
228 - def treecluster(self, transpose=0, method='m', dist='e'):
229 """Apply hierarchical clustering and return a Tree object.
230
231 The pairwise single, complete, centroid, and average linkage hierarchical
232 clustering methods are available.
233
234 transpose: if equal to 0, genes (rows) are clustered;
235 if equal to 1, microarrays (columns) are clustered.
236 dist : specifies the distance function to be used:
237 dist=='e': Euclidean distance
238 dist=='b': City Block distance
239 dist=='c': Pearson correlation
240 dist=='a': absolute value of the correlation
241 dist=='u': uncentered correlation
242 dist=='x': absolute uncentered correlation
243 dist=='s': Spearman's rank correlation
244 dist=='k': Kendall's tau
245 method : specifies which linkage method is used:
246 method=='s': Single pairwise linkage
247 method=='m': Complete (maximum) pairwise linkage (default)
248 method=='c': Centroid linkage
249 method=='a': Average pairwise linkage
250
251 See the description of the Tree class for more information about the Tree
252 object returned by this method.
253
254 """
255 if transpose == 0:
256 weight = self.eweight
257 else:
258 weight = self.gweight
259 return treecluster(self.data, self.mask, weight, transpose, method,
260 dist)
261
262 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e',
263 initialid=None):
264 """Apply k-means or k-median clustering.
265
266 This method returns a tuple (clusterid, error, nfound).
267
268 nclusters: number of clusters (the 'k' in k-means)
269 transpose: if equal to 0, genes (rows) are clustered;
270 if equal to 1, microarrays (columns) are clustered.
271 npass : number of times the k-means clustering algorithm is
272 performed, each time with a different (random) initial
273 condition.
274 method : specifies how the center of a cluster is found:
275 method=='a': arithmetic mean
276 method=='m': median
277 dist : specifies the distance function to be used:
278 dist=='e': Euclidean distance
279 dist=='b': City Block distance
280 dist=='c': Pearson correlation
281 dist=='a': absolute value of the correlation
282 dist=='u': uncentered correlation
283 dist=='x': absolute uncentered correlation
284 dist=='s': Spearman's rank correlation
285 dist=='k': Kendall's tau
286 initialid: the initial clustering from which the algorithm should start.
287 If initialid is None, the routine carries out npass
288 repetitions of the EM algorithm, each time starting from a
289 different random initial clustering. If initialid is given,
290 the routine carries out the EM algorithm only once, starting
291 from the given initial clustering and without randomizing the
292 order in which items are assigned to clusters (i.e., using
293 the same order as in the data matrix). In that case, the
294 k-means algorithm is fully deterministic.
295
296 Return values:
297 clusterid: array containing the number of the cluster to which each
298 gene/microarray was assigned in the best k-means clustering
299 solution that was found in the npass runs;
300 error: the within-cluster sum of distances for the returned k-means
301 clustering solution;
302 nfound: the number of times this solution was found.
303
304 """
305
306 if transpose == 0:
307 weight = self.eweight
308 else:
309 weight = self.gweight
310 return kcluster(self.data, nclusters, self.mask, weight, transpose,
311 npass, method, dist, initialid)
312
313 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02,
314 niter=1, dist='e'):
315 """Calculate a self-organizing map on a rectangular grid.
316
317 The somcluster method returns a tuple (clusterid, celldata).
318
319 transpose: if equal to 0, genes (rows) are clustered;
320 if equal to 1, microarrays (columns) are clustered.
321 nxgrid : the horizontal dimension of the rectangular SOM map
322 nygrid : the vertical dimension of the rectangular SOM map
323 inittau : the initial value of tau (the neighborbood function)
324 niter : the number of iterations
325 dist : specifies the distance function to be used:
326 dist=='e': Euclidean distance
327 dist=='b': City Block distance
328 dist=='c': Pearson correlation
329 dist=='a': absolute value of the correlation
330 dist=='u': uncentered correlation
331 dist=='x': absolute uncentered correlation
332 dist=='s': Spearman's rank correlation
333 dist=='k': Kendall's tau
334
335 Return values:
336 clusterid: array with two columns, while the number of rows is equal to
337 the number of genes or the number of microarrays depending on
338 whether genes or microarrays are being clustered. Each row in
339 the array contains the x and y coordinates of the cell in the
340 rectangular SOM grid to which the gene or microarray was
341 assigned.
342 celldata: an array with dimensions (nxgrid, nygrid, number of
343 microarrays) if genes are being clustered, or (nxgrid,
344 nygrid, number of genes) if microarrays are being clustered.
345 Each element [ix][iy] of this array is a 1D vector containing
346 the gene expression data for the centroid of the cluster in
347 the SOM grid cell with coordinates (ix, iy).
348
349 """
350
351 if transpose == 0:
352 weight = self.eweight
353 else:
354 weight = self.gweight
355 return somcluster(self.data, self.mask, weight, transpose,
356 nxgrid, nygrid, inittau, niter, dist)
357
359 """Calculate the cluster centroids and return a tuple (cdata, cmask).
360
361 The centroid is defined as either the mean or the median over all elements
362 for each dimension.
363
364 data : nrows x ncolumns array containing the expression data
365 mask : nrows x ncolumns array of integers, showing which data are
366 missing. If mask[i][j]==0, then data[i][j] is missing.
367 transpose: if equal to 0, gene (row) clusters are considered;
368 if equal to 1, microarray (column) clusters are considered.
369 clusterid: array containing the cluster number for each gene or
370 microarray. The cluster number should be non-negative.
371 method : specifies how the centroid is calculated:
372 method=='a': arithmetic mean over each dimension. (default)
373 method=='m': median over each dimension.
374
375 Return values:
376 cdata : 2D array containing the cluster centroids. If transpose==0,
377 then the dimensions of cdata are nclusters x ncolumns. If
378 transpose==1, then the dimensions of cdata are
379 nrows x nclusters.
380 cmask : 2D array of integers describing which elements in cdata,
381 if any, are missing.
382
383 """
384 return clustercentroids(self.data, self.mask, clusterid, method,
385 transpose)
386
387 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e',
388 transpose=0):
389 """Calculate the distance between two clusters.
390
391 index1 : 1D array identifying which genes/microarrays belong to the
392 first cluster. If the cluster contains only one gene, then
393 index1 can also be written as a single integer.
394 index2 : 1D array identifying which genes/microarrays belong to the
395 second cluster. If the cluster contains only one gene, then
396 index2 can also be written as a single integer.
397 transpose: if equal to 0, genes (rows) are clustered;
398 if equal to 1, microarrays (columns) are clustered.
399 dist : specifies the distance function to be used:
400 dist=='e': Euclidean distance
401 dist=='b': City Block distance
402 dist=='c': Pearson correlation
403 dist=='a': absolute value of the correlation
404 dist=='u': uncentered correlation
405 dist=='x': absolute uncentered correlation
406 dist=='s': Spearman's rank correlation
407 dist=='k': Kendall's tau
408 method : specifies how the distance between two clusters is defined:
409 method=='a': the distance between the arithmetic means of the
410 two clusters
411 method=='m': the distance between the medians of the two
412 clusters
413 method=='s': the smallest pairwise distance between members
414 of the two clusters
415 method=='x': the largest pairwise distance between members of
416 the two clusters
417 method=='v': average of the pairwise distances between
418 members of the clusters
419 transpose: if equal to 0: clusters of genes (rows) are considered;
420 if equal to 1: clusters of microarrays (columns) are
421 considered.
422
423 """
424
425 if transpose == 0:
426 weight = self.eweight
427 else:
428 weight = self.gweight
429 return clusterdistance(self.data, self.mask, weight,
430 index1, index2, method, dist, transpose)
431
433 """Calculate the distance matrix and return it as a list of arrays
434
435 transpose: if equal to 0: calculate the distances between genes (rows);
436 if equal to 1: calculate the distances beteeen microarrays
437 (columns).
438 dist : specifies the distance function to be used:
439 dist=='e': Euclidean distance
440 dist=='b': City Block distance
441 dist=='c': Pearson correlation
442 dist=='a': absolute value of the correlation
443 dist=='u': uncentered correlation
444 dist=='x': absolute uncentered correlation
445 dist=='s': Spearman's rank correlation
446 dist=='k': Kendall's tau
447
448 Return value:
449 The distance matrix is returned as a list of 1D arrays containing the
450 distance matrix between the gene expression data. The number of columns
451 in each row is equal to the row number. Hence, the first row has zero
452 elements. An example of the return value is
453 matrix = [[],
454 array([1.]),
455 array([7., 3.]),
456 array([4., 2., 6.])]
457 This corresponds to the distance matrix
458 [0., 1., 7., 4.]
459 [1., 0., 3., 2.]
460 [7., 3., 0., 6.]
461 [4., 2., 6., 0.]
462
463 """
464 if transpose == 0:
465 weight = self.eweight
466 else:
467 weight = self.gweight
468 return distancematrix(self.data, self.mask, weight, transpose, dist)
469
470 - def save(self, jobname, geneclusters=None, expclusters=None):
471 """Save the clustering results.
472
473 The saved files follow the convention for the Java TreeView program,
474 which can therefore be used to view the clustering result.
475
476 Arguments:
477 jobname: The base name of the files to be saved. The filenames are
478 jobname.cdt, jobname.gtr, and jobname.atr for
479 hierarchical clustering, and jobname-K*.cdt,
480 jobname-K*.kgg, jobname-K*.kag for k-means clustering
481 results.
482 geneclusters=None: For hierarchical clustering results, geneclusters
483 is a Tree object as returned by the treecluster method.
484 For k-means clustering results, geneclusters is a vector
485 containing ngenes integers, describing to which cluster a
486 given gene belongs. This vector can be calculated by
487 kcluster.
488 expclusters=None: For hierarchical clustering results, expclusters
489 is a Tree object as returned by the treecluster method.
490 For k-means clustering results, expclusters is a vector
491 containing nexps integers, describing to which cluster a
492 given experimental condition belongs. This vector can be
493 calculated by kcluster.
494
495 """
496 (ngenes, nexps) = numpy.shape(self.data)
497 if self.gorder is None:
498 gorder = numpy.arange(ngenes)
499 else:
500 gorder = self.gorder
501 if self.eorder is None:
502 eorder = numpy.arange(nexps)
503 else:
504 eorder = self.eorder
505 if geneclusters is not None and expclusters is not None and \
506 type(geneclusters) != type(expclusters):
507 raise ValueError("found one k-means and one hierarchical "
508 + "clustering solution in geneclusters and "
509 + "expclusters")
510 gid = 0
511 aid = 0
512 filename = jobname
513 postfix = ""
514 if type(geneclusters) == Tree:
515
516 geneindex = _savetree(jobname, geneclusters, gorder, 0)
517 gid = 1
518 elif geneclusters is not None:
519
520 filename = jobname + "_K"
521 k = max(geneclusters) + 1
522 kggfilename = "%s_K_G%d.kgg" % (jobname, k)
523 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
524 postfix = "_G%d" % k
525 else:
526 geneindex = numpy.argsort(gorder)
527 if type(expclusters) == Tree:
528
529 expindex = _savetree(jobname, expclusters, eorder, 1)
530 aid = 1
531 elif expclusters is not None:
532
533 filename = jobname + "_K"
534 k = max(expclusters) + 1
535 kagfilename = "%s_K_A%d.kag" % (jobname, k)
536 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
537 postfix += "_A%d" % k
538 else:
539 expindex = numpy.argsort(eorder)
540 filename = filename + postfix
541 self._savedata(filename, gid, aid, geneindex, expindex)
542
543 - def _savekmeans(self, filename, clusterids, order, transpose):
544
545 if transpose == 0:
546 label = self.uniqid
547 names = self.geneid
548 else:
549 label = "ARRAY"
550 names = self.expid
551 try:
552 outputfile = open(filename, "w")
553 except IOError:
554 raise IOError("Unable to open output file")
555 outputfile.write(label + "\tGROUP\n")
556 index = numpy.argsort(order)
557 n = len(names)
558 sortedindex = numpy.zeros(n, int)
559 counter = 0
560 cluster = 0
561 while counter < n:
562 for j in index:
563 if clusterids[j] == cluster:
564 outputfile.write("%s\t%s\n" % (names[j], cluster))
565 sortedindex[counter] = j
566 counter += 1
567 cluster += 1
568 outputfile.close()
569 return sortedindex
570
571 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
572
573 if self.genename is None:
574 genename = self.geneid
575 else:
576 genename = self.genename
577 (ngenes, nexps) = numpy.shape(self.data)
578 try:
579 outputfile = open(jobname+'.cdt', 'w')
580 except IOError:
581 raise IOError("Unable to open output file")
582 if self.mask is not None:
583 mask = self.mask
584 else:
585 mask = numpy.ones((ngenes, nexps), int)
586 if self.gweight is not None:
587 gweight = self.gweight
588 else:
589 gweight = numpy.ones(ngenes)
590 if self.eweight is not None:
591 eweight = self.eweight
592 else:
593 eweight = numpy.ones(nexps)
594 if gid:
595 outputfile.write('GID\t')
596 outputfile.write(self.uniqid)
597 outputfile.write('\tNAME\tGWEIGHT')
598
599 for j in expindex:
600 outputfile.write('\t%s' % self.expid[j])
601 outputfile.write('\n')
602 if aid:
603 outputfile.write("AID")
604 if gid:
605 outputfile.write('\t')
606 outputfile.write("\t\t")
607 for j in expindex:
608 outputfile.write('\tARRY%dX' % j)
609 outputfile.write('\n')
610 outputfile.write('EWEIGHT')
611 if gid:
612 outputfile.write('\t')
613 outputfile.write('\t\t')
614 for j in expindex:
615 outputfile.write('\t%f' % eweight[j])
616 outputfile.write('\n')
617 for i in geneindex:
618 if gid:
619 outputfile.write('GENE%dX\t' % i)
620 outputfile.write("%s\t%s\t%f" %
621 (self.geneid[i], genename[i], gweight[i]))
622 for j in expindex:
623 outputfile.write('\t')
624 if mask[i, j]:
625 outputfile.write(str(self.data[i, j]))
626 outputfile.write('\n')
627 outputfile.close()
628
629
631 """Read gene expression data from the file handle and return a Record.
632
633 The file should be in the file format defined for Michael Eisen's
634 Cluster/TreeView program.
635
636 """
637 return Record(handle)
638