Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  import numpy 
  2   
  3  from Bio.Cluster.cluster import * 
  4   
  5   
6 -def _treesort(order, nodeorder, nodecounts, tree):
7 # Find the order of the nodes consistent with the hierarchical clustering 8 # tree, taking into account the preferred order of nodes. 9 nNodes = len(tree) 10 nElements = nNodes + 1 11 neworder = numpy.zeros(nElements) 12 clusterids = numpy.arange(nElements) 13 for i in range(nNodes): 14 i1 = tree[i].left 15 i2 = tree[i].right 16 if i1 < 0: 17 order1 = nodeorder[-i1-1] 18 count1 = nodecounts[-i1-1] 19 else: 20 order1 = order[i1] 21 count1 = 1 22 if i2 < 0: 23 order2 = nodeorder[-i2-1] 24 count2 = nodecounts[-i2-1] 25 else: 26 order2 = order[i2] 27 count2 = 1 28 # If order1 and order2 are equal, their order is determined 29 # by the order in which they were clustered 30 if i1 < i2: 31 if order1 < order2: 32 increase = count1 33 else: 34 increase = count2 35 for j in range(nElements): 36 clusterid = clusterids[j] 37 if clusterid == i1 and order1 >= order2: 38 neworder[j] += increase 39 if clusterid == i2 and order1 < order2: 40 neworder[j] += increase 41 if clusterid == i1 or clusterid == i2: 42 clusterids[j] = -i-1 43 else: 44 if order1 <= order2: 45 increase = count1 46 else: 47 increase = count2 48 for j in range(nElements): 49 clusterid = clusterids[j] 50 if clusterid == i1 and order1 > order2: 51 neworder[j] += increase 52 if clusterid == i2 and order1 <= order2: 53 neworder[j] += increase 54 if clusterid == i1 or clusterid == i2: 55 clusterids[j] = -i-1 56 return numpy.argsort(neworder)
57 58
59 -def _savetree(jobname, tree, order, transpose):
60 # Save the hierarchical clustering solution given by the tree, following 61 # the specified order, in a file whose name is based on jobname. 62 if transpose == 0: 63 extension = ".gtr" 64 keyword = "GENE" 65 else: 66 extension = ".atr" 67 keyword = "ARRY" 68 nnodes = len(tree) 69 outputfile = open(jobname+extension, "w") 70 nodeindex = 0 71 nodeID = [''] * nnodes 72 nodecounts = numpy.zeros(nnodes, int) 73 nodeorder = numpy.zeros(nnodes) 74 nodedist = numpy.array([node.distance for node in tree]) 75 for nodeindex in range(nnodes): 76 min1 = tree[nodeindex].left 77 min2 = tree[nodeindex].right 78 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1) 79 outputfile.write(nodeID[nodeindex]) 80 outputfile.write("\t") 81 if min1 < 0: 82 index1 = -min1-1 83 order1 = nodeorder[index1] 84 counts1 = nodecounts[index1] 85 outputfile.write(nodeID[index1]+"\t") 86 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 87 else: 88 order1 = order[min1] 89 counts1 = 1 90 outputfile.write("%s%dX\t" % (keyword, min1)) 91 if min2 < 0: 92 index2 = -min2-1 93 order2 = nodeorder[index2] 94 counts2 = nodecounts[index2] 95 outputfile.write(nodeID[index2]+"\t") 96 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 97 else: 98 order2 = order[min2] 99 counts2 = 1 100 outputfile.write("%s%dX\t" % (keyword, min2)) 101 outputfile.write(str(1.0-nodedist[nodeindex])) 102 outputfile.write("\n") 103 counts = counts1 + counts2 104 nodecounts[nodeindex] = counts 105 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / counts 106 outputfile.close() 107 # Now set up order based on the tree structure 108 index = _treesort(order, nodeorder, nodecounts, tree) 109 return index
110 111
112 -class Record(object):
113 """Store gene expression data. 114 115 A Record stores the gene expression data and related information contained 116 in a data file following the file format defined for Michael Eisen's 117 Cluster/TreeView program. A Record has the following members: 118 119 data: a matrix containing the gene expression data 120 mask: a matrix containing only 1's and 0's, denoting which values 121 are present (1) or missing (0). If all elements of mask are 122 one (no missing data), then mask is set to None. 123 geneid: a list containing a unique identifier for each gene 124 (e.g., ORF name) 125 genename: a list containing an additional description for each gene 126 (e.g., gene name) 127 gweight: the weight to be used for each gene when calculating the 128 distance 129 gorder: an array of real numbers indicating the preferred order of the 130 genes in the output file 131 expid: a list containing a unique identifier for each experimental 132 condition 133 eweight: the weight to be used for each experimental condition when 134 calculating the distance 135 eorder: an array of real numbers indication the preferred order in the 136 output file of the experimental conditions 137 uniqid: the string that was used instead of UNIQID in the input file. 138 139 """ 140
141 - def __init__(self, handle=None):
142 """Read gene expression data from the file handle and return a Record. 143 144 The file should be in the format defined for Michael Eisen's 145 Cluster/TreeView program. 146 147 """ 148 self.data = None 149 self.mask = None 150 self.geneid = None 151 self.genename = None 152 self.gweight = None 153 self.gorder = None 154 self.expid = None 155 self.eweight = None 156 self.eorder = None 157 self.uniqid = None 158 if not handle: 159 return 160 line = handle.readline().strip("\r\n").split("\t") 161 n = len(line) 162 self.uniqid = line[0] 163 self.expid = [] 164 cols = {0: "GENEID"} 165 for word in line[1:]: 166 if word == "NAME": 167 cols[line.index(word)] = word 168 self.genename = [] 169 elif word == "GWEIGHT": 170 cols[line.index(word)] = word 171 self.gweight = [] 172 elif word=="GORDER": 173 cols[line.index(word)] = word 174 self.gorder = [] 175 else: 176 self.expid.append(word) 177 self.geneid = [] 178 self.data = [] 179 self.mask = [] 180 needmask = 0 181 for line in handle: 182 line = line.strip("\r\n").split("\t") 183 if len(line) != n: 184 raise ValueError("Line with %d columns found (expected %d)" % 185 (len(line), n)) 186 if line[0] == "EWEIGHT": 187 i = max(cols) + 1 188 self.eweight = numpy.array(line[i:], float) 189 continue 190 if line[0] == "EORDER": 191 i = max(cols) + 1 192 self.eorder = numpy.array(line[i:], float) 193 continue 194 rowdata = [] 195 rowmask = [] 196 n = len(line) 197 for i in range(n): 198 word = line[i] 199 if i in cols: 200 if cols[i] == "GENEID": 201 self.geneid.append(word) 202 if cols[i] == "NAME": 203 self.genename.append(word) 204 if cols[i] == "GWEIGHT": 205 self.gweight.append(float(word)) 206 if cols[i] == "GORDER": 207 self.gorder.append(float(word)) 208 continue 209 if not word: 210 rowdata.append(0.0) 211 rowmask.append(0) 212 needmask = 1 213 else: 214 rowdata.append(float(word)) 215 rowmask.append(1) 216 self.data.append(rowdata) 217 self.mask.append(rowmask) 218 self.data = numpy.array(self.data) 219 if needmask: 220 self.mask = numpy.array(self.mask, int) 221 else: 222 self.mask = None 223 if self.gweight: 224 self.gweight = numpy.array(self.gweight) 225 if self.gorder: 226 self.gorder = numpy.array(self.gorder)
227
228 - def treecluster(self, transpose=0, method='m', dist='e'):
229 """Apply hierarchical clustering and return a Tree object. 230 231 The pairwise single, complete, centroid, and average linkage hierarchical 232 clustering methods are available. 233 234 transpose: if equal to 0, genes (rows) are clustered; 235 if equal to 1, microarrays (columns) are clustered. 236 dist : specifies the distance function to be used: 237 dist=='e': Euclidean distance 238 dist=='b': City Block distance 239 dist=='c': Pearson correlation 240 dist=='a': absolute value of the correlation 241 dist=='u': uncentered correlation 242 dist=='x': absolute uncentered correlation 243 dist=='s': Spearman's rank correlation 244 dist=='k': Kendall's tau 245 method : specifies which linkage method is used: 246 method=='s': Single pairwise linkage 247 method=='m': Complete (maximum) pairwise linkage (default) 248 method=='c': Centroid linkage 249 method=='a': Average pairwise linkage 250 251 See the description of the Tree class for more information about the Tree 252 object returned by this method. 253 254 """ 255 if transpose == 0: 256 weight = self.eweight 257 else: 258 weight = self.gweight 259 return treecluster(self.data, self.mask, weight, transpose, method, 260 dist)
261
262 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 263 initialid=None):
264 """Apply k-means or k-median clustering. 265 266 This method returns a tuple (clusterid, error, nfound). 267 268 nclusters: number of clusters (the 'k' in k-means) 269 transpose: if equal to 0, genes (rows) are clustered; 270 if equal to 1, microarrays (columns) are clustered. 271 npass : number of times the k-means clustering algorithm is 272 performed, each time with a different (random) initial 273 condition. 274 method : specifies how the center of a cluster is found: 275 method=='a': arithmetic mean 276 method=='m': median 277 dist : specifies the distance function to be used: 278 dist=='e': Euclidean distance 279 dist=='b': City Block distance 280 dist=='c': Pearson correlation 281 dist=='a': absolute value of the correlation 282 dist=='u': uncentered correlation 283 dist=='x': absolute uncentered correlation 284 dist=='s': Spearman's rank correlation 285 dist=='k': Kendall's tau 286 initialid: the initial clustering from which the algorithm should start. 287 If initialid is None, the routine carries out npass 288 repetitions of the EM algorithm, each time starting from a 289 different random initial clustering. If initialid is given, 290 the routine carries out the EM algorithm only once, starting 291 from the given initial clustering and without randomizing the 292 order in which items are assigned to clusters (i.e., using 293 the same order as in the data matrix). In that case, the 294 k-means algorithm is fully deterministic. 295 296 Return values: 297 clusterid: array containing the number of the cluster to which each 298 gene/microarray was assigned in the best k-means clustering 299 solution that was found in the npass runs; 300 error: the within-cluster sum of distances for the returned k-means 301 clustering solution; 302 nfound: the number of times this solution was found. 303 304 """ 305 306 if transpose == 0: 307 weight = self.eweight 308 else: 309 weight = self.gweight 310 return kcluster(self.data, nclusters, self.mask, weight, transpose, 311 npass, method, dist, initialid)
312
313 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 314 niter=1, dist='e'):
315 """Calculate a self-organizing map on a rectangular grid. 316 317 The somcluster method returns a tuple (clusterid, celldata). 318 319 transpose: if equal to 0, genes (rows) are clustered; 320 if equal to 1, microarrays (columns) are clustered. 321 nxgrid : the horizontal dimension of the rectangular SOM map 322 nygrid : the vertical dimension of the rectangular SOM map 323 inittau : the initial value of tau (the neighborbood function) 324 niter : the number of iterations 325 dist : specifies the distance function to be used: 326 dist=='e': Euclidean distance 327 dist=='b': City Block distance 328 dist=='c': Pearson correlation 329 dist=='a': absolute value of the correlation 330 dist=='u': uncentered correlation 331 dist=='x': absolute uncentered correlation 332 dist=='s': Spearman's rank correlation 333 dist=='k': Kendall's tau 334 335 Return values: 336 clusterid: array with two columns, while the number of rows is equal to 337 the number of genes or the number of microarrays depending on 338 whether genes or microarrays are being clustered. Each row in 339 the array contains the x and y coordinates of the cell in the 340 rectangular SOM grid to which the gene or microarray was 341 assigned. 342 celldata: an array with dimensions (nxgrid, nygrid, number of 343 microarrays) if genes are being clustered, or (nxgrid, 344 nygrid, number of genes) if microarrays are being clustered. 345 Each element [ix][iy] of this array is a 1D vector containing 346 the gene expression data for the centroid of the cluster in 347 the SOM grid cell with coordinates (ix, iy). 348 349 """ 350 351 if transpose == 0: 352 weight = self.eweight 353 else: 354 weight = self.gweight 355 return somcluster(self.data, self.mask, weight, transpose, 356 nxgrid, nygrid, inittau, niter, dist)
357
358 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
359 """Calculate the cluster centroids and return a tuple (cdata, cmask). 360 361 The centroid is defined as either the mean or the median over all elements 362 for each dimension. 363 364 data : nrows x ncolumns array containing the expression data 365 mask : nrows x ncolumns array of integers, showing which data are 366 missing. If mask[i][j]==0, then data[i][j] is missing. 367 transpose: if equal to 0, gene (row) clusters are considered; 368 if equal to 1, microarray (column) clusters are considered. 369 clusterid: array containing the cluster number for each gene or 370 microarray. The cluster number should be non-negative. 371 method : specifies how the centroid is calculated: 372 method=='a': arithmetic mean over each dimension. (default) 373 method=='m': median over each dimension. 374 375 Return values: 376 cdata : 2D array containing the cluster centroids. If transpose==0, 377 then the dimensions of cdata are nclusters x ncolumns. If 378 transpose==1, then the dimensions of cdata are 379 nrows x nclusters. 380 cmask : 2D array of integers describing which elements in cdata, 381 if any, are missing. 382 383 """ 384 return clustercentroids(self.data, self.mask, clusterid, method, 385 transpose)
386
387 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 388 transpose=0):
389 """Calculate the distance between two clusters. 390 391 index1 : 1D array identifying which genes/microarrays belong to the 392 first cluster. If the cluster contains only one gene, then 393 index1 can also be written as a single integer. 394 index2 : 1D array identifying which genes/microarrays belong to the 395 second cluster. If the cluster contains only one gene, then 396 index2 can also be written as a single integer. 397 transpose: if equal to 0, genes (rows) are clustered; 398 if equal to 1, microarrays (columns) are clustered. 399 dist : specifies the distance function to be used: 400 dist=='e': Euclidean distance 401 dist=='b': City Block distance 402 dist=='c': Pearson correlation 403 dist=='a': absolute value of the correlation 404 dist=='u': uncentered correlation 405 dist=='x': absolute uncentered correlation 406 dist=='s': Spearman's rank correlation 407 dist=='k': Kendall's tau 408 method : specifies how the distance between two clusters is defined: 409 method=='a': the distance between the arithmetic means of the 410 two clusters 411 method=='m': the distance between the medians of the two 412 clusters 413 method=='s': the smallest pairwise distance between members 414 of the two clusters 415 method=='x': the largest pairwise distance between members of 416 the two clusters 417 method=='v': average of the pairwise distances between 418 members of the clusters 419 transpose: if equal to 0: clusters of genes (rows) are considered; 420 if equal to 1: clusters of microarrays (columns) are 421 considered. 422 423 """ 424 425 if transpose == 0: 426 weight = self.eweight 427 else: 428 weight = self.gweight 429 return clusterdistance(self.data, self.mask, weight, 430 index1, index2, method, dist, transpose)
431
432 - def distancematrix(self, transpose=0, dist='e'):
433 """Calculate the distance matrix and return it as a list of arrays 434 435 transpose: if equal to 0: calculate the distances between genes (rows); 436 if equal to 1: calculate the distances beteeen microarrays 437 (columns). 438 dist : specifies the distance function to be used: 439 dist=='e': Euclidean distance 440 dist=='b': City Block distance 441 dist=='c': Pearson correlation 442 dist=='a': absolute value of the correlation 443 dist=='u': uncentered correlation 444 dist=='x': absolute uncentered correlation 445 dist=='s': Spearman's rank correlation 446 dist=='k': Kendall's tau 447 448 Return value: 449 The distance matrix is returned as a list of 1D arrays containing the 450 distance matrix between the gene expression data. The number of columns 451 in each row is equal to the row number. Hence, the first row has zero 452 elements. An example of the return value is 453 matrix = [[], 454 array([1.]), 455 array([7., 3.]), 456 array([4., 2., 6.])] 457 This corresponds to the distance matrix 458 [0., 1., 7., 4.] 459 [1., 0., 3., 2.] 460 [7., 3., 0., 6.] 461 [4., 2., 6., 0.] 462 463 """ 464 if transpose == 0: 465 weight = self.eweight 466 else: 467 weight = self.gweight 468 return distancematrix(self.data, self.mask, weight, transpose, dist)
469
470 - def save(self, jobname, geneclusters=None, expclusters=None):
471 """Save the clustering results. 472 473 The saved files follow the convention for the Java TreeView program, 474 which can therefore be used to view the clustering result. 475 476 Arguments: 477 jobname: The base name of the files to be saved. The filenames are 478 jobname.cdt, jobname.gtr, and jobname.atr for 479 hierarchical clustering, and jobname-K*.cdt, 480 jobname-K*.kgg, jobname-K*.kag for k-means clustering 481 results. 482 geneclusters=None: For hierarchical clustering results, geneclusters 483 is a Tree object as returned by the treecluster method. 484 For k-means clustering results, geneclusters is a vector 485 containing ngenes integers, describing to which cluster a 486 given gene belongs. This vector can be calculated by 487 kcluster. 488 expclusters=None: For hierarchical clustering results, expclusters 489 is a Tree object as returned by the treecluster method. 490 For k-means clustering results, expclusters is a vector 491 containing nexps integers, describing to which cluster a 492 given experimental condition belongs. This vector can be 493 calculated by kcluster. 494 495 """ 496 (ngenes, nexps) = numpy.shape(self.data) 497 if self.gorder is None: 498 gorder = numpy.arange(ngenes) 499 else: 500 gorder = self.gorder 501 if self.eorder is None: 502 eorder = numpy.arange(nexps) 503 else: 504 eorder = self.eorder 505 if geneclusters is not None and expclusters is not None and \ 506 type(geneclusters) != type(expclusters): 507 raise ValueError("found one k-means and one hierarchical " 508 + "clustering solution in geneclusters and " 509 + "expclusters") 510 gid = 0 511 aid = 0 512 filename = jobname 513 postfix = "" 514 if type(geneclusters) == Tree: 515 # This is a hierarchical clustering result. 516 geneindex = _savetree(jobname, geneclusters, gorder, 0) 517 gid = 1 518 elif geneclusters is not None: 519 # This is a k-means clustering result. 520 filename = jobname + "_K" 521 k = max(geneclusters) + 1 522 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 523 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 524 postfix = "_G%d" % k 525 else: 526 geneindex = numpy.argsort(gorder) 527 if type(expclusters) == Tree: 528 # This is a hierarchical clustering result. 529 expindex = _savetree(jobname, expclusters, eorder, 1) 530 aid = 1 531 elif expclusters is not None: 532 # This is a k-means clustering result. 533 filename = jobname + "_K" 534 k = max(expclusters) + 1 535 kagfilename = "%s_K_A%d.kag" % (jobname, k) 536 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 537 postfix += "_A%d" % k 538 else: 539 expindex = numpy.argsort(eorder) 540 filename = filename + postfix 541 self._savedata(filename, gid, aid, geneindex, expindex)
542
543 - def _savekmeans(self, filename, clusterids, order, transpose):
544 # Save a k-means clustering solution 545 if transpose == 0: 546 label = self.uniqid 547 names = self.geneid 548 else: 549 label = "ARRAY" 550 names = self.expid 551 try: 552 outputfile = open(filename, "w") 553 except IOError: 554 raise IOError("Unable to open output file") 555 outputfile.write(label + "\tGROUP\n") 556 index = numpy.argsort(order) 557 n = len(names) 558 sortedindex = numpy.zeros(n, int) 559 counter = 0 560 cluster = 0 561 while counter < n: 562 for j in index: 563 if clusterids[j] == cluster: 564 outputfile.write("%s\t%s\n" % (names[j], cluster)) 565 sortedindex[counter] = j 566 counter += 1 567 cluster += 1 568 outputfile.close() 569 return sortedindex
570
571 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
572 # Save the clustered data. 573 if self.genename is None: 574 genename = self.geneid 575 else: 576 genename = self.genename 577 (ngenes, nexps) = numpy.shape(self.data) 578 try: 579 outputfile = open(jobname+'.cdt', 'w') 580 except IOError: 581 raise IOError("Unable to open output file") 582 if self.mask is not None: 583 mask = self.mask 584 else: 585 mask = numpy.ones((ngenes, nexps), int) 586 if self.gweight is not None: 587 gweight = self.gweight 588 else: 589 gweight = numpy.ones(ngenes) 590 if self.eweight is not None: 591 eweight = self.eweight 592 else: 593 eweight = numpy.ones(nexps) 594 if gid: 595 outputfile.write('GID\t') 596 outputfile.write(self.uniqid) 597 outputfile.write('\tNAME\tGWEIGHT') 598 # Now add headers for data columns. 599 for j in expindex: 600 outputfile.write('\t%s' % self.expid[j]) 601 outputfile.write('\n') 602 if aid: 603 outputfile.write("AID") 604 if gid: 605 outputfile.write('\t') 606 outputfile.write("\t\t") 607 for j in expindex: 608 outputfile.write('\tARRY%dX' % j) 609 outputfile.write('\n') 610 outputfile.write('EWEIGHT') 611 if gid: 612 outputfile.write('\t') 613 outputfile.write('\t\t') 614 for j in expindex: 615 outputfile.write('\t%f' % eweight[j]) 616 outputfile.write('\n') 617 for i in geneindex: 618 if gid: 619 outputfile.write('GENE%dX\t' % i) 620 outputfile.write("%s\t%s\t%f" % 621 (self.geneid[i], genename[i], gweight[i])) 622 for j in expindex: 623 outputfile.write('\t') 624 if mask[i, j]: 625 outputfile.write(str(self.data[i, j])) 626 outputfile.write('\n') 627 outputfile.close()
628 629
630 -def read(handle):
631 """Read gene expression data from the file handle and return a Record. 632 633 The file should be in the file format defined for Michael Eisen's 634 Cluster/TreeView program. 635 636 """ 637 return Record(handle)
638