Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  import numpy 
  7   
  8  from Bio.Cluster.cluster import * 
  9   
 10   
11 -def _treesort(order, nodeorder, nodecounts, tree):
12 # Find the order of the nodes consistent with the hierarchical clustering 13 # tree, taking into account the preferred order of nodes. 14 nNodes = len(tree) 15 nElements = nNodes + 1 16 neworder = numpy.zeros(nElements) 17 clusterids = numpy.arange(nElements) 18 for i in range(nNodes): 19 i1 = tree[i].left 20 i2 = tree[i].right 21 if i1 < 0: 22 order1 = nodeorder[-i1-1] 23 count1 = nodecounts[-i1-1] 24 else: 25 order1 = order[i1] 26 count1 = 1 27 if i2 < 0: 28 order2 = nodeorder[-i2-1] 29 count2 = nodecounts[-i2-1] 30 else: 31 order2 = order[i2] 32 count2 = 1 33 # If order1 and order2 are equal, their order is determined 34 # by the order in which they were clustered 35 if i1 < i2: 36 if order1 < order2: 37 increase = count1 38 else: 39 increase = count2 40 for j in range(nElements): 41 clusterid = clusterids[j] 42 if clusterid == i1 and order1 >= order2: 43 neworder[j] += increase 44 if clusterid == i2 and order1 < order2: 45 neworder[j] += increase 46 if clusterid == i1 or clusterid == i2: 47 clusterids[j] = -i-1 48 else: 49 if order1 <= order2: 50 increase = count1 51 else: 52 increase = count2 53 for j in range(nElements): 54 clusterid = clusterids[j] 55 if clusterid == i1 and order1 > order2: 56 neworder[j] += increase 57 if clusterid == i2 and order1 <= order2: 58 neworder[j] += increase 59 if clusterid == i1 or clusterid == i2: 60 clusterids[j] = -i-1 61 return numpy.argsort(neworder)
62 63
64 -def _savetree(jobname, tree, order, transpose):
65 # Save the hierarchical clustering solution given by the tree, following 66 # the specified order, in a file whose name is based on jobname. 67 if transpose == 0: 68 extension = ".gtr" 69 keyword = "GENE" 70 else: 71 extension = ".atr" 72 keyword = "ARRY" 73 nnodes = len(tree) 74 with open(jobname+extension, "w") as outputfile: 75 nodeindex = 0 76 nodeID = [''] * nnodes 77 nodecounts = numpy.zeros(nnodes, int) 78 nodeorder = numpy.zeros(nnodes) 79 nodedist = numpy.array([node.distance for node in tree]) 80 for nodeindex in range(nnodes): 81 min1 = tree[nodeindex].left 82 min2 = tree[nodeindex].right 83 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1) 84 outputfile.write(nodeID[nodeindex]) 85 outputfile.write("\t") 86 if min1 < 0: 87 index1 = -min1-1 88 order1 = nodeorder[index1] 89 counts1 = nodecounts[index1] 90 outputfile.write(nodeID[index1]+"\t") 91 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 92 else: 93 order1 = order[min1] 94 counts1 = 1 95 outputfile.write("%s%dX\t" % (keyword, min1)) 96 if min2 < 0: 97 index2 = -min2-1 98 order2 = nodeorder[index2] 99 counts2 = nodecounts[index2] 100 outputfile.write(nodeID[index2]+"\t") 101 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 102 else: 103 order2 = order[min2] 104 counts2 = 1 105 outputfile.write("%s%dX\t" % (keyword, min2)) 106 outputfile.write(str(1.0-nodedist[nodeindex])) 107 outputfile.write("\n") 108 counts = counts1 + counts2 109 nodecounts[nodeindex] = counts 110 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / counts 111 # Now set up order based on the tree structure 112 index = _treesort(order, nodeorder, nodecounts, tree) 113 return index
114 115
116 -class Record(object):
117 """Store gene expression data. 118 119 A Record stores the gene expression data and related information contained 120 in a data file following the file format defined for Michael Eisen's 121 Cluster/TreeView program. A Record has the following members: 122 123 data: a matrix containing the gene expression data 124 mask: a matrix containing only 1's and 0's, denoting which values 125 are present (1) or missing (0). If all elements of mask are 126 one (no missing data), then mask is set to None. 127 geneid: a list containing a unique identifier for each gene 128 (e.g., ORF name) 129 genename: a list containing an additional description for each gene 130 (e.g., gene name) 131 gweight: the weight to be used for each gene when calculating the 132 distance 133 gorder: an array of real numbers indicating the preferred order of the 134 genes in the output file 135 expid: a list containing a unique identifier for each experimental 136 condition 137 eweight: the weight to be used for each experimental condition when 138 calculating the distance 139 eorder: an array of real numbers indication the preferred order in the 140 output file of the experimental conditions 141 uniqid: the string that was used instead of UNIQID in the input file. 142 143 """ 144
145 - def __init__(self, handle=None):
146 """Read gene expression data from the file handle and return a Record. 147 148 The file should be in the format defined for Michael Eisen's 149 Cluster/TreeView program. 150 151 """ 152 self.data = None 153 self.mask = None 154 self.geneid = None 155 self.genename = None 156 self.gweight = None 157 self.gorder = None 158 self.expid = None 159 self.eweight = None 160 self.eorder = None 161 self.uniqid = None 162 if not handle: 163 return 164 line = handle.readline().strip("\r\n").split("\t") 165 n = len(line) 166 self.uniqid = line[0] 167 self.expid = [] 168 cols = {0: "GENEID"} 169 for word in line[1:]: 170 if word == "NAME": 171 cols[line.index(word)] = word 172 self.genename = [] 173 elif word == "GWEIGHT": 174 cols[line.index(word)] = word 175 self.gweight = [] 176 elif word=="GORDER": 177 cols[line.index(word)] = word 178 self.gorder = [] 179 else: 180 self.expid.append(word) 181 self.geneid = [] 182 self.data = [] 183 self.mask = [] 184 needmask = 0 185 for line in handle: 186 line = line.strip("\r\n").split("\t") 187 if len(line) != n: 188 raise ValueError("Line with %d columns found (expected %d)" % 189 (len(line), n)) 190 if line[0] == "EWEIGHT": 191 i = max(cols) + 1 192 self.eweight = numpy.array(line[i:], float) 193 continue 194 if line[0] == "EORDER": 195 i = max(cols) + 1 196 self.eorder = numpy.array(line[i:], float) 197 continue 198 rowdata = [] 199 rowmask = [] 200 n = len(line) 201 for i in range(n): 202 word = line[i] 203 if i in cols: 204 if cols[i] == "GENEID": 205 self.geneid.append(word) 206 if cols[i] == "NAME": 207 self.genename.append(word) 208 if cols[i] == "GWEIGHT": 209 self.gweight.append(float(word)) 210 if cols[i] == "GORDER": 211 self.gorder.append(float(word)) 212 continue 213 if not word: 214 rowdata.append(0.0) 215 rowmask.append(0) 216 needmask = 1 217 else: 218 rowdata.append(float(word)) 219 rowmask.append(1) 220 self.data.append(rowdata) 221 self.mask.append(rowmask) 222 self.data = numpy.array(self.data) 223 if needmask: 224 self.mask = numpy.array(self.mask, int) 225 else: 226 self.mask = None 227 if self.gweight: 228 self.gweight = numpy.array(self.gweight) 229 if self.gorder: 230 self.gorder = numpy.array(self.gorder)
231
232 - def treecluster(self, transpose=0, method='m', dist='e'):
233 """Apply hierarchical clustering and return a Tree object. 234 235 The pairwise single, complete, centroid, and average linkage hierarchical 236 clustering methods are available. 237 238 transpose: if equal to 0, genes (rows) are clustered; 239 if equal to 1, microarrays (columns) are clustered. 240 dist : specifies the distance function to be used: 241 dist=='e': Euclidean distance 242 dist=='b': City Block distance 243 dist=='c': Pearson correlation 244 dist=='a': absolute value of the correlation 245 dist=='u': uncentered correlation 246 dist=='x': absolute uncentered correlation 247 dist=='s': Spearman's rank correlation 248 dist=='k': Kendall's tau 249 method : specifies which linkage method is used: 250 method=='s': Single pairwise linkage 251 method=='m': Complete (maximum) pairwise linkage (default) 252 method=='c': Centroid linkage 253 method=='a': Average pairwise linkage 254 255 See the description of the Tree class for more information about the Tree 256 object returned by this method. 257 258 """ 259 if transpose == 0: 260 weight = self.eweight 261 else: 262 weight = self.gweight 263 return treecluster(self.data, self.mask, weight, transpose, method, 264 dist)
265
266 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 267 initialid=None):
268 """Apply k-means or k-median clustering. 269 270 This method returns a tuple (clusterid, error, nfound). 271 272 nclusters: number of clusters (the 'k' in k-means) 273 transpose: if equal to 0, genes (rows) are clustered; 274 if equal to 1, microarrays (columns) are clustered. 275 npass : number of times the k-means clustering algorithm is 276 performed, each time with a different (random) initial 277 condition. 278 method : specifies how the center of a cluster is found: 279 method=='a': arithmetic mean 280 method=='m': median 281 dist : specifies the distance function to be used: 282 dist=='e': Euclidean distance 283 dist=='b': City Block distance 284 dist=='c': Pearson correlation 285 dist=='a': absolute value of the correlation 286 dist=='u': uncentered correlation 287 dist=='x': absolute uncentered correlation 288 dist=='s': Spearman's rank correlation 289 dist=='k': Kendall's tau 290 initialid: the initial clustering from which the algorithm should start. 291 If initialid is None, the routine carries out npass 292 repetitions of the EM algorithm, each time starting from a 293 different random initial clustering. If initialid is given, 294 the routine carries out the EM algorithm only once, starting 295 from the given initial clustering and without randomizing the 296 order in which items are assigned to clusters (i.e., using 297 the same order as in the data matrix). In that case, the 298 k-means algorithm is fully deterministic. 299 300 Return values: 301 clusterid: array containing the number of the cluster to which each 302 gene/microarray was assigned in the best k-means clustering 303 solution that was found in the npass runs; 304 error: the within-cluster sum of distances for the returned k-means 305 clustering solution; 306 nfound: the number of times this solution was found. 307 308 """ 309 310 if transpose == 0: 311 weight = self.eweight 312 else: 313 weight = self.gweight 314 return kcluster(self.data, nclusters, self.mask, weight, transpose, 315 npass, method, dist, initialid)
316
317 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 318 niter=1, dist='e'):
319 """Calculate a self-organizing map on a rectangular grid. 320 321 The somcluster method returns a tuple (clusterid, celldata). 322 323 transpose: if equal to 0, genes (rows) are clustered; 324 if equal to 1, microarrays (columns) are clustered. 325 nxgrid : the horizontal dimension of the rectangular SOM map 326 nygrid : the vertical dimension of the rectangular SOM map 327 inittau : the initial value of tau (the neighborbood function) 328 niter : the number of iterations 329 dist : specifies the distance function to be used: 330 dist=='e': Euclidean distance 331 dist=='b': City Block distance 332 dist=='c': Pearson correlation 333 dist=='a': absolute value of the correlation 334 dist=='u': uncentered correlation 335 dist=='x': absolute uncentered correlation 336 dist=='s': Spearman's rank correlation 337 dist=='k': Kendall's tau 338 339 Return values: 340 clusterid: array with two columns, while the number of rows is equal to 341 the number of genes or the number of microarrays depending on 342 whether genes or microarrays are being clustered. Each row in 343 the array contains the x and y coordinates of the cell in the 344 rectangular SOM grid to which the gene or microarray was 345 assigned. 346 celldata: an array with dimensions (nxgrid, nygrid, number of 347 microarrays) if genes are being clustered, or (nxgrid, 348 nygrid, number of genes) if microarrays are being clustered. 349 Each element [ix][iy] of this array is a 1D vector containing 350 the gene expression data for the centroid of the cluster in 351 the SOM grid cell with coordinates (ix, iy). 352 353 """ 354 355 if transpose == 0: 356 weight = self.eweight 357 else: 358 weight = self.gweight 359 return somcluster(self.data, self.mask, weight, transpose, 360 nxgrid, nygrid, inittau, niter, dist)
361
362 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
363 """Calculate the cluster centroids and return a tuple (cdata, cmask). 364 365 The centroid is defined as either the mean or the median over all elements 366 for each dimension. 367 368 data : nrows x ncolumns array containing the expression data 369 mask : nrows x ncolumns array of integers, showing which data are 370 missing. If mask[i][j]==0, then data[i][j] is missing. 371 transpose: if equal to 0, gene (row) clusters are considered; 372 if equal to 1, microarray (column) clusters are considered. 373 clusterid: array containing the cluster number for each gene or 374 microarray. The cluster number should be non-negative. 375 method : specifies how the centroid is calculated: 376 method=='a': arithmetic mean over each dimension. (default) 377 method=='m': median over each dimension. 378 379 Return values: 380 cdata : 2D array containing the cluster centroids. If transpose==0, 381 then the dimensions of cdata are nclusters x ncolumns. If 382 transpose==1, then the dimensions of cdata are 383 nrows x nclusters. 384 cmask : 2D array of integers describing which elements in cdata, 385 if any, are missing. 386 387 """ 388 return clustercentroids(self.data, self.mask, clusterid, method, 389 transpose)
390
391 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 392 transpose=0):
393 """Calculate the distance between two clusters. 394 395 index1 : 1D array identifying which genes/microarrays belong to the 396 first cluster. If the cluster contains only one gene, then 397 index1 can also be written as a single integer. 398 index2 : 1D array identifying which genes/microarrays belong to the 399 second cluster. If the cluster contains only one gene, then 400 index2 can also be written as a single integer. 401 transpose: if equal to 0, genes (rows) are clustered; 402 if equal to 1, microarrays (columns) are clustered. 403 dist : specifies the distance function to be used: 404 dist=='e': Euclidean distance 405 dist=='b': City Block distance 406 dist=='c': Pearson correlation 407 dist=='a': absolute value of the correlation 408 dist=='u': uncentered correlation 409 dist=='x': absolute uncentered correlation 410 dist=='s': Spearman's rank correlation 411 dist=='k': Kendall's tau 412 method : specifies how the distance between two clusters is defined: 413 method=='a': the distance between the arithmetic means of the 414 two clusters 415 method=='m': the distance between the medians of the two 416 clusters 417 method=='s': the smallest pairwise distance between members 418 of the two clusters 419 method=='x': the largest pairwise distance between members of 420 the two clusters 421 method=='v': average of the pairwise distances between 422 members of the clusters 423 transpose: if equal to 0: clusters of genes (rows) are considered; 424 if equal to 1: clusters of microarrays (columns) are 425 considered. 426 427 """ 428 429 if transpose == 0: 430 weight = self.eweight 431 else: 432 weight = self.gweight 433 return clusterdistance(self.data, self.mask, weight, 434 index1, index2, method, dist, transpose)
435
436 - def distancematrix(self, transpose=0, dist='e'):
437 """Calculate the distance matrix and return it as a list of arrays 438 439 transpose: if equal to 0: calculate the distances between genes (rows); 440 if equal to 1: calculate the distances beteeen microarrays 441 (columns). 442 dist : specifies the distance function to be used: 443 dist=='e': Euclidean distance 444 dist=='b': City Block distance 445 dist=='c': Pearson correlation 446 dist=='a': absolute value of the correlation 447 dist=='u': uncentered correlation 448 dist=='x': absolute uncentered correlation 449 dist=='s': Spearman's rank correlation 450 dist=='k': Kendall's tau 451 452 Return value: 453 The distance matrix is returned as a list of 1D arrays containing the 454 distance matrix between the gene expression data. The number of columns 455 in each row is equal to the row number. Hence, the first row has zero 456 elements. An example of the return value is 457 matrix = [[], 458 array([1.]), 459 array([7., 3.]), 460 array([4., 2., 6.])] 461 This corresponds to the distance matrix 462 [0., 1., 7., 4.] 463 [1., 0., 3., 2.] 464 [7., 3., 0., 6.] 465 [4., 2., 6., 0.] 466 467 """ 468 if transpose == 0: 469 weight = self.eweight 470 else: 471 weight = self.gweight 472 return distancematrix(self.data, self.mask, weight, transpose, dist)
473
474 - def save(self, jobname, geneclusters=None, expclusters=None):
475 """Save the clustering results. 476 477 The saved files follow the convention for the Java TreeView program, 478 which can therefore be used to view the clustering result. 479 480 Arguments: 481 jobname: The base name of the files to be saved. The filenames are 482 jobname.cdt, jobname.gtr, and jobname.atr for 483 hierarchical clustering, and jobname-K*.cdt, 484 jobname-K*.kgg, jobname-K*.kag for k-means clustering 485 results. 486 geneclusters=None: For hierarchical clustering results, geneclusters 487 is a Tree object as returned by the treecluster method. 488 For k-means clustering results, geneclusters is a vector 489 containing ngenes integers, describing to which cluster a 490 given gene belongs. This vector can be calculated by 491 kcluster. 492 expclusters=None: For hierarchical clustering results, expclusters 493 is a Tree object as returned by the treecluster method. 494 For k-means clustering results, expclusters is a vector 495 containing nexps integers, describing to which cluster a 496 given experimental condition belongs. This vector can be 497 calculated by kcluster. 498 499 """ 500 (ngenes, nexps) = numpy.shape(self.data) 501 if self.gorder is None: 502 gorder = numpy.arange(ngenes) 503 else: 504 gorder = self.gorder 505 if self.eorder is None: 506 eorder = numpy.arange(nexps) 507 else: 508 eorder = self.eorder 509 if geneclusters is not None and expclusters is not None and \ 510 type(geneclusters) != type(expclusters): 511 raise ValueError("found one k-means and one hierarchical " 512 + "clustering solution in geneclusters and " 513 + "expclusters") 514 gid = 0 515 aid = 0 516 filename = jobname 517 postfix = "" 518 if isinstance(geneclusters, Tree): 519 # This is a hierarchical clustering result. 520 geneindex = _savetree(jobname, geneclusters, gorder, 0) 521 gid = 1 522 elif geneclusters is not None: 523 # This is a k-means clustering result. 524 filename = jobname + "_K" 525 k = max(geneclusters) + 1 526 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 527 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 528 postfix = "_G%d" % k 529 else: 530 geneindex = numpy.argsort(gorder) 531 if isinstance(expclusters, Tree): 532 # This is a hierarchical clustering result. 533 expindex = _savetree(jobname, expclusters, eorder, 1) 534 aid = 1 535 elif expclusters is not None: 536 # This is a k-means clustering result. 537 filename = jobname + "_K" 538 k = max(expclusters) + 1 539 kagfilename = "%s_K_A%d.kag" % (jobname, k) 540 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 541 postfix += "_A%d" % k 542 else: 543 expindex = numpy.argsort(eorder) 544 filename = filename + postfix 545 self._savedata(filename, gid, aid, geneindex, expindex)
546
547 - def _savekmeans(self, filename, clusterids, order, transpose):
548 # Save a k-means clustering solution 549 if transpose == 0: 550 label = self.uniqid 551 names = self.geneid 552 else: 553 label = "ARRAY" 554 names = self.expid 555 with open(filename, "w") as outputfile: 556 outputfile.write(label + "\tGROUP\n") 557 index = numpy.argsort(order) 558 n = len(names) 559 sortedindex = numpy.zeros(n, int) 560 counter = 0 561 cluster = 0 562 while counter < n: 563 for j in index: 564 if clusterids[j] == cluster: 565 outputfile.write("%s\t%s\n" % (names[j], cluster)) 566 sortedindex[counter] = j 567 counter += 1 568 cluster += 1 569 return sortedindex
570
571 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
572 # Save the clustered data. 573 if self.genename is None: 574 genename = self.geneid 575 else: 576 genename = self.genename 577 (ngenes, nexps) = numpy.shape(self.data) 578 with open(jobname+'.cdt', 'w') as outputfile: 579 if self.mask is not None: 580 mask = self.mask 581 else: 582 mask = numpy.ones((ngenes, nexps), int) 583 if self.gweight is not None: 584 gweight = self.gweight 585 else: 586 gweight = numpy.ones(ngenes) 587 if self.eweight is not None: 588 eweight = self.eweight 589 else: 590 eweight = numpy.ones(nexps) 591 if gid: 592 outputfile.write('GID\t') 593 outputfile.write(self.uniqid) 594 outputfile.write('\tNAME\tGWEIGHT') 595 # Now add headers for data columns. 596 for j in expindex: 597 outputfile.write('\t%s' % self.expid[j]) 598 outputfile.write('\n') 599 if aid: 600 outputfile.write("AID") 601 if gid: 602 outputfile.write('\t') 603 outputfile.write("\t\t") 604 for j in expindex: 605 outputfile.write('\tARRY%dX' % j) 606 outputfile.write('\n') 607 outputfile.write('EWEIGHT') 608 if gid: 609 outputfile.write('\t') 610 outputfile.write('\t\t') 611 for j in expindex: 612 outputfile.write('\t%f' % eweight[j]) 613 outputfile.write('\n') 614 for i in geneindex: 615 if gid: 616 outputfile.write('GENE%dX\t' % i) 617 outputfile.write("%s\t%s\t%f" % 618 (self.geneid[i], genename[i], gweight[i])) 619 for j in expindex: 620 outputfile.write('\t') 621 if mask[i, j]: 622 outputfile.write(str(self.data[i, j])) 623 outputfile.write('\n')
624 625
626 -def read(handle):
627 """Read gene expression data from the file handle and return a Record. 628 629 The file should be in the file format defined for Michael Eisen's 630 Cluster/TreeView program. 631 632 """ 633 return Record(handle)
634