Package Bio :: Package Phylo :: Module Consensus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.Consensus

  1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ Classes and methods for finding consensus trees. 
  7   
  8  This module contains a ``_BitString`` class to assist the consensus tree 
  9  searching and some common consensus algorithms such as strict, majority rule and 
 10  adam consensus. 
 11  """ 
 12  from __future__ import division 
 13   
 14  import random 
 15  import itertools 
 16   
 17  from ast import literal_eval 
 18  from Bio.Phylo import BaseTree 
 19   
 20  __docformat__ = "restructuredtext en" 
21 22 23 -class _BitString(str):
24 """Helper class for binary string data (PRIVATE). 25 26 Assistant class of binary string data used for storing and 27 counting compatible clades in consensus tree searching. It includes 28 some binary manipulation(&|^~) methods. 29 30 _BitString is a sub-class of ``str`` object that only accepts two 31 characters('0' and '1'), with additional functions for binary-like 32 manipulation(&|^~). It is used to count and store the clades in 33 multiple trees in consensus tree searching. During counting, the 34 clades will be considered the same if their terminals(in terms of 35 ``name`` attribute) are the same. 36 37 For example, let's say two trees are provided as below to search 38 their strict consensus tree:: 39 40 tree1: (((A, B), C),(D, E)) 41 tree2: ((A, (B, C)),(D, E)) 42 43 For both trees, a _BitString object '11111' will represent their 44 root clade. Each '1' stands for the terminal clade in the list 45 [A, B, C, D, E](the order might not be the same, it's determined 46 by the ``get_terminal`` method of the first tree provided). For 47 the clade ((A, B), C) in tree1 and (A, (B, C)) in tree2, they both 48 can be represented by '11100'. Similarly, '11000' represents clade 49 (A, B) in tree1, '01100' represents clade (B, C) in tree2, and '00011' 50 represents clade (D, E) in both trees. 51 52 So, with the ``_count_clades`` function in this module, finally we 53 can get the clade counts and their _BitString representation as follows 54 (the root and terminals are omitted):: 55 56 clade _BitString count 57 ABC '11100' 2 58 DE '00011' 2 59 AB '11000' 1 60 BC '01100' 1 61 62 To get the _BitString representation of a clade, we can use the following 63 code snippet:: 64 65 # suppose we are provided with a tree list, the first thing to do is 66 # to get all the terminal names in the first tree 67 term_names = [term.name for term in trees[0].get_terminals()] 68 # for a specific clade in any of the tree, also get its terminal names 69 clade_term_names = [term.name for term in clade.get_terminals()] 70 # then create a boolean list 71 boolvals = [name in clade_term_names for name in term_names] 72 # create the string version and pass it to _BitString 73 bitstr = _BitString(''.join(map(str, map(int, boolvals)))) 74 # or, equivalently: 75 bitstr = _BitString.from_bool(boolvals) 76 77 To convert back:: 78 79 # get all the terminal clades of the first tree 80 terms = [term for term in trees[0].get_terminals()] 81 # get the index of terminal clades in bitstr 82 index_list = bitstr.index_one() 83 # get all terminal clades by index 84 clade_terms = [terms[i] for i in index_list] 85 # create a new calde and append all the terminal clades 86 new_clade = BaseTree.Clade() 87 new_clade.clades.extend(clade_terms) 88 89 90 Example 91 ------- 92 93 >>> from Bio.Phylo.Consensus import _BitString 94 >>> bitstr1 = _BitString('11111') 95 >>> bitstr2 = _BitString('11100') 96 >>> bitstr3 = _BitString('01101') 97 >>> bitstr1 98 _BitString('11111') 99 >>> bitstr2 & bitstr3 100 _BitString('01100') 101 >>> bitstr2 | bitstr3 102 _BitString('11101') 103 >>> bitstr2 ^ bitstr3 104 _BitString('10001') 105 >>> bitstr2.index_one() 106 [0, 1, 2] 107 >>> bitstr3.index_one() 108 [1, 2, 4] 109 >>> bitstr3.index_zero() 110 [0, 3] 111 >>> bitstr1.contains(bitstr2) 112 True 113 >>> bitstr2.contains(bitstr3) 114 False 115 >>> bitstr2.independent(bitstr3) 116 False 117 >>> bitstr2.independent(bitstr4) 118 True 119 >>> bitstr1.iscompatible(bitstr2) 120 True 121 >>> bitstr2.iscompatible(bitstr3) 122 False 123 >>> bitstr2.iscompatible(bitstr4) 124 True 125 """ 126
127 - def __new__(cls, strdata):
128 """init from a binary string data""" 129 if (isinstance(strdata, str) and 130 len(strdata) == strdata.count('0') + strdata.count('1')): 131 return str.__new__(cls, strdata) 132 else: 133 raise TypeError( 134 "The input should be a binary string composed of '0' and '1'")
135
136 - def __and__(self, other):
137 selfint = literal_eval('0b' + self) 138 otherint = literal_eval('0b' + other) 139 resultint = selfint & otherint 140 return _BitString(bin(resultint)[2:].zfill(len(self)))
141
142 - def __or__(self, other):
143 selfint = literal_eval('0b' + self) 144 otherint = literal_eval('0b' + other) 145 resultint = selfint | otherint 146 return _BitString(bin(resultint)[2:].zfill(len(self)))
147
148 - def __xor__(self, other):
149 selfint = literal_eval('0b' + self) 150 otherint = literal_eval('0b' + other) 151 resultint = selfint ^ otherint 152 return _BitString(bin(resultint)[2:].zfill(len(self)))
153
154 - def __rand__(self, other):
155 selfint = literal_eval('0b' + self) 156 otherint = literal_eval('0b' + other) 157 resultint = otherint & selfint 158 return _BitString(bin(resultint)[2:].zfill(len(self)))
159
160 - def __ror__(self, other):
161 selfint = literal_eval('0b' + self) 162 otherint = literal_eval('0b' + other) 163 resultint = otherint | selfint 164 return _BitString(bin(resultint)[2:].zfill(len(self)))
165
166 - def __rxor__(self, other):
167 selfint = literal_eval('0b' + self) 168 otherint = literal_eval('0b' + other) 169 resultint = otherint ^ selfint 170 return _BitString(bin(resultint)[2:].zfill(len(self)))
171
172 - def __repr__(self):
173 return '_BitString(' + str.__repr__(self) + ')'
174
175 - def index_one(self):
176 """Return a list of positions where the element is '1'""" 177 return [i for i, n in enumerate(self) if n == '1']
178
179 - def index_zero(self):
180 """Return a list of positions where the element is '0'""" 181 return [i for i, n in enumerate(self) if n == '0']
182
183 - def contains(self, other):
184 """Check if current bitstr1 contains another one bitstr2. 185 186 That is to say, the bitstr2.index_one() is a subset of 187 bitstr1.index_one(). 188 189 Examples: 190 "011011" contains "011000", "011001", "000011" 191 192 Be careful, "011011" also contains "000000". Actually, all _BitString 193 objects contain all-zero _BitString of the same length. 194 """ 195 xorbit = self ^ other 196 return (xorbit.count('1') == self.count('1') - other.count('1'))
197
198 - def independent(self, other):
199 """Check if current bitstr1 is independent of another one bitstr2. 200 201 That is to say the bitstr1.index_one() and bitstr2.index_one() have 202 no intersection. 203 204 Be careful, all _BitString objects are independent of all-zero _BitString 205 of the same length. 206 """ 207 xorbit = self ^ other 208 return (xorbit.count('1') == self.count('1') + other.count('1'))
209
210 - def iscompatible(self, other):
211 """Check if current bitstr1 is compatible with another bitstr2. 212 213 Two conditions are considered as compatible: 214 215 1. bitstr1.contain(bitstr2) or vise versa; 216 2. bitstr1.independent(bitstr2). 217 """ 218 return (self.contains(other) or other.contains(self) or 219 self.independent(other))
220 221 @classmethod
222 - def from_bool(cls, bools):
223 return cls(''.join(map(str, map(int, bools))))
224
225 226 -def strict_consensus(trees):
227 """Search strict consensus tree from multiple trees. 228 229 :Parameters: 230 trees : iterable 231 iterable of trees to produce consensus tree. 232 """ 233 trees_iter = iter(trees) 234 first_tree = next(trees_iter) 235 236 terms = first_tree.get_terminals() 237 bitstr_counts, tree_count = _count_clades( 238 itertools.chain([first_tree], trees_iter)) 239 240 # Store bitstrs for strict clades 241 strict_bitstrs = [bitstr for bitstr, t in bitstr_counts.items() 242 if t[0] == tree_count] 243 strict_bitstrs.sort(key=lambda bitstr: bitstr.count('1'), reverse=True) 244 # Create root 245 root = BaseTree.Clade() 246 if strict_bitstrs[0].count('1') == len(terms): 247 root.clades.extend(terms) 248 else: 249 raise ValueError('Taxons in provided trees should be consistent') 250 # make a bitstr to clades dict and store root clade 251 bitstr_clades = {strict_bitstrs[0]: root} 252 # create inner clades 253 for bitstr in strict_bitstrs[1:]: 254 clade_terms = [terms[i] for i in bitstr.index_one()] 255 clade = BaseTree.Clade() 256 clade.clades.extend(clade_terms) 257 for bs, c in bitstr_clades.items(): 258 # check if it should be the parent of current clade 259 if bs.contains(bitstr): 260 # remove old bitstring 261 del bitstr_clades[bs] 262 # update clade childs 263 new_childs = [child for child in c.clades 264 if child not in clade_terms] 265 c.clades = new_childs 266 # set current clade as child of c 267 c.clades.append(clade) 268 # update bitstring 269 bs = bs ^ bitstr 270 # update clade 271 bitstr_clades[bs] = c 272 break 273 # put new clade 274 bitstr_clades[bitstr] = clade 275 return BaseTree.Tree(root=root)
276
277 278 -def majority_consensus(trees, cutoff=0):
279 """Search majority rule consensus tree from multiple trees. 280 281 This is a extend majority rule method, which means the you can set any 282 cutoff between 0 ~ 1 instead of 0.5. The default value of cutoff is 0 to 283 create a relaxed binary consensus tree in any condition (as long as one of 284 the provided trees is a binary tree). The branch length of each consensus 285 clade in the result consensus tree is the average length of all counts for 286 that clade. 287 288 :Parameters: 289 trees : iterable 290 iterable of trees to produce consensus tree. 291 """ 292 tree_iter = iter(trees) 293 first_tree = next(tree_iter) 294 295 terms = first_tree.get_terminals() 296 bitstr_counts, tree_count = _count_clades( 297 itertools.chain([first_tree], tree_iter)) 298 299 # Sort bitstrs by descending #occurrences, then #tips, then tip order 300 bitstrs = sorted(bitstr_counts.keys(), 301 key=lambda bitstr: (bitstr_counts[bitstr][0], 302 bitstr.count('1'), 303 str(bitstr)), 304 reverse=True) 305 root = BaseTree.Clade() 306 if bitstrs[0].count('1') == len(terms): 307 root.clades.extend(terms) 308 else: 309 raise ValueError('Taxons in provided trees should be consistent') 310 # Make a bitstr-to-clades dict and store root clade 311 bitstr_clades = {bitstrs[0]: root} 312 # create inner clades 313 for bitstr in bitstrs[1:]: 314 # apply majority rule 315 count_in_trees, branch_length_sum = bitstr_counts[bitstr] 316 confidence = 100.0 * count_in_trees / tree_count 317 if confidence < cutoff * 100.0: 318 break 319 clade_terms = [terms[i] for i in bitstr.index_one()] 320 clade = BaseTree.Clade() 321 clade.clades.extend(clade_terms) 322 clade.confidence = confidence 323 clade.branch_length = branch_length_sum / count_in_trees 324 bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count('1'), 325 reverse=True) 326 327 # check if current clade is compatible with previous clades and 328 # record it's possible parent and child clades. 329 compatible = True 330 parent_bitstr = None 331 child_bitstrs = [] # multiple independent childs 332 for bs in bsckeys: 333 if not bs.iscompatible(bitstr): 334 compatible = False 335 break 336 # assign the closest ancestor as its parent 337 # as bsckeys is sorted, it should be the last one 338 if bs.contains(bitstr): 339 parent_bitstr = bs 340 # assign the closest descendant as its child 341 # the largest and independent clades 342 if (bitstr.contains(bs) and bs != bitstr and 343 all(c.independent(bs) for c in child_bitstrs)): 344 child_bitstrs.append(bs) 345 if not compatible: 346 continue 347 348 if parent_bitstr: 349 # insert current clade; remove old bitstring 350 parent_clade = bitstr_clades.pop(parent_bitstr) 351 # update parent clade childs 352 parent_clade.clades = [c for c in parent_clade.clades 353 if c not in clade_terms] 354 # set current clade as child of parent_clade 355 parent_clade.clades.append(clade) 356 # update bitstring 357 # parent = parent ^ bitstr 358 # update clade 359 bitstr_clades[parent_bitstr] = parent_clade 360 361 if child_bitstrs: 362 remove_list = [] 363 for c in child_bitstrs: 364 remove_list.extend(c.index_one()) 365 child_clade = bitstr_clades[c] 366 parent_clade.clades.remove(child_clade) 367 clade.clades.append(child_clade) 368 remove_terms = [terms[i] for i in remove_list] 369 clade.clades = [c for c in clade.clades if c not in remove_terms] 370 # put new clade 371 bitstr_clades[bitstr] = clade 372 if ((len(bitstr_clades) == len(terms) - 1) or 373 (len(bitstr_clades) == len(terms) - 2 and len(root.clades) == 3)): 374 break 375 return BaseTree.Tree(root=root)
376
377 378 -def adam_consensus(trees):
379 """Search Adam Consensus tree from multiple trees 380 381 :Parameters: 382 trees : list 383 list of trees to produce consensus tree. 384 """ 385 clades = [tree.root for tree in trees] 386 return BaseTree.Tree(root=_part(clades), rooted=True)
387
388 389 -def _part(clades):
390 """recursive function of adam consensus algorithm""" 391 new_clade = None 392 terms = clades[0].get_terminals() 393 term_names = [term.name for term in terms] 394 if len(terms) == 1 or len(terms) == 2: 395 new_clade = clades[0] 396 else: 397 bitstrs = set([_BitString('1' * len(terms))]) 398 for clade in clades: 399 for child in clade.clades: 400 bitstr = _clade_to_bitstr(child, term_names) 401 to_remove = set() 402 to_add = set() 403 for bs in bitstrs: 404 if bs == bitstr: 405 continue 406 elif bs.contains(bitstr): 407 to_add.add(bitstr) 408 to_add.add(bs ^ bitstr) 409 to_remove.add(bs) 410 elif bitstr.contains(bs): 411 to_add.add(bs ^ bitstr) 412 elif not bs.independent(bitstr): 413 to_add.add(bs & bitstr) 414 to_add.add(bs & bitstr ^ bitstr) 415 to_add.add(bs & bitstr ^ bs) 416 to_remove.add(bs) 417 # bitstrs = bitstrs | to_add 418 bitstrs ^= to_remove 419 if to_add: 420 for ta in sorted(to_add, key=lambda bs: bs.count('1')): 421 independent = True 422 for bs in bitstrs: 423 if not ta.independent(bs): 424 independent = False 425 break 426 if independent: 427 bitstrs.add(ta) 428 new_clade = BaseTree.Clade() 429 for bitstr in sorted(bitstrs): 430 indices = bitstr.index_one() 431 if len(indices) == 1: 432 new_clade.clades.append(terms[indices[0]]) 433 elif len(indices) == 2: 434 bifur_clade = BaseTree.Clade() 435 bifur_clade.clades.append(terms[indices[0]]) 436 bifur_clade.clades.append(terms[indices[1]]) 437 new_clade.clades.append(bifur_clade) 438 elif len(indices) > 2: 439 part_names = [term_names[i] for i in indices] 440 next_clades = [] 441 for clade in clades: 442 next_clades.append(_sub_clade(clade, part_names)) 443 # next_clades = [clade.common_ancestor([clade.find_any(name=name) for name in part_names]) for clade in clades] 444 new_clade.clades.append(_part(next_clades)) 445 return new_clade
446
447 448 -def _sub_clade(clade, term_names):
449 """extract a compatible subclade that only contains the given terminal names""" 450 term_clades = [clade.find_any(name) for name in term_names] 451 sub_clade = clade.common_ancestor(term_clades) 452 if len(term_names) != sub_clade.count_terminals(): 453 temp_clade = BaseTree.Clade() 454 temp_clade.clades.extend(term_clades) 455 for c in sub_clade.find_clades(terminal=False, order="preorder"): 456 if c == sub_clade.root: 457 continue 458 childs = set(c.find_clades(terminal=True)) & set(term_clades) 459 if childs: 460 for tc in temp_clade.find_clades(terminal=False, 461 order="preorder"): 462 tc_childs = set(tc.clades) 463 tc_new_clades = tc_childs - childs 464 if childs.issubset(tc_childs) and tc_new_clades: 465 tc.clades = list(tc_new_clades) 466 child_clade = BaseTree.Clade() 467 child_clade.clades.extend(list(childs)) 468 tc.clades.append(child_clade) 469 sub_clade = temp_clade 470 return sub_clade
471
472 473 -def _count_clades(trees):
474 """Count distinct clades (different sets of terminal names) in the trees. 475 476 Return a tuple first a dict of bitstring (representing clade) and a tuple of its count of 477 occurrences and sum of branch length for that clade, second the number of trees processed. 478 479 :Parameters: 480 trees : iterable 481 An iterable that returns the trees to count 482 """ 483 bitstrs = {} 484 tree_count = 0 485 for tree in trees: 486 tree_count += 1 487 clade_bitstrs = _tree_to_bitstrs(tree) 488 for clade in tree.find_clades(terminal=False): 489 bitstr = clade_bitstrs[clade] 490 if bitstr in bitstrs: 491 count, sum_bl = bitstrs[bitstr] 492 count += 1 493 sum_bl += clade.branch_length or 0 494 bitstrs[bitstr] = (count, sum_bl) 495 else: 496 bitstrs[bitstr] = (1, clade.branch_length or 0) 497 return bitstrs, tree_count
498
499 500 -def get_support(target_tree, trees, len_trees=None):
501 """Calculate branch support for a target tree given bootstrap replicate trees. 502 503 :Parameters: 504 target_tree : Tree 505 tree to calculate branch support for. 506 trees : iterable 507 iterable of trees used to calculate branch support. 508 len_trees : int 509 optional count of replicates in trees. len_trees must be provided 510 when len(trees) is not a valid operation. 511 """ 512 term_names = sorted(term.name 513 for term in target_tree.find_clades(terminal=True)) 514 bitstrs = {} 515 516 size = len_trees 517 if size is None: 518 try: 519 size = len(trees) 520 except TypeError: 521 raise TypeError("Trees does not support len(trees), " 522 "you must provide the number of replicates in trees " 523 "as the optional parameter len_trees.") 524 525 for clade in target_tree.find_clades(terminal=False): 526 bitstr = _clade_to_bitstr(clade, term_names) 527 bitstrs[bitstr] = (clade, 0) 528 for tree in trees: 529 for clade in tree.find_clades(terminal=False): 530 bitstr = _clade_to_bitstr(clade, term_names) 531 if bitstr in bitstrs: 532 c, t = bitstrs[bitstr] 533 c.confidence = (t + 1) * 100.0 / size 534 bitstrs[bitstr] = (c, t + 1) 535 return target_tree
536
537 538 -def bootstrap(msa, times):
539 """Generate bootstrap replicates from a multiple sequence alignment object 540 541 :Parameters: 542 msa : MultipleSeqAlignment 543 multiple sequence alignment to generate replicates. 544 times : int 545 number of bootstrap times. 546 """ 547 548 length = len(msa[0]) 549 i = 0 550 while i < times: 551 i += 1 552 item = None 553 for j in range(length): 554 col = random.randint(0, length - 1) 555 if not item: 556 item = msa[:, col:col + 1] 557 else: 558 item += msa[:, col:col + 1] 559 yield item
560
561 562 -def bootstrap_trees(msa, times, tree_constructor):
563 """Generate bootstrap replicate trees from a multiple sequence alignment. 564 565 :Parameters: 566 msa : MultipleSeqAlignment 567 multiple sequence alignment to generate replicates. 568 times : int 569 number of bootstrap times. 570 tree_constructor : TreeConstructor 571 tree constructor to be used to build trees. 572 """ 573 574 msas = bootstrap(msa, times) 575 for aln in msas: 576 tree = tree_constructor.build_tree(aln) 577 yield tree
578
579 580 -def bootstrap_consensus(msa, times, tree_constructor, consensus):
581 """Consensus tree of a series of bootstrap trees for a multiple sequence alignment 582 583 :Parameters: 584 msa : MultipleSeqAlignment 585 Multiple sequence alignment to generate replicates. 586 times : int 587 Number of bootstrap times. 588 tree_constructor : TreeConstructor 589 Tree constructor to be used to build trees. 590 consensus : function 591 Consensus method in this module: `strict_consensus`, 592 `majority_consensus`, `adam_consensus`. 593 """ 594 trees = bootstrap_trees(msa, times, tree_constructor) 595 tree = consensus(list(trees)) 596 return tree
597
598 599 -def _clade_to_bitstr(clade, tree_term_names):
600 """Create a BitString representing a clade, given ordered tree taxon names.""" 601 clade_term_names = set(term.name for term in 602 clade.find_clades(terminal=True)) 603 return _BitString.from_bool((name in clade_term_names) 604 for name in tree_term_names)
605
606 607 -def _tree_to_bitstrs(tree):
608 """Create a dict of a tree's clades to corresponding BitStrings.""" 609 clades_bitstrs = {} 610 term_names = [term.name for term in tree.find_clades(terminal=True)] 611 for clade in tree.find_clades(terminal=False): 612 bitstr = _clade_to_bitstr(clade, term_names) 613 clades_bitstrs[clade] = bitstr 614 return clades_bitstrs
615
616 617 -def _bitstring_topology(tree):
618 """Generates a branch length dict for a tree, keyed by BitStrings. 619 620 Create a dict of all clades' BitStrings to the corresponding branch 621 lengths (rounded to 5 decimal places).""" 622 bitstrs = {} 623 for clade, bitstr in _tree_to_bitstrs(tree).items(): 624 bitstrs[bitstr] = round(clade.branch_length or 0.0, 5) 625 return bitstrs
626
627 628 -def _equal_topology(tree1, tree2):
629 """Are two trees are equal in terms of topology and branch lengths. 630 631 (Branch lengths checked to 5 decimal places.) 632 """ 633 term_names1 = set(term.name for term in tree1.find_clades(terminal=True)) 634 term_names2 = set(term.name for term in tree2.find_clades(terminal=True)) 635 return ((term_names1 == term_names2) and 636 (_bitstring_topology(tree1) == _bitstring_topology(tree2)))
637