Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See Also 
   9  -------- 
  10  Official specification: 
  11     http://phyloxml.org/  
  12  Journal article: 
  13      Han and Zmasek (2009), doi:10.1186/1471-2105-10-356 
  14  """ 
  15  __docformat__ = "restructuredtext en" 
  16   
  17  import re 
  18  import warnings 
  19   
  20  from Bio import Alphabet 
  21  from Bio.Align import MultipleSeqAlignment 
  22  from Bio.Seq import Seq 
  23  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  24  from Bio.SeqRecord import SeqRecord 
  25  from Bio import BiopythonWarning 
  26   
  27  from Bio.Phylo import BaseTree 
class PhyloXMLWarning(BiopythonWarning):
    """Warning category for data that violates the phyloXML specification."""
33
34 35 -def _check_str(text, testfunc):
36 """Check a string using testfunc, and warn if there's no match.""" 37 if text is not None and not testfunc(text): 38 warnings.warn("String %s doesn't match the given regexp" % text, 39 PhyloXMLWarning, stacklevel=2)
40
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Base class for all PhyloXML objects.

    Adds no behaviour of its own; it exists so that every phyloXML class in
    this module shares a common ancestor derived from `BaseTree.TreeElement`.
    """
46
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds any number of Phylogeny elements, possibly followed by elements
    from other namespaces.

    :Parameters:
        attributes
            (XML namespace definitions)
        phylogenies
            list of phylogenetic trees
        other
            list of arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        self.attributes = attributes
        # Falsy arguments (None, empty list) are replaced by a fresh list.
        self.other = other or []
        self.phylogenies = phylogenies or []

    def __getitem__(self, index):
        """Look up a phylogeny by integer index, slice, or tree name.

        Raises KeyError if ``index`` has an unsupported type or if no
        phylogeny carries the requested name.
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        for candidate in self.phylogenies:
            if candidate.name == index:
                return candidate
        # The loop ended without returning: no tree has that name.
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate over the phylogenetic trees held by this object."""
        return iter(self.phylogenies)

    def __len__(self):
        """Return how many phylogenetic trees this object holds."""
        return len(self.phylogenies)

    def __str__(self):
        """Return the class name wrapping the string form of each tree."""
        rendered = [str(tree) for tree in self.phylogenies]
        class_name = self.__class__.__name__
        return '%s([%s])' % (class_name, ',\n'.join(rendered))
90
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    Usually, an Other object will have either a 'value' or a non-empty list
    of 'children', but not both. This is not enforced here, though.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node; an empty dict if not given
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        self.tag = tag
        self.namespace = namespace
        # Normalise to an empty dict, mirroring how ``children`` is
        # normalised to an empty list, so the attribute always has the
        # documented type.
        self.attributes = attributes or {}
        self.value = value
        self.children = children or []

    def __iter__(self):
        """Iterate through the children of this object (if any)."""
        return iter(self.children)
122
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        type
            stored unchanged on the instance
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)
    """

    def __init__(self, root=None, rooted=True,
            rerootable=None, branch_length_unit=None, type=None,
            # Child nodes
            name=None, id=None, description=None, date=None,
            # Collections
            confidences=None, clade_relations=None, sequence_relations=None,
            properties=None, other=None,
            ):
        # NOTE: kept as an assertion so the exception type seen by callers
        # does not change; be aware it is skipped under ``python -O``.
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        Keyword arguments are the usual `Phylogeny` constructor parameters.
        They are applied after construction with ``setattr``, so a
        ``confidence=...`` keyword goes through the `confidence` property.
        """
        if tree.id is not None:
            tree_id = Id(str(tree.id))
        else:
            tree_id = None
        phy = cls(
                root=Clade.from_clade(tree.root),
                rooted=tree.rooted,
                name=tree.name,
                id=tree_id)
        # Writing into ``phy.__dict__`` directly would hide a 'confidence'
        # entry behind the class-level property, silently dropping it.
        for key, value in kwargs.items():
            setattr(phy, key, value)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is converted with `Clade.from_clade` and wrapped via
        `Clade.to_phylogeny`; keyword arguments are forwarded to that call
        and set as attributes on the resulting `Phylogeny`.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        The keyword arguments become the container's ``attributes`` dict.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Returns an empty `MultipleSeqAlignment` when no aligned sequence is
        found. Sequences without a ``mol_seq`` are treated as not aligned.
        """
        def is_aligned_seq(elem):
            # ``mol_seq`` defaults to None on Sequence, so guard against it
            # before reading ``is_aligned``.
            return bool(isinstance(elem, Sequence)
                        and elem.mol_seq is not None
                        and elem.mol_seq.is_aligned)

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        # Pull the first match off the iterator; the remaining items stay in
        # ``seqs`` for the ``extend`` call below.
        for first_seq in seqs:
            break
        else:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: `Clade.confidence`, `Clade.taxonomy`
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        ``None`` clears the list; a float or int is wrapped in `Confidence`.
        Raises ValueError for any other type, or when several confidence
        values already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendents, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects
    """

    def __init__(self,
            # Attributes
            branch_length=None, id_source=None,
            # Child nodes
            name=None, width=None, color=None, node_id=None, events=None,
            binary_characters=None, date=None,
            # Collections
            confidences=None, taxonomies=None, sequences=None,
            distributions=None, references=None, properties=None, clades=None,
            other=None,
            ):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Sub-clades are converted recursively. Keyword arguments are the usual
        PhyloXML Clade constructor parameters; they are applied last, with
        ``setattr``, so ``confidence=...`` and ``taxonomy=...`` go through
        the corresponding properties.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        new_clade.color = (BranchColor(
            clade.color.red, clade.color.green, clade.color.blue)
            if clade.color else None)
        # Writing into ``__dict__`` directly would hide 'confidence' and
        # 'taxonomy' entries behind the class-level properties.
        for key, value in kwargs.items():
            setattr(new_clade, key, value)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The new `Phylogeny` is rooted at this clade and takes its ``date``;
        keyword arguments are then set as attributes on it.
        """
        phy = Phylogeny(root=self, date=self.date)
        for key, value in kwargs.items():
            setattr(phy, key, value)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the only confidence, or None if there are none.

        Raises AttributeError when more than one confidence exists.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        ``None`` clears the list; a float or int is wrapped in `Confidence`.
        Raises ValueError for any other type, or when several confidence
        values already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # The list lives on this Clade, so point the user there.
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there are none.

        Raises AttributeError when more than one taxonomy exists.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy value.

        Raises ValueError if ``value`` is not a `Taxonomy` or if several
        taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # The list lives on this Clade, so point the user there.
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)
416
# PhyloXML wrapper for a special BaseTree attribute

class BranchColor(PhyloElement, BaseTree.BranchColor):
    """PhyloXML flavour of `BaseTree.BranchColor`.

    Adds `PhyloElement` as a base class; construction is delegated unchanged
    to `BaseTree.BranchColor`.
    """

    def __init__(self, *args, **kwargs):
        # Call the BaseTree initialiser explicitly, forwarding all arguments.
        BaseTree.BranchColor.__init__(self, *args, **kwargs)
423
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: In ``UniProtKB:P17304``, the Accession instance attribute ``value``
    is 'P17304' and the ``source`` attribute is 'UniProtKB'.
    """

    def __init__(self, value, source):
        self.source = source
        self.value = value

    def __str__(self):
        """Return the identifier formatted as ``source:value``."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
440
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    It is recommended to annotate by using the optional 'ref' attribute.
    A `PhyloXMLWarning` is issued if ``ref`` does not match ``re_ref``.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        type
            stored unchanged on the instance
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link
    """
    # Expected shape of ``ref``: "<prefix>:<local part>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
            # Attributes
            ref=None, source=None, evidence=None, type=None,
            # Child nodes
            desc=None, confidence=None, uri=None,
            # Collection
            properties=None):
        # Validate first so the warning precedes any state change.
        _check_str(ref, self.re_ref.match)
        # Attributes
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        # Child nodes
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        # Collection
        self.properties = properties or []
482
class BinaryCharacters(PhyloElement):
    """Names and/or counts of binary characters at the root of a clade.

    Covers the characters that are present, absent, gained and lost. Each of
    the four collections defaults to an empty list; the type and the four
    counts are stored as given.
    """

    def __init__(self,
            # Attributes
            type=None, gained_count=None, lost_count=None, present_count=None,
            absent_count=None,
            # Child nodes (flattened into collections)
            gained=None, lost=None, present=None, absent=None):
        # Attributes
        self.type = type
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Collections
        self.gained = gained or []
        self.lost = lost or []
        self.present = present or []
        self.absent = absent or []
503
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    For example, this could be used to describe multiple parents of a clade.

    :Parameters:
        type : str
            kind of relationship
        id_ref_0 : str
            reference to the first clade
        id_ref_1 : str
            reference to the second clade
        distance : str
            distance associated with the relationship
        confidence : Confidence
            support for the relationship
    """

    def __init__(self, type, id_ref_0, id_ref_1,
            distance=None, confidence=None):
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.distance = distance
        self.confidence = confidence
525
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    Comparison and arithmetic operators act on ``value``; arithmetic returns
    plain numbers, not new `Confidence` objects.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'
    """

    def __init__(self, value, type='unknown'):
        self.value = value
        self.type = type

    # Comparison operators

    def __hash__(self):
        """Return an identity-based hash of the object.

        NOTE(review): ``__eq__`` compares by ``value`` while the hash is
        based on ``id(self)``, so two equal Confidence objects hash
        differently. ``value`` is mutable, so the identity hash is kept to
        avoid changing dict/set behaviour for existing callers.
        """
        return id(self)

    def __eq__(self, other):
        if isinstance(other, Confidence):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        if isinstance(other, Confidence):
            return self.value != other.value
        return self.value != other

    # Ordering -- see functools.total_ordering in Py2.7

    def __lt__(self, other):
        if isinstance(other, Confidence):
            return self.value < other.value
        return self.value < other

    def __le__(self, other):
        return self < other or self == other

    def __gt__(self, other):
        return not (self <= other)

    def __ge__(self, other):
        # Defined through __lt__, like the other derived comparisons.
        return not (self < other)

    # Arithmetic operators, including reverse

    def __add__(self, other):
        return self.value + other

    def __radd__(self, other):
        return other + self.value

    def __sub__(self, other):
        return self.value - other

    def __rsub__(self, other):
        return other - self.value

    def __mul__(self, other):
        return self.value * other

    def __rmul__(self, other):
        return other * self.value

    def __div__(self, other):
        """Classic division (Python 2 without ``__future__.division``).

        Uses the operator rather than calling ``value.__div__`` directly:
        the direct call returns NotImplemented for mixed operand types
        (e.g. an int value divided by a float), which ends in a TypeError.
        """
        return self.value / other

    def __rdiv__(self, other):
        return other / self.value

    def __truediv__(self, other):
        """Rational-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        return self.value / other

    def __rtruediv__(self, other):
        return other / self.value

    def __floordiv__(self, other):
        """C-style and old-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.

        Uses the ``//`` operator so that mixed int/float operands and
        Confidence operands are handled by the normal operator dispatch.
        """
        return self.value // other

    def __rfloordiv__(self, other):
        return other // self.value

    def __mod__(self, other):
        return self.value % other

    def __rmod__(self, other):
        return other % self.value

    def __divmod__(self, other):
        return divmod(self.value, other)

    def __rdivmod__(self, other):
        return divmod(other, self.value)

    def __pow__(self, other, modulo=None):
        if modulo is not None:
            return pow(self.value, other, modulo)
        return pow(self.value, other)

    def __rpow__(self, other):
        return pow(other, self.value)

    # Unary arithmetic operations: -, +, abs()

    def __neg__(self):
        return -self.value

    def __pos__(self):
        return self.value

    def __abs__(self):
        return abs(self.value)

    # Explicit coercion to numeric types: int, long, float

    def __float__(self):
        return float(self.value)

    def __int__(self):
        return int(self.value)

    def __long__(self):
        return long(self.value)
668
class Date(PhyloElement):
    """A date associated with a clade/node.

    Its value can be numerical by using the 'value' element and/or free text
    with the 'desc' element (e.g. 'Silurian'). If a numerical value is used,
    it is recommended to employ the 'unit' attribute.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
            minimum=None, maximum=None):
        self.value = value
        self.minimum = minimum
        self.maximum = maximum
        self.unit = unit
        self.desc = desc

    def __str__(self):
        """Return a human-readable date.

        Prefers "<value> <unit>" when a unit and a value are both set, then
        the free-text description, and finally falls back to the class name.
        """
        has_numeric_form = self.unit and self.value is not None
        if has_numeric_form:
            return '%s %s' % (self.value, self.unit)
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
704
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Intended for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions
    """

    def __init__(self, desc=None, points=None, polygons=None):
        # Both collections fall back to a fresh empty list.
        self.polygons = polygons or []
        self.points = points or []
        self.desc = desc
723
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein; an empty list if not given
    """

    def __init__(self, length=None, domains=None):
        self.length = length
        # Normalise to a list, as every other collection argument in this
        # module does, so ``domains`` can always be iterated.
        self.domains = domains or []
737
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.

    Only instance attributes count as keys: methods and class-level
    attributes such as ``ok_type`` are never reported by ``keys()``,
    ``in`` or item lookup.
    """
    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
        'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
            losses=None, confidence=None):
        # Warn (PhyloXMLWarning) if ``type`` is given but not in ``ok_type``.
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """Return (name, value) pairs for every attribute that is not None."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """Return the names of every attribute that is not None."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """Return every attribute value that is not None."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        """Return the number of attributes that are not None."""
        return len(self.values())

    def __getitem__(self, key):
        """Return the value stored under ``key``.

        Raises KeyError if ``key`` is not an instance attribute or if its
        value is None.
        """
        # Look in the instance dict, the same source keys() uses, so that
        # method names and class attributes are not mistaken for keys.
        if key not in self.__dict__:
            raise KeyError(key)
        val = self.__dict__[key]
        if val is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return val

    def __setitem__(self, key, val):
        setattr(self, key, val)

    def __delitem__(self, key):
        # Deleting resets the attribute to None rather than removing it.
        setattr(self, key, None)

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, key):
        """Return True if ``key`` is an instance attribute that is not None."""
        return self.__dict__.get(key) is not None
789
class Id(PhyloElement):
    """A general-purpose identifier element.

    Makes it possible to state the provider (or authority) of an identifier,
    e.g. NCBI, along with the value itself.
    """

    def __init__(self, value, provider=None):
        self.provider = provider
        self.value = value

    def __str__(self):
        """Return ``provider:value``, or just ``value`` without a provider."""
        if self.provider is None:
            return self.value
        return '%s:%s' % (self.provider, self.value)
805
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    A `PhyloXMLWarning` is issued if ``value`` does not match ``re_value``.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)
    """
    # Letters plus the '.', '-', '?', '*' and '_' symbols.
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # Validate first so the warning precedes any state change.
        _check_str(value, self.re_value.match)
        self.is_aligned = is_aligned
        self.value = value

    def __str__(self):
        """Return the stored sequence value."""
        return self.value
826
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        self.geodetic_datum = geodetic_datum
        # Horizontal position
        self.lat = lat
        self.long = long
        # Optional vertical position
        self.alt = alt
        self.alt_unit = alt_unit
852
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        # A falsy argument is replaced by a fresh empty list.
        self.points = points or []

    def __str__(self):
        """Return the class name wrapping the string form of each point."""
        rendered = [str(point) for point in self.points]
        class_name = self.__class__.__name__
        return '%s([%s])' % (class_name, ',\n'.join(rendered))
865
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.
    A `PhyloXMLWarning` is issued for each of ``ref``, ``applies_to``,
    ``datatype`` and ``unit`` that fails its check.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch
            of a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows a property to be attached specifically to one element (on
            the xml-level)
    """
    # Expected shape of ``ref`` and ``unit``: "<prefix>:<local part>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set([
        'phylogeny', 'clade', 'node', 'annotation', 'parent_branch', 'other',
    ])
    ok_datatype = set([
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float', 'xsd:double',
        'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
        'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
        'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
        'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger',
    ])

    def __init__(self, value, ref, applies_to, datatype,
            unit=None, id_ref=None):
        # Run the checks in a fixed order so warnings appear predictably.
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        # Required
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
        # Optional
        self.unit = unit
        self.id_ref = id_ref
916
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value
            stored unchanged; used as the ``id`` of the SeqFeature built by
            `to_seqfeature`
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's ``id`` becomes ``value``, its non-fuzzy location gives
        ``start`` and ``end``, and the 'confidence' qualifier (if any) is
        carried over. Instantiates ``cls`` so subclasses get their own type.
        """
        return cls(feat.id,
                feat.location.nofuzzy_start,
                feat.location.nofuzzy_end,
                confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning ``start``..``end`` with id ``value``."""
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        # __init__ always sets the attribute, so this is normally true and a
        # None confidence is stored as a None-valued qualifier.
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
957
class Reference(PhyloElement):
    """Literature reference for a clade.

    NB: Whenever possible, use the ``doi`` attribute instead of the free-text
    ``desc`` element. A `PhyloXMLWarning` is issued if ``doi`` does not match
    ``re_doi``.
    """
    # Expected shape of ``doi``: "<prefix>/<suffix>"
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        # Validate first so the warning precedes any state change.
        _check_str(doi, self.re_doi.match)
        self.desc = desc
        self.doi = doi
971
972 973 -class Sequence(PhyloElement):
974 """A molecular sequence (Protein, DNA, RNA) associated with a node. 975 976 One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the 977 taxonomy's ``id_source``) in case of multiple sequences and taxonomies per 978 node. 979 980 :Parameters: 981 type : {'dna', 'rna', 'protein'} 982 type of molecule this sequence represents 983 id_ref : string 984 reference to another resource 985 id_source : string 986 source for the reference 987 symbol : string 988 short symbol of the sequence, e.g. 'ACTM' (max. 10 chars) 989 accession : Accession 990 accession code for this sequence. 991 name : string 992 full name of the sequence, e.g. 'muscle Actin' 993 location 994 location of a sequence on a genome/chromosome. 995 mol_seq : MolSeq 996 the molecular sequence itself 997 uri : Uri 998 link 999 annotations : list of Annotation objects 1000 annotations on this sequence 1001 domain_architecture : DomainArchitecture 1002 protein domains on this sequence 1003 other : list of Other objects 1004 non-phyloXML elements 1005 """ 1006 alphabets = {'dna': Alphabet.generic_dna, 1007 'rna': Alphabet.generic_rna, 1008 'protein': Alphabet.generic_protein} 1009 re_symbol = re.compile(r'\S{1,10}') 1010
1011 - def __init__(self, 1012 # Attributes 1013 type=None, id_ref=None, id_source=None, 1014 # Child nodes 1015 symbol=None, accession=None, name=None, location=None, 1016 mol_seq=None, uri=None, domain_architecture=None, 1017 # Collections 1018 annotations=None, other=None, 1019 ):
1020 _check_str(type, self.alphabets.__contains__) 1021 _check_str(symbol, self.re_symbol.match) 1022 self.type = type 1023 self.id_ref = id_ref 1024 self.id_source = id_source 1025 self.symbol = symbol 1026 self.accession = accession 1027 self.name = name 1028 self.location = location 1029 self.mol_seq = mol_seq 1030 self.uri = uri 1031 self.domain_architecture = domain_architecture 1032 self.annotations = annotations or [] 1033 self.other = other or []
@classmethod
def from_seqrecord(cls, record, is_aligned=None):
    """Create a new PhyloXML Sequence from a SeqRecord object.

    :Parameters:
        record : SeqRecord
            source record; its ``id``, ``name``, ``description``, ``seq``,
            ``annotations`` and ``features`` are read
        is_aligned : bool
            passed on to MolSeq; if None, it is set to True exactly when
            ``record.seq.alphabet`` is an ``Alphabet.Gapped`` instance

    The keys 'id_ref', 'id_source', 'location', 'uri' and 'annotations' of
    ``record.annotations`` are unpacked when present. If the record has
    features, each one is converted with ``ProteinDomain.from_seqfeature``
    and collected into a DomainArchitecture.

    Returns an instance of ``cls``, so subclasses get their own type.
    """
    alphabet = record.seq.alphabet
    if is_aligned is None:
        is_aligned = isinstance(alphabet, Alphabet.Gapped)
    params = {
            'accession': Accession(record.id, ''),
            'symbol': record.name,
            'name': record.description,
            'mol_seq': MolSeq(str(record.seq), is_aligned),
            }
    # 'type' is left unset when the alphabet is none of these three.
    if isinstance(alphabet, Alphabet.DNAAlphabet):
        params['type'] = 'dna'
    elif isinstance(alphabet, Alphabet.RNAAlphabet):
        params['type'] = 'rna'
    elif isinstance(alphabet, Alphabet.ProteinAlphabet):
        params['type'] = 'protein'

    # Unpack record.annotations
    for key in ('id_ref', 'id_source', 'location'):
        if key in record.annotations:
            params[key] = record.annotations[key]
    if isinstance(record.annotations.get('uri'), dict):
        params['uri'] = Uri(**record.annotations['uri'])
    # Build the list of Annotation objects
    if record.annotations.get('annotations'):
        params['annotations'] = []
        for annot in record.annotations['annotations']:
            ann_args = {}
            for key in ('ref', 'source', 'evidence', 'type', 'desc'):
                if key in annot:
                    ann_args[key] = annot[key]
            if isinstance(annot.get('confidence'), list):
                ann_args['confidence'] = Confidence(
                        *annot['confidence'])
            if isinstance(annot.get('properties'), list):
                # Entries that are not dicts are silently skipped.
                ann_args['properties'] = [Property(**prop)
                        for prop in annot['properties']
                        if isinstance(prop, dict)]
            params['annotations'].append(Annotation(**ann_args))

    # Unpack record.features
    if record.features:
        params['domain_architecture'] = DomainArchitecture(
                length=len(record.seq),
                domains=[ProteinDomain.from_seqfeature(feat)
                         for feat in record.features])

    return cls(**params)
1084
def to_seqrecord(self):
    """Create a SeqRecord object from this Sequence instance.

    ``self.mol_seq`` must be set, since its ``value`` is read
    unconditionally. The record's ``id``, ``name`` and ``description`` are
    taken from ``accession``, ``symbol`` and ``name``; any of these that is
    None is not passed to SeqRecord at all.

    The seqrecord.annotations dictionary is packed like so (items whose
    value is None are omitted)::

        { # Sequence attributes with no SeqRecord equivalent:
          'id_ref':     self.id_ref,
          'id_source':  self.id_source,
          'location':   self.location,
          'uri':        { 'value': self.uri.value,
                          'desc': self.uri.desc,
                          'type': self.uri.type },
          # Sequence.annotations attribute (list of Annotations)
          'annotations': [{ 'ref':      ann.ref,
                            'source':   ann.source,
                            'evidence': ann.evidence,
                            'type':     ann.type,
                            'desc':     ann.desc,
                            'confidence': [ ann.confidence.value,
                                            ann.confidence.type ],
                            'properties': [{ 'value': prop.value,
                                             'ref': prop.ref,
                                             'applies_to': prop.applies_to,
                                             'datatype':   prop.datatype,
                                             'unit':       prop.unit,
                                             'id_ref':     prop.id_ref }
                                           for prop in ann.properties],
                          } for ann in self.annotations],
        }
    """
    def clean_dict(dct):
        """Remove None-valued items from a dictionary."""
        # items() rather than iteritems(): works on Python 2 and 3 alike.
        return dict((key, val) for key, val in dct.items()
                    if val is not None)

    # str(None) would yield the literal id 'None' and slip past
    # clean_dict; keep a missing accession as None so it is dropped.
    seq_id = None
    if self.accession is not None:
        seq_id = str(self.accession)

    seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
            **clean_dict({
                'id':    seq_id,
                'name':  self.symbol,
                'description': self.name,
                # 'dbxrefs': None,
                }))
    if self.domain_architecture:
        seqrec.features = [dom.to_seqfeature()
                           for dom in self.domain_architecture.domains]
    # Sequence attributes with no SeqRecord equivalent
    seqrec.annotations = clean_dict({
            'id_ref':    self.id_ref,
            'id_source': self.id_source,
            'location':  self.location,
            'uri':       self.uri and clean_dict({
                            'value': self.uri.value,
                            'desc':  self.uri.desc,
                            'type':  self.uri.type,
                            }),
            'annotations': self.annotations and [
                clean_dict({
                    'ref':          ann.ref,
                    'source':       ann.source,
                    'evidence':     ann.evidence,
                    'type':         ann.type,
                    # from_seqrecord reads a 'desc' key, so emit it here to
                    # keep the round trip lossless; getattr guards against
                    # annotation objects that lack the attribute.
                    'desc':         getattr(ann, 'desc', None),
                    'confidence':   ann.confidence and [
                                        ann.confidence.value,
                                        ann.confidence.type],
                    'properties':   [clean_dict({
                                        'value':      prop.value,
                                        'ref':        prop.ref,
                                        'applies_to': prop.applies_to,
                                        'datatype':   prop.datatype,
                                        'unit':       prop.unit,
                                        'id_ref':     prop.id_ref })
                                     for prop in ann.properties],
                    }) for ann in self.annotations],
            })
    return seqrec
1159
def get_alphabet(self):
    """Return the alphabet for this sequence's molecule type.

    The type is looked up in ``self.alphabets``, falling back to
    ``Alphabet.generic_alphabet`` for unknown or unset types. When a
    ``mol_seq`` is present and flagged as aligned, the result is wrapped
    in ``Alphabet.Gapped``.
    """
    base = self.alphabets.get(self.type, Alphabet.generic_alphabet)
    if not (self.mol_seq and self.mol_seq.is_aligned):
        return base
    return Alphabet.Gapped(base)
1165
class SequenceRelation(PhyloElement):
    """A typed relationship between two sequences.

    An orthology, for instance, is expressed by setting the 'type'
    attribute to 'orthology'.

    :Parameters:
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        type : restricted string
            describe the type of relationship
        confidence : Confidence
            confidence value for this relation
    """
    # Relationship types accepted without a warning.
    ok_type = set([
        'orthology',
        'one_to_one_orthology',
        'super_orthology',
        'paralogy',
        'ultra_paralogy',
        'xenology',
        'unknown',
        'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
            distance=None, confidence=None):
        """Store the relation, warning if ``type`` is not in ``ok_type``."""
        _check_str(type, lambda value: value in self.ok_type)
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
1196
class Taxonomy(PhyloElement):
    """Describe taxonomic information for a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            keep the authority, such as 'J. G. Cooper, 1863', associated with
            the 'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements
    """
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
    ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
        'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
        'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
        'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
        'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
        'unknown', 'other'))

    def __init__(self,
            # Attributes
            id_source=None,
            # Child nodes
            id=None, code=None, scientific_name=None, authority=None,
            rank=None, uri=None,
            # Collections
            common_names=None, synonyms=None, other=None,
            ):
        """Store the taxonomy's attributes, warning about bad values.

        ``code`` is checked against ``re_code`` and ``rank`` against
        ``ok_rank``; a failed check only emits a warning through
        ``_check_str``, and the value is stored regardless. The three
        collection arguments are replaced by a new empty list when they
        are None (or otherwise false).
        """
        def is_valid_code(text):
            # re_code.match() anchors only at the start of the string, so
            # on its own it would accept a code longer than 10 characters
            # or one with trailing junk. Require the match to consume the
            # entire string.
            found = self.re_code.match(text)
            return found is not None and found.end() == len(text)

        _check_str(code, is_valid_code)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Return an identifying attribute, or the class name if none is set.

        The first of ``code``, ``scientific_name``, ``rank`` and ``id`` that
        is not None is used; ``id`` is converted with ``str()``.
        """
        if self.code is not None:
            return self.code
        if self.scientific_name is not None:
            return self.scientific_name
        if self.rank is not None:
            return self.rank
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1272
class Uri(PhyloElement):
    """A uniform resource identifier.

    Usually this will be a URL -- for example a link to an image on a
    website, where the ``type`` attribute might be 'image' and ``desc``
    might be 'image of a California sea hare'.
    """
    def __init__(self, value, desc=None, type=None):
        """Store the identifier with its optional description and type."""
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return ``value`` if it is non-empty, otherwise ``repr(self)``."""
        return self.value or repr(self)
1290