Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See Also 
   9  -------- 
  10  Official specification: 
  11     http://phyloxml.org/ 
  12  Journal article: 
  13      Han and Zmasek (2009), doi:10.1186/1471-2105-10-356 
  14  """ 
  15  __docformat__ = "restructuredtext en" 
  16   
  17  import re 
  18  import warnings 
  19   
  20  from Bio._py3k import basestring 
  21   
  22  from Bio import Alphabet 
  23  from Bio.Align import MultipleSeqAlignment 
  24  from Bio.Seq import Seq 
  25  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  26  from Bio.SeqRecord import SeqRecord 
  27  from Bio import BiopythonWarning 
  28   
  29  from Bio.Phylo import BaseTree 
class PhyloXMLWarning(BiopythonWarning):
    """Warning category for data that does not comply with the phyloXML spec."""
35
36 37 -def _check_str(text, testfunc):
38 """Check a string using testfunc, and warn if there's no match.""" 39 if text is not None and not testfunc(text): 40 warnings.warn("String %s doesn't match the given regexp" % text, 41 PhyloXMLWarning, stacklevel=2)
42
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common base class shared by all PhyloXML objects."""
48
class Phyloxml(PhyloElement):
    """Root node of a PhyloXML document.

    Holds any number of Phylogeny elements, optionally followed by elements
    from other XML namespaces.

    :Parameters:
        attributes : dict
            XML namespace definitions; merged over the built-in defaults
        phylogenies : list
            The phylogenetic trees
        other : list
            Arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        # Begin with the standard namespace declarations; anything supplied
        # by the caller overrides or extends them.
        merged = {
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",  # standard
            "xmlns": "http://www.phyloxml.org",
            "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
        }
        if attributes:
            merged.update(attributes)
        self.attributes = merged
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Look up a phylogeny by integer index, slice, or tree name."""
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        # Name lookup: the first tree whose name matches wins.
        for candidate in self.phylogenies:
            if candidate.name == index:
                return candidate
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate over the phylogenetic trees held by this object."""
        return iter(self.phylogenies)

    def __len__(self):
        """Return how many phylogenetic trees this object holds."""
        return len(self.phylogenies)

    def __str__(self):
        """Return the class name wrapping the string form of each tree."""
        joined = ',\n'.join(str(tree) for tree in self.phylogenies)
        return '%s([%s])' % (self.__class__.__name__, joined)
98
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    An Other object normally carries either a 'value' or a non-empty list of
    'children', not both; nothing here enforces that.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        self.tag = tag
        self.namespace = namespace
        # Falsy inputs are replaced with fresh containers so instances never
        # share a mutable default.
        self.attributes = attributes or {}
        self.value = value
        self.children = children or []

    def __iter__(self):
        """Iterate over this node's children (possibly none)."""
        return iter(self.children)
130
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        type : string
            stored unchanged as the ``type`` attribute of this tree
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)
    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None,
                 ):
        # NOTE(review): this check disappears under ``python -O``; it is kept
        # as an assert so the exception type callers may see is unchanged.
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        Keyword arguments are the usual `Phylogeny` constructor parameters.
        They are written straight into the new object's ``__dict__``, so a
        name that is a class-level property (``confidence``) is shadowed by
        the property and has no visible effect.
        """
        # Wrap a non-None tree id in an Id element; leave a missing id as None.
        tree_id = Id(str(tree.id)) if tree.id is not None else None
        phy = cls(
            root=Clade.from_clade(tree.root),
            rooted=tree.rooted,
            name=tree.name,
            id=tree_id)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is first converted with `Clade.from_clade`, then wrapped via
        `Clade.to_phylogeny`. Keyword arguments are forwarded to
        `Clade.to_phylogeny`, i.e. they become attributes of the resulting
        Phylogeny (not of the clade).
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        Keyword arguments are passed as the container's XML attributes.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Returns an empty `MultipleSeqAlignment` when no aligned sequence is
        found; otherwise the alphabet of the first aligned sequence is used
        for the whole alignment.
        """
        def is_aligned_seq(elem):
            if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
                return True
            return False

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        try:
            first_seq = next(seqs)
        except StopIteration:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: `Clade.confidence`, `Clade.taxonomy`
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        None clears the list; a bare int/float is wrapped in `Confidence`.
        Raises ValueError for any other type, or when several confidences
        already exist (so it is unclear which one to replace).
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        count = len(self.confidences)
        if count == 0:
            self.confidences.append(value)
        elif count == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendants, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects
    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None,
                 ):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Sub-clades are converted recursively. The source clade's confidence is
        assigned through the ``confidence`` property, so it must be None, a
        number, or a `Confidence` (otherwise ValueError is raised).

        Keyword arguments are the usual PhyloXML Clade constructor parameters;
        they are written directly into the new clade's ``__dict__``.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        # Re-wrap the color in the PhyloXML flavor of BranchColor, if any.
        new_clade.color = (BranchColor(
            clade.color.red, clade.color.green, clade.color.blue)
            if clade.color else None)
        new_clade.__dict__.update(kwargs)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The new Phylogeny uses this clade as root and inherits its ``date``;
        keyword arguments are written into the Phylogeny's ``__dict__``.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the sole confidence, None if there is none.

        Raises AttributeError when more than one confidence exists.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the sole confidence value.

        None clears the list; a bare int/float is wrapped in `Confidence`.
        Raises ValueError for any other type, or when several confidences
        already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        count = len(self.confidences)
        if count == 0:
            self.confidences.append(value)
        elif count == 1:
            self.confidences[0] = value
        else:
            # Point at the attribute on *this* class, not Phylogeny.
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the sole taxonomy, None if there is none.

        Raises AttributeError when more than one taxonomy exists.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the sole taxonomy.

        Raises ValueError if ``value`` is not a `Taxonomy`, or when several
        taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        count = len(self.taxonomies)
        if count == 0:
            self.taxonomies.append(value)
        elif count == 1:
            self.taxonomies[0] = value
        else:
            # Point at the attribute on *this* class, not Phylogeny.
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)
424
# PhyloXML wrapper for a special BaseTree attribute

class BranchColor(PhyloElement, BaseTree.BranchColor):
    """PhyloXML flavor of `BaseTree.BranchColor`."""

    def __init__(self, *args, **kwargs):
        # Construction is delegated unchanged to the BaseTree implementation.
        BaseTree.BranchColor.__init__(self, *args, **kwargs)
431
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: for ``UniProtKB:P17304`` the ``value`` attribute of the Accession
    instance is 'P17304' and its ``source`` attribute is 'UniProtKB'.
    """

    def __init__(self, value, source):
        self.value = value
        self.source = source

    def __str__(self):
        """Return the accession formatted as ``source:value``."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
448
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    Annotating through the optional 'ref' attribute is recommended.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        type : string
            stored unchanged as the ``type`` attribute
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link
    """
    # Expected "prefix:identifier" shape of ``ref``; applied with ``match``,
    # i.e. checked from the start of the string only.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        # A malformed ref only produces a PhyloXMLWarning; it is still stored.
        _check_str(ref, self.re_ref.match)
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        self.properties = properties or []
490
class BinaryCharacters(PhyloElement):
    """Names and/or counts of binary characters at the root of a clade.

    Covers the characters that are present, absent, gained, and lost. The
    four ``*_count`` values are stored as given; the four name collections
    default to empty lists.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        self.type = type
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        self.gained = gained or []
        self.lost = lost or []
        self.present = present or []
        self.absent = absent or []
511
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    One possible use is describing multiple parents of a clade.

    @type id_ref_0: str
    @type id_ref_1: str
    @type distance: str
    @type type: str

    @type confidence: Confidence
    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        # All values are stored as given; no validation is performed here.
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
532
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    Instances compare and do arithmetic through their ``value``; arithmetic
    results are plain numbers, not new Confidence objects.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'
    """

    def __init__(self, value, type='unknown'):
        self.value = value
        self.type = type

    # Comparison operators

    def __hash__(self):
        """Return an identity-based hash of this object.

        NOTE(review): equality below is value-based, so two Confidence objects
        that compare equal generally have different hashes. The identity hash
        is kept so existing dict/set behavior does not change.
        """
        return id(self)

    def __eq__(self, other):
        """Compare by value against another Confidence or a plain number."""
        if isinstance(other, Confidence):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        """Inverse of `__eq__`, spelled out for Python 2."""
        if isinstance(other, Confidence):
            return self.value != other.value
        return self.value != other

    # Ordering -- see functools.total_ordering in Py2.7

    def __lt__(self, other):
        """Order by value against another Confidence or a plain number."""
        if isinstance(other, Confidence):
            return self.value < other.value
        return self.value < other

    def __le__(self, other):
        return self < other or self == other

    def __gt__(self, other):
        return not (self <= other)

    def __ge__(self, other):
        # Defined through __lt__ so Confidence operands are unwrapped the same
        # way as in the other comparisons.
        return not (self < other)

    # Arithmetic operators, including reverse

    def __add__(self, other):
        return self.value + other

    def __radd__(self, other):
        return other + self.value

    def __sub__(self, other):
        return self.value - other

    def __rsub__(self, other):
        return other - self.value

    def __mul__(self, other):
        return self.value * other

    def __rmul__(self, other):
        return other * self.value

    def __div__(self, other):
        """Classic division hook (Python 2 without ``__future__.division``).

        Uses the operator rather than calling ``value.__div__`` directly: the
        direct call returns the ``NotImplemented`` singleton for mixed
        operands (e.g. int value, float divisor) instead of dividing.
        """
        return self.value / other

    def __rdiv__(self, other):
        return other / self.value

    def __truediv__(self, other):
        """Rational-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        return self.value / other

    def __rtruediv__(self, other):
        return other / self.value

    def __floordiv__(self, other):
        """C-style and old-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.

        Uses the ``//`` operator so mixed int/float operands are coerced; a
        direct ``value.__floordiv__(other)`` call returns ``NotImplemented``
        in that case.
        """
        return self.value // other

    def __rfloordiv__(self, other):
        return other // self.value

    def __mod__(self, other):
        return self.value % other

    def __rmod__(self, other):
        return other % self.value

    def __divmod__(self, other):
        return divmod(self.value, other)

    def __rdivmod__(self, other):
        return divmod(other, self.value)

    def __pow__(self, other, modulo=None):
        if modulo is not None:
            return pow(self.value, other, modulo)
        return pow(self.value, other)

    def __rpow__(self, other):
        return pow(other, self.value)

    # Unary arithmetic operations: -, +, abs()

    def __neg__(self):
        return -self.value

    def __pos__(self):
        return self.value

    def __abs__(self):
        return abs(self.value)

    # Explicit coercion to numeric types: int, long, float

    def __float__(self):
        return float(self.value)

    def __int__(self):
        return int(self.value)

    def __long__(self):
        # Python 2 only hook; ``long`` does not exist on Python 3, where this
        # method is never invoked by the interpreter.
        return long(self.value)
675
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date may be numerical (the 'value' element) and/or free text (the
    'desc' element, e.g. 'Silurian'). When a numerical value is given, using
    the 'unit' attribute as well is recommended.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        self.value = value
        self.unit = unit
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Return a human-readable date, or the class name as a fallback.

        Preference order: "value unit" when both are usable, then ``desc``,
        then the bare class name.
        """
        has_numeric = self.unit and self.value is not None
        if has_numeric:
            return '%s %s' % (self.value, self.unit)
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
711
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Meant for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions
    """

    def __init__(self, desc=None, points=None, polygons=None):
        self.desc = desc
        # Falsy collections are replaced by fresh empty lists.
        self.points = points or []
        self.polygons = polygons or []
730
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein (defaults to an empty list)
    """

    def __init__(self, length=None, domains=None):
        self.length = length
        # Normalize a missing/falsy value to a fresh list, as the other
        # collection attributes in this module do, so ``domains`` can always
        # be iterated.
        self.domains = domains or []
744
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.

    Only instance attributes count as keys: methods and class-level
    attributes such as ``ok_type`` are never reported by the mapping
    interface.
    """
    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
                   'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        # An unrecognized type only triggers a PhyloXMLWarning.
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """Return (name, value) pairs for attributes that are not None."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """Return the names of attributes that are not None."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """Return the attribute values that are not None."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        """Return the number of attributes that are not None."""
        # TODO - Better way to do this?
        return len(self.values())

    def __getitem__(self, key):
        """Return the value stored under ``key``.

        Raises KeyError when ``key`` is not an instance attribute or its
        value is None. Looks in the instance ``__dict__`` (like `keys`), so
        methods and class attributes are not exposed as items.
        """
        if key not in self.__dict__:
            raise KeyError(key)
        val = self.__dict__[key]
        if val is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return val

    def __setitem__(self, key, val):
        setattr(self, key, val)

    def __delitem__(self, key):
        # Deleting resets to None rather than removing the attribute.
        setattr(self, key, None)

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, key):
        """Return True if ``key`` is an instance attribute that is not None."""
        return self.__dict__.get(key) is not None
797
class Id(PhyloElement):
    """A general-purpose identifier element.

    Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
    along with the value itself.
    """

    def __init__(self, value, provider=None):
        self.value = value
        self.provider = provider

    def __str__(self):
        """Return ``provider:value`` if a provider is set, else the value.

        A non-string value (e.g. an integer id) is converted with ``str`` so
        that ``str(id_obj)`` does not raise TypeError; string values are
        returned unchanged.
        """
        if self.provider is not None:
            return '%s:%s' % (self.provider, self.value)
        if isinstance(self.value, basestring):
            return self.value
        return str(self.value)
813
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)
    """
    # Characters expected in a sequence; applied with ``match``, i.e. checked
    # from the start of the string only.
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # A non-conforming value only produces a PhyloXMLWarning.
        _check_str(value, self.re_value.match)
        self.value = value
        self.is_aligned = is_aligned

    def __str__(self):
        """Return the stored sequence value as-is."""
        return self.value
834
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        # Values are stored as given; no range or type checks happen here.
        self.geodetic_datum = geodetic_datum
        self.lat = lat
        self.long = long
        self.alt = alt
        self.alt_unit = alt_unit
860
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        # The "3 or more" expectation is not enforced here.
        self.points = points or []

    def __str__(self):
        """Return the class name wrapping the string form of each point."""
        vertices = ',\n'.join(str(point) for point in self.points)
        return '%s([%s])' % (self.__class__.__name__, vertices)
873
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch
            of a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows attaching a property specifically to one element (on the
            xml-level)
    """
    # "prefix:identifier" shape expected for both ``ref`` and ``unit``.
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
                         'parent_branch', 'other'))
    ok_datatype = set(('xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
        'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
        'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
        'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long', 'xsd:int',
        'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger', 'xsd:unsignedLong',
        'xsd:unsignedInt', 'xsd:unsignedShort', 'xsd:unsignedByte',
        'xsd:positiveInteger'))

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        # Each failed check only emits a PhyloXMLWarning; the values are
        # stored regardless.
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        self.unit = unit
        self.id_ref = id_ref
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
924
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value
            content of the domain element; mapped to and from
            ``SeqFeature.id`` by the converters below
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        Uses the feature's id as ``value``, its non-fuzzy start/end as the
        location, and the 'confidence' qualifier (if present) as confidence.
        Instantiates ``cls`` so that subclasses get instances of themselves.
        """
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning start..end with ``value`` as its id."""
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        # NOTE(review): __init__ always sets ``confidence``, so this also
        # copies a None confidence into the qualifiers.
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
965
class Reference(PhyloElement):
    """Literature reference for a clade.

    NB: Prefer the ``doi`` attribute over the free-text ``desc`` element
    whenever possible.
    """
    # "prefix/suffix" shape expected for a DOI; applied with ``match``, i.e.
    # checked from the start of the string only.
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        # A DOI that fails the check only produces a PhyloXMLWarning.
        _check_str(doi, self.re_doi.match)
        self.doi = doi
        self.desc = desc
979
class Sequence(PhyloElement):
    """A molecular sequence (Protein, DNA, RNA) associated with a node.

    One intended use for ``id_ref`` is to link a sequence to a taxonomy (via
    the taxonomy's ``id_source``) in case of multiple sequences and taxonomies
    per node.

    :Parameters:
        type : {'dna', 'rna', 'protein'}
            type of molecule this sequence represents
        id_ref : string
            reference to another resource
        id_source : string
            source for the reference
        symbol : string
            short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
        accession : Accession
            accession code for this sequence.
        name : string
            full name of the sequence, e.g. 'muscle Actin'
        location
            location of a sequence on a genome/chromosome.
        mol_seq : MolSeq
            the molecular sequence itself
        uri : Uri
            link
        annotations : list of Annotation objects
            annotations on this sequence
        domain_architecture : DomainArchitecture
            protein domains on this sequence
        other : list of Other objects
            non-phyloXML elements
    """
    alphabets = {'dna': Alphabet.generic_dna,
                 'rna': Alphabet.generic_rna,
                 'protein': Alphabet.generic_protein}
    # The pattern is applied with ``.match``, which anchors only at the start
    # of the string; the trailing ``\Z`` makes the whole symbol subject to the
    # "1 to 10 non-whitespace characters" rule instead of just its prefix.
    re_symbol = re.compile(r'\S{1,10}\Z')

    def __init__(self,
                 # Attributes
                 type=None, id_ref=None, id_source=None,
                 # Child nodes
                 symbol=None, accession=None, name=None, location=None,
                 mol_seq=None, uri=None, domain_architecture=None,
                 # Collections
                 annotations=None, other=None,
                 ):
        """Store the given values, warning about non-compliant strings.

        A ``PhyloXMLWarning`` is issued (not an exception) if ``type`` is not
        one of the keys of ``alphabets`` or if ``symbol`` does not match
        ``re_symbol``. ``annotations`` and ``other`` default to new empty
        lists.
        """
        _check_str(type, self.alphabets.__contains__)
        _check_str(symbol, self.re_symbol.match)
        self.type = type
        self.id_ref = id_ref
        self.id_source = id_source
        self.symbol = symbol
        self.accession = accession
        self.name = name
        self.location = location
        self.mol_seq = mol_seq
        self.uri = uri
        self.domain_architecture = domain_architecture
        self.annotations = annotations or []
        self.other = other or []

    @classmethod
    def from_seqrecord(cls, record, is_aligned=None):
        """Create a new PhyloXML Sequence from a SeqRecord object.

        The record's ``id``, ``name``, ``description`` and ``seq`` become the
        accession, symbol, name and mol_seq. Entries of ``record.annotations``
        laid out as described in ``to_seqrecord`` are unpacked into the
        matching attributes, and ``record.features`` (if any) become a
        DomainArchitecture.

        :Parameters:
            record : SeqRecord
                the record to convert
            is_aligned : bool
                whether the sequence is aligned; if None, this is taken from
                whether the record's alphabet is a ``Gapped`` instance

        Returns a new instance of the class this method is called on.
        """
        alphabet = record.seq.alphabet
        if is_aligned is None:
            is_aligned = isinstance(alphabet, Alphabet.Gapped)
        params = {
            'accession': Accession(record.id, ''),
            'symbol': record.name,
            'name': record.description,
            'mol_seq': MolSeq(str(record.seq), is_aligned),
        }
        # NOTE(review): a Gapped alphabet presumably wraps its base alphabet
        # rather than subclassing it, in which case aligned sequences get no
        # 'type' here -- confirm against Bio.Alphabet.
        if isinstance(alphabet, Alphabet.DNAAlphabet):
            params['type'] = 'dna'
        elif isinstance(alphabet, Alphabet.RNAAlphabet):
            params['type'] = 'rna'
        elif isinstance(alphabet, Alphabet.ProteinAlphabet):
            params['type'] = 'protein'

        # Unpack record.annotations
        for key in ('id_ref', 'id_source', 'location'):
            if key in record.annotations:
                params[key] = record.annotations[key]
        if isinstance(record.annotations.get('uri'), dict):
            params['uri'] = Uri(**record.annotations['uri'])
        # Build the list of Annotation objects
        if record.annotations.get('annotations'):
            params['annotations'] = []
            for annot in record.annotations['annotations']:
                ann_args = dict(
                    (key, annot[key])
                    for key in ('ref', 'source', 'evidence', 'type', 'desc')
                    if key in annot)
                if isinstance(annot.get('confidence'), list):
                    ann_args['confidence'] = Confidence(*annot['confidence'])
                if isinstance(annot.get('properties'), list):
                    # Entries that are not dicts are skipped
                    ann_args['properties'] = [Property(**prop)
                                              for prop in annot['properties']
                                              if isinstance(prop, dict)]
                params['annotations'].append(Annotation(**ann_args))

        # Unpack record.features
        if record.features:
            params['domain_architecture'] = DomainArchitecture(
                length=len(record.seq),
                domains=[ProteinDomain.from_seqfeature(feat)
                         for feat in record.features])

        # Use cls rather than Sequence so subclasses get their own type back
        return cls(**params)

    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        The accession, symbol and name become the record's ``id``, ``name``
        and ``description``; any of them that is None is not passed to
        SeqRecord. ``self.mol_seq`` must be set, since its ``value`` is
        read unconditionally.

        The seqrecord.annotations dictionary is packed like so (keys whose
        value would be None are omitted)::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'desc':     ann.desc,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        def pack_property(prop):
            """Represent one Property as a plain dictionary."""
            return clean_dict({
                'value': prop.value,
                'ref': prop.ref,
                'applies_to': prop.applies_to,
                'datatype': prop.datatype,
                'unit': prop.unit,
                'id_ref': prop.id_ref,
            })

        def pack_annotation(ann):
            """Represent one Annotation as a plain dictionary."""
            return clean_dict({
                'ref': ann.ref,
                'source': ann.source,
                'evidence': ann.evidence,
                'type': ann.type,
                # from_seqrecord reads 'desc', so write it here as well to
                # keep the round trip lossless.
                'desc': getattr(ann, 'desc', None),
                'confidence': ann.confidence and [
                    ann.confidence.value,
                    ann.confidence.type],
                'properties': [pack_property(prop)
                               for prop in ann.properties],
            })

        # str(None) would yield the literal id 'None'; leave the id as None
        # instead so that clean_dict drops it and SeqRecord uses its default.
        if self.accession is not None:
            record_id = str(self.accession)
        else:
            record_id = None

        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                           **clean_dict({
                               'id': record_id,
                               'name': self.symbol,
                               'description': self.name,
                           }))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
            'id_ref': self.id_ref,
            'id_source': self.id_source,
            'location': self.location,
            'uri': self.uri and clean_dict({
                'value': self.uri.value,
                'desc': self.uri.desc,
                'type': self.uri.type,
            }),
            'annotations': self.annotations and [
                pack_annotation(ann) for ann in self.annotations],
        })
        return seqrec

    def get_alphabet(self):
        """Return the Biopython alphabet matching this sequence.

        The alphabet is looked up from ``self.type`` in ``alphabets``, falling
        back to the generic alphabet for unknown or missing types. If a
        ``mol_seq`` is present and marked as aligned, the alphabet is wrapped
        in ``Alphabet.Gapped``.
        """
        alph = self.alphabets.get(self.type, Alphabet.generic_alphabet)
        if self.mol_seq and self.mol_seq.is_aligned:
            return Alphabet.Gapped(alph)
        return alph
1173
class SequenceRelation(PhyloElement):
    """A typed relationship between two sequences.

    An orthology between two sequences, for instance, is expressed by giving
    the attribute 'type' the value 'orthology'.

    :Parameters:
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        type : restricted string
            describe the type of relationship
        confidence : Confidence
            confidence value for this relation
    """
    # Relationship types accepted without a warning
    ok_type = set([
        'orthology',
        'one_to_one_orthology',
        'super_orthology',
        'paralogy',
        'ultra_paralogy',
        'xenology',
        'unknown',
        'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        """Store the relation, warning if ``type`` is not in ``ok_type``."""
        _check_str(type, lambda text: text in self.ok_type)
        # Assignment order is kept as-is so the instance __dict__ is unchanged
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
1204
class Taxonomy(PhyloElement):
    """Describe taxonomic information for a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            keep the authority, such as 'J. G. Cooper, 1863', associated with
            the 'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements
    """
    # The pattern is applied with ``.match``, which anchors only at the start
    # of the string; the trailing ``\Z`` makes the whole code subject to the
    # "2 to 10 word characters" rule instead of just its prefix.
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}\Z')
    ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
        'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
        'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
        'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
        'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
        'unknown', 'other'))

    def __init__(self,
                 # Attributes
                 id_source=None,
                 # Child nodes
                 id=None, code=None, scientific_name=None, authority=None,
                 rank=None, uri=None,
                 # Collections
                 common_names=None, synonyms=None, other=None,
                 ):
        """Store the given values, warning about non-compliant strings.

        A ``PhyloXMLWarning`` is issued (not an exception) if ``code`` does
        not match ``re_code`` or ``rank`` is not in ``ok_rank``. The list
        arguments default to new empty lists.
        """
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Return an identifying attribute, or the class name as a fallback.

        The first attribute that is not None is used, in this order: code,
        scientific_name, rank, id (converted with ``str``). If all are None,
        the class name is returned.
        """
        if self.code is not None:
            return self.code
        if self.scientific_name is not None:
            return self.scientific_name
        if self.rank is not None:
            return self.rank
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1280
class Uri(PhyloElement):
    """A uniform resource identifier.

    Usually this will be a URL. For a link to an image on a website, for
    example, the ``type`` attribute might be 'image' and ``desc`` might be
    'image of a California sea hare'.
    """

    def __init__(self, value, desc=None, type=None):
        """Store the identifier with its optional description and type."""
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return the URI value, or the object's repr if the value is empty."""
        return self.value or repr(self)
1298