Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See Also 
   9  -------- 
  10  Official specification: 
  11     http://phyloxml.org/ 
  12  Journal article: 
  13      Han and Zmasek (2009), doi:10.1186/1471-2105-10-356 
  14  """ 
  15   
  16  __docformat__ = "restructuredtext en" 
  17   
  18  import re 
  19  import warnings 
  20   
  21  from Bio._py3k import basestring 
  22   
  23  from Bio import Alphabet 
  24  from Bio.Align import MultipleSeqAlignment 
  25  from Bio.Seq import Seq 
  26  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  27  from Bio.SeqRecord import SeqRecord 
  28  from Bio import BiopythonWarning 
  29   
  30  from Bio.Phylo import BaseTree 
class PhyloXMLWarning(BiopythonWarning):
    """Warning category for data that violates the phyloXML specification.

    Issued by `_check_str` when a value fails its validity test.
    """
36
37 38 -def _check_str(text, testfunc):
39 """Check a string using testfunc, and warn if there's no match.""" 40 if text is not None and not testfunc(text): 41 warnings.warn("String %s doesn't match the given regexp" % text, 42 PhyloXMLWarning, stacklevel=2)
43
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common ancestor of every PhyloXML object in this module."""
49
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds any number of Phylogeny elements, optionally followed by elements
    from other namespaces.

    :Parameters:
        attributes : dict
            (XML namespace definitions); merged over the standard phyloXML
            namespace attributes
        phylogenies : list
            The phylogenetic trees
        other : list
            Arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        namespaces = {
            # standard
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xmlns": "http://www.phyloxml.org",
            "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
        }
        # Caller-supplied definitions override the standard ones.
        if attributes:
            namespaces.update(attributes)
        self.attributes = namespaces
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Look up a phylogeny by position, slice, or name.

        Raises KeyError for an unsupported index type or an unknown name.
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        # Name lookup: the first tree with a matching name wins.
        for candidate in self.phylogenies:
            if candidate.name == index:
                return candidate
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate over the phylogenetic trees held by this object."""
        return iter(self.phylogenies)

    def __len__(self):
        """Count the phylogenetic trees held by this object."""
        return len(self.phylogenies)

    def __str__(self):
        """Show the class name and the string form of each phylogeny."""
        listing = ',\n'.join(map(str, self.phylogenies))
        return '%s([%s])' % (self.__class__.__name__, listing)
101
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    An Other object normally carries either a 'value' or a non-empty list of
    'children', not both -- but that convention is not enforced here.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        self.tag = tag
        self.namespace = namespace
        self.value = value
        # Falsy collection arguments are replaced by fresh empty containers.
        self.attributes = attributes or {}
        self.children = children or []

    def __iter__(self):
        """Iterate over this node's children (possibly none)."""
        return iter(self.children)
134
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        type : string
            type label for this tree (stored as given; not validated here)
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)
    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None,
                 ):
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        Keyword arguments are the usual `Phylogeny` constructor parameters;
        they are applied directly as attributes of the new object.
        """
        phy = cls(
            root=Clade.from_clade(tree.root),
            rooted=tree.rooted,
            name=tree.name,
            id=Id(str(tree.id)) if tree.id is not None else None)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is first converted with `Clade.from_clade`; keyword
        arguments are forwarded to `Clade.to_phylogeny`, which sets them as
        attributes of the resulting `Phylogeny`.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        Keyword arguments become the container's XML attributes.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Returns an empty `MultipleSeqAlignment` when no aligned sequence is
        found.
        """
        def is_aligned_seq(elem):
            # A Sequence may carry no residues at all (mol_seq is None by
            # default); such entries cannot be part of an alignment.
            return bool(isinstance(elem, Sequence)
                        and elem.mol_seq is not None
                        and elem.mol_seq.is_aligned)

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        try:
            first_seq = next(seqs)
        except StopIteration:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        # The first sequence determines the alphabet of the alignment.
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: `Clade.confidence`, `Clade.taxonomy`
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value; None clears the list.

        Plain numbers are wrapped in `Confidence`. Raises ValueError for any
        other type, or when several confidences already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendents, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects
    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None,
                 ):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Sub-clades are converted recursively. Keyword arguments are the usual
        PhyloXML Clade constructor parameters; they are applied directly as
        attributes of the new (top-level) clade only.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        # Re-wrap the color so the result is a PhyloXML BranchColor.
        new_clade.color = (BranchColor(
            clade.color.red, clade.color.green, clade.color.blue)
            if clade.color else None)
        new_clade.__dict__.update(kwargs)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The phylogeny inherits this clade's date; keyword arguments are set
        directly as attributes of the new `Phylogeny`.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the only confidence, or None if there is none.

        Raises AttributeError when more than one confidence exists.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value; None clears the list.

        Plain numbers are wrapped in `Confidence`. Raises ValueError for any
        other type, or when several confidences already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # Point the user at this class's list, not Phylogeny's.
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        Raises AttributeError when more than one taxonomy exists.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy value.

        Raises ValueError if value is not a `Taxonomy`, or when several
        taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # Only Clade has a `taxonomies` list; Phylogeny does not.
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)
430
# PhyloXML wrapper for a special BaseTree attribute

class BranchColor(PhyloElement, BaseTree.BranchColor):
    """PhyloXML flavour of `BaseTree.BranchColor`."""

    def __init__(self, *args, **kwargs):
        # Initialization is delegated unchanged to the BaseTree class.
        BaseTree.BranchColor.__init__(self, *args, **kwargs)
438
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: for ``UniProtKB:P17304``, the ``value`` attribute is 'P17304'
    and the ``source`` attribute is 'UniProtKB'.
    """

    def __init__(self, value, source):
        self.source = source
        self.value = value

    def __str__(self):
        """Render the accession as ``source:value``."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
456
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    Annotating through the optional 'ref' attribute is recommended.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        type : string
            type label for this annotation (stored as given)
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link
    """
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        # A malformed reference only triggers a warning; it is still stored.
        _check_str(ref, self.re_ref.match)
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        self.properties = properties or []
498
class BinaryCharacters(PhyloElement):
    """Binary characters at the root of a clade.

    Stores the names and/or counts of binary characters that are present,
    absent, gained, and lost at the root of a clade.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        self.type = type
        # Counts are stored as given.
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Name collections default to fresh empty lists.
        self.gained = gained or []
        self.lost = lost or []
        self.present = present or []
        self.absent = absent or []
522
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    For example, this could be used to describe multiple parents of a clade.

    :Parameters:
        type : str
            label for the kind of relationship
        id_ref_0 : str
            reference to the first clade
        id_ref_1 : str
            reference to the second clade
        distance : str
            distance between the two clades
        confidence : Confidence
            support for this relationship
    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.distance = distance
        self.confidence = confidence
544
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    Instances behave like their numeric ``value`` in comparisons and
    arithmetic; arithmetic results are plain numbers, not Confidence objects.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'
    """

    def __init__(self, value, type='unknown'):
        self.value = value
        self.type = type

    # Comparison operators

    def __hash__(self):
        """Return an identity-based hash for this object.

        NOTE: the hash is ``id(self)``, so two Confidence objects that
        compare equal (same ``value``) still hash differently, and a
        Confidence does not hash like the number it wraps.
        """
        return id(self)

    def __eq__(self, other):
        if isinstance(other, Confidence):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        if isinstance(other, Confidence):
            return self.value != other.value
        return self.value != other

    # Ordering -- see functools.total_ordering in Py2.7

    def __lt__(self, other):
        if isinstance(other, Confidence):
            return self.value < other.value
        return self.value < other

    def __le__(self, other):
        return self < other or self == other

    def __gt__(self, other):
        return not (self <= other)

    def __ge__(self, other):
        return not (self.value < other)

    # Arithmetic operators, including reverse

    def __add__(self, other):
        return self.value + other

    def __radd__(self, other):
        return other + self.value

    def __sub__(self, other):
        return self.value - other

    def __rsub__(self, other):
        return other - self.value

    def __mul__(self, other):
        return self.value * other

    def __rmul__(self, other):
        return other * self.value

    def __div__(self, other):
        """Classic division (only dispatched to by Python 2)."""
        # Use the operator rather than calling value.__div__ directly: the
        # direct call returns NotImplemented for mixed types (e.g. int
        # value, float operand) instead of falling back to the reflected
        # method, which surfaces as a TypeError.
        return self.value / other

    def __rdiv__(self, other):
        return other / self.value

    def __truediv__(self, other):
        """Rational-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        return self.value / other

    def __rtruediv__(self, other):
        return other / self.value

    def __floordiv__(self, other):
        """C-style and old-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        # Operator form so mixed int/float operands are coerced correctly
        # (value.__floordiv__(other) would return NotImplemented).
        return self.value // other

    def __rfloordiv__(self, other):
        return other // self.value

    def __mod__(self, other):
        return self.value % other

    def __rmod__(self, other):
        return other % self.value

    def __divmod__(self, other):
        return divmod(self.value, other)

    def __rdivmod__(self, other):
        return divmod(other, self.value)

    def __pow__(self, other, modulo=None):
        if modulo is not None:
            return pow(self.value, other, modulo)
        return pow(self.value, other)

    def __rpow__(self, other):
        return pow(other, self.value)

    # Unary arithmetic operations: -, +, abs()

    def __neg__(self):
        return -self.value

    def __pos__(self):
        return self.value

    def __abs__(self):
        return abs(self.value)

    # Explicit coercion to numeric types: int, long, float

    def __float__(self):
        return float(self.value)

    def __int__(self):
        return int(self.value)

    def __long__(self):
        # Python 2 only: `long` does not exist on Python 3, where this hook
        # is never invoked by the interpreter.
        return long(self.value)
688
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date can be numerical (the 'value' element) and/or free text (the
    'desc' element, e.g. 'Silurian'). When a numerical value is used, giving
    the 'unit' attribute as well is recommended.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        self.value = value
        self.unit = unit
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Show the human-readable date, falling back to the class name.

        Preference order: "<value> <unit>", then the description, then the
        bare class name.
        """
        has_number = self.value is not None
        if self.unit and has_number:
            return '%s %s' % (self.value, self.unit)
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
725
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Intended for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions
    """

    def __init__(self, desc=None, points=None, polygons=None):
        self.desc = desc
        # Falsy collections are replaced by fresh empty lists.
        self.polygons = polygons or []
        self.points = points or []
745
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein (stored as given, so it stays
            None when omitted)
    """

    def __init__(self, length=None, domains=None):
        self.domains = domains
        self.length = length
760
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    Every attribute defaults to None. The object also offers a dictionary
    interface in which None-valued attributes count as missing keys, and
    deleting a key resets the attribute to None.
    """
    ok_type = set(['transfer', 'fusion', 'speciation_or_duplication', 'other',
                   'mixed', 'unassigned'])

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        # An unrecognized type only triggers a warning; it is still stored.
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """List the (name, value) pairs of attributes that are set."""
        return [pair for pair in self.__dict__.items()
                if pair[1] is not None]

    def keys(self):
        """List the names of attributes that are set."""
        return [name for name, val in self.__dict__.items()
                if val is not None]

    def values(self):
        """List the values of attributes that are set."""
        return [val for val in self.__dict__.values() if val is not None]

    def __len__(self):
        # TODO - Better way to do this?
        return len(self.values())

    def __getitem__(self, key):
        """Return a set attribute by name; KeyError if absent or None."""
        if not hasattr(self, key):
            raise KeyError(key)
        found = getattr(self, key)
        if found is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return found

    def __setitem__(self, key, val):
        setattr(self, key, val)

    def __delitem__(self, key):
        # "Deleting" resets the attribute to its unset state.
        setattr(self, key, None)

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, key):
        """True if the named attribute exists and is not None."""
        return getattr(self, key, None) is not None
813
class Id(PhyloElement):
    """A general-purpose identifier element.

    Records the provider (or authority) of an identifier, e.g. NCBI,
    together with the identifier value itself.
    """

    def __init__(self, value, provider=None):
        self.provider = provider
        self.value = value

    def __str__(self):
        """Render as ``provider:value``, or just the value without provider."""
        if self.provider is None:
            return self.value
        return '%s:%s' % (self.provider, self.value)
830
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)
    """
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # An unexpected sequence string only triggers a warning.
        _check_str(value, self.re_value.match)
        self.is_aligned = is_aligned
        self.value = value

    def __str__(self):
        """Return the stored sequence value."""
        return self.value
851
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        self.geodetic_datum = geodetic_datum
        # Horizontal position
        self.lat = lat
        self.long = long
        # Optional vertical position and its unit
        self.alt = alt
        self.alt_unit = alt_unit
878
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        self.points = points or []

    def __str__(self):
        """Show the class name and the string form of each vertex."""
        vertices = ',\n'.join(map(str, self.points))
        return '%s([%s])' % (self.__class__.__name__, vertices)
892
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch
            of a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows attaching a property specifically to one element (on the
            xml-level)
    """
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set([
        'phylogeny', 'clade', 'node', 'annotation', 'parent_branch', 'other',
    ])
    ok_datatype = set([
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
        'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
        'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
        'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger',
    ])

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        # Validation only warns; every value is stored regardless.
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        # The unit is checked against the same pattern as the reference.
        _check_str(unit, self.re_ref.match)
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
        self.unit = unit
        self.id_ref = id_ref
943
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value : string
            the domain's value; mapped to and from ``SeqFeature.id`` by the
            conversion methods below
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        Uses the feature's id as the domain value, its non-fuzzy location
        bounds as start/end, and its 'confidence' qualifier (if any).
        """
        # Build via `cls` so subclasses get instances of their own type.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning this domain.

        The domain value becomes the feature id and the confidence is stored
        under the 'confidence' qualifier.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
984
class Reference(PhyloElement):
    """Literature reference for a clade.

    NB: Whenever possible, use the ``doi`` attribute instead of the free-text
    ``desc`` element.
    """
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        # A DOI that fails the pattern only triggers a warning.
        _check_str(doi, self.re_doi.match)
        self.desc = desc
        self.doi = doi
998
999 1000 -class Sequence(PhyloElement):
1001 """A molecular sequence (Protein, DNA, RNA) associated with a node. 1002 1003 One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the 1004 taxonomy's ``id_source``) in case of multiple sequences and taxonomies per 1005 node. 1006 1007 :Parameters: 1008 type : {'dna', 'rna', 'protein'} 1009 type of molecule this sequence represents 1010 id_ref : string 1011 reference to another resource 1012 id_source : string 1013 source for the reference 1014 symbol : string 1015 short symbol of the sequence, e.g. 'ACTM' (max. 10 chars) 1016 accession : Accession 1017 accession code for this sequence. 1018 name : string 1019 full name of the sequence, e.g. 'muscle Actin' 1020 location 1021 location of a sequence on a genome/chromosome. 1022 mol_seq : MolSeq 1023 the molecular sequence itself 1024 uri : Uri 1025 link 1026 annotations : list of Annotation objects 1027 annotations on this sequence 1028 domain_architecture : DomainArchitecture 1029 protein domains on this sequence 1030 other : list of Other objects 1031 non-phyloXML elements 1032 """ 1033 alphabets = {'dna': Alphabet.generic_dna, 1034 'rna': Alphabet.generic_rna, 1035 'protein': Alphabet.generic_protein} 1036 re_symbol = re.compile(r'\S{1,10}') 1037
def __init__(self,
             # Attributes
             type=None, id_ref=None, id_source=None,
             # Child nodes
             symbol=None, accession=None, name=None, location=None,
             mol_seq=None, uri=None, domain_architecture=None,
             # Collections
             annotations=None, other=None,
             ):
    """Store the sequence fields, warning about an invalid type or symbol."""
    # Validation only warns; the values are stored regardless.
    _check_str(type, self.alphabets.__contains__)
    _check_str(symbol, self.re_symbol.match)
    # Attributes
    self.type = type
    self.id_ref = id_ref
    self.id_source = id_source
    # Child nodes
    self.symbol = symbol
    self.accession = accession
    self.name = name
    self.location = location
    self.mol_seq = mol_seq
    self.uri = uri
    self.domain_architecture = domain_architecture
    # Collections default to fresh empty lists.
    self.annotations = annotations or []
    self.other = other or []
1061 1062 @classmethod
1063 - def from_seqrecord(cls, record, is_aligned=None):
1064 """Create a new PhyloXML Sequence from a SeqRecord object.""" 1065 if is_aligned is None: 1066 is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped) 1067 params = { 1068 'accession': Accession(record.id, ''), 1069 'symbol': record.name, 1070 'name': record.description, 1071 'mol_seq': MolSeq(str(record.seq), is_aligned), 1072 } 1073 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 1074 params['type'] = 'dna' 1075 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 1076 params['type'] = 'rna' 1077 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 1078 params['type'] = 'protein' 1079 1080 # Unpack record.annotations 1081 for key in ('id_ref', 'id_source', 'location'): 1082 if key in record.annotations: 1083 params[key] = record.annotations[key] 1084 if isinstance(record.annotations.get('uri'), dict): 1085 params['uri'] = Uri(**record.annotations['uri']) 1086 # Build a Sequence.annotation object 1087 if record.annotations.get('annotations'): 1088 params['annotations'] = [] 1089 for annot in record.annotations['annotations']: 1090 ann_args = {} 1091 for key in ('ref', 'source', 'evidence', 'type', 'desc'): 1092 if key in annot: 1093 ann_args[key] = annot[key] 1094 if isinstance(annot.get('confidence'), list): 1095 ann_args['confidence'] = Confidence( 1096 *annot['confidence']) 1097 if isinstance(annot.get('properties'), list): 1098 ann_args['properties'] = [Property(**prop) 1099 for prop in annot['properties'] 1100 if isinstance(prop, dict)] 1101 params['annotations'].append(Annotation(**ann_args)) 1102 1103 # Unpack record.features 1104 if record.features: 1105 params['domain_architecture'] = DomainArchitecture( 1106 length=len(record.seq), 1107 domains=[ProteinDomain.from_seqfeature(feat) 1108 for feat in record.features]) 1109 1110 return Sequence(**params)
1111
1112 - def to_seqrecord(self):
1113 """Create a SeqRecord object from this Sequence instance. 1114 1115 The seqrecord.annotations dictionary is packed like so:: 1116 1117 { # Sequence attributes with no SeqRecord equivalent: 1118 'id_ref': self.id_ref, 1119 'id_source': self.id_source, 1120 'location': self.location, 1121 'uri': { 'value': self.uri.value, 1122 'desc': self.uri.desc, 1123 'type': self.uri.type }, 1124 # Sequence.annotations attribute (list of Annotations) 1125 'annotations': [{'ref': ann.ref, 1126 'source': ann.source, 1127 'evidence': ann.evidence, 1128 'type': ann.type, 1129 'confidence': [ann.confidence.value, 1130 ann.confidence.type], 1131 'properties': [{'value': prop.value, 1132 'ref': prop.ref, 1133 'applies_to': prop.applies_to, 1134 'datatype': prop.datatype, 1135 'unit': prop.unit, 1136 'id_ref': prop.id_ref} 1137 for prop in ann.properties], 1138 } for ann in self.annotations], 1139 } 1140 """ 1141 def clean_dict(dct): 1142 """Remove None-valued items from a dictionary.""" 1143 return dict((key, val) for key, val in dct.items() 1144 if val is not None)
1145 1146 seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()), 1147 **clean_dict({ 1148 'id': str(self.accession), 1149 'name': self.symbol, 1150 'description': self.name, 1151 # 'dbxrefs': None, 1152 })) 1153 if self.domain_architecture: 1154 seqrec.features = [dom.to_seqfeature() 1155 for dom in self.domain_architecture.domains] 1156 # Sequence attributes with no SeqRecord equivalent 1157 seqrec.annotations = clean_dict({ 1158 'id_ref': self.id_ref, 1159 'id_source': self.id_source, 1160 'location': self.location, 1161 'uri': self.uri and clean_dict({ 1162 'value': self.uri.value, 1163 'desc': self.uri.desc, 1164 'type': self.uri.type, 1165 }), 1166 'annotations': self.annotations and [ 1167 clean_dict({ 1168 'ref': ann.ref, 1169 'source': ann.source, 1170 'evidence': ann.evidence, 1171 'type': ann.type, 1172 'confidence': ann.confidence and [ 1173 ann.confidence.value, 1174 ann.confidence.type], 1175 'properties': [clean_dict({ 1176 'value': prop.value, 1177 'ref': prop.ref, 1178 'applies_to': prop.applies_to, 1179 'datatype': prop.datatype, 1180 'unit': prop.unit, 1181 'id_ref': prop.id_ref}) 1182 for prop in ann.properties], 1183 }) for ann in self.annotations], 1184 }) 1185 return seqrec
1186
1187 - def get_alphabet(self):
1188 alph = self.alphabets.get(self.type, Alphabet.generic_alphabet) 1189 if self.mol_seq and self.mol_seq.is_aligned: 1190 return Alphabet.Gapped(alph) 1191 return alph
1192
class SequenceRelation(PhyloElement):
    """A typed relationship between a pair of sequences.

    An orthology between two sequences, for instance, is expressed by setting
    the 'type' attribute to 'orthology'.

    :Parameters:
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        type : restricted string
            describe the type of relationship
        confidence : Confidence
            confidence value for this relation
    """

    # The relation types accepted without a warning.
    ok_type = set([
        'orthology',
        'one_to_one_orthology',
        'super_orthology',
        'paralogy',
        'ultra_paralogy',
        'xenology',
        'unknown',
        'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        """Store the relation; warn if ``type`` is not in ``ok_type``."""
        known_types = self.ok_type
        # An unknown type only produces a warning; it is stored regardless.
        _check_str(type, lambda relation: relation in known_types)
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
1223
class Taxonomy(PhyloElement):
    """Describe taxonomic information for a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            keep the authority, such as 'J. G. Cooper, 1863', associated with
            the 'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements
    """

    # The pattern is applied with ``.match``, which anchors only at the start
    # of the string; the trailing \Z makes the 10-character limit effective.
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}\Z')
    # The taxonomic ranks accepted without a warning.
    ok_rank = set((
        'domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
        'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
        'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily', 'supertribe', 'tribe',
        'subtribe', 'infratribe', 'genus', 'subgenus', 'superspecies',
        'species', 'subspecies', 'variety', 'subvariety', 'form', 'subform',
        'cultivar', 'unknown', 'other'))

    def __init__(self,
                 # Attributes
                 id_source=None,
                 # Child nodes
                 id=None, code=None, scientific_name=None, authority=None,
                 rank=None, uri=None,
                 # Collections
                 common_names=None, synonyms=None, other=None,
                 ):
        """Store the given values on the new instance.

        A PhyloXMLWarning is issued (the value is still stored) when ``code``
        does not match ``re_code`` or ``rank`` is not in ``ok_rank``. The
        collection arguments default to new empty lists.
        """
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Return the most specific identifying attribute that is set.

        Preference order: code, scientific name, rank, then the id converted
        to a string. The class name is used only when none of them is set.
        """
        for label in (self.code, self.scientific_name, self.rank):
            if label is not None:
                return label
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1299
class Uri(PhyloElement):
    """A uniform resource identifier.

    Usually this will be a URL. For a link to an image on a website, for
    example, the ``type`` attribute might be 'image' and ``desc`` might be
    'image of a California sea hare'.
    """

    def __init__(self, value, desc=None, type=None):
        """Store the identifier along with its optional description and type."""
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return the URI value, or the object's repr if the value is empty."""
        return self.value if self.value else repr(self)
1318