Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See Also 
   9  -------- 
  10  Official specification: 
  11     http://phyloxml.org/ 
  12  Journal article: 
  13      Han and Zmasek (2009), doi:10.1186/1471-2105-10-356 
  14  """ 
  15  __docformat__ = "restructuredtext en" 
  16   
  17  import re 
  18  import warnings 
  19   
  20  from Bio import Alphabet 
  21  from Bio.Align import MultipleSeqAlignment 
  22  from Bio.Seq import Seq 
  23  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  24  from Bio.SeqRecord import SeqRecord 
  25  from Bio import BiopythonWarning 
  26   
  27  from Bio.Phylo import BaseTree 
class PhyloXMLWarning(BiopythonWarning):
    """Issued when data does not comply with the phyloXML specification."""
33
34 35 -def _check_str(text, testfunc):
36 """Check a string using testfunc, and warn if there's no match.""" 37 if text is not None and not testfunc(text): 38 warnings.warn("String %s doesn't match the given regexp" % text, 39 PhyloXMLWarning, stacklevel=2)
40
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common ancestor of every PhyloXML object defined in this module."""
46
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Contains an arbitrary number of Phylogeny elements, possibly followed by
    elements from other namespaces.

    :Parameters:
        attributes : dict
            (XML namespace definitions)
        phylogenies : list
            The phylogenetic trees
        other : list
            Arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        # Start from the standard namespace declarations; anything the caller
        # supplies overrides or extends them.
        self.attributes = {
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",  # standard
            "xmlns": "http://www.phyloxml.org",
            "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
        }
        if attributes:
            self.attributes.update(attributes)
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Get a phylogeny by index or name.

        Integers and slices index the ``phylogenies`` list directly. A string
        returns the first phylogeny whose ``name`` equals it.

        Raises KeyError for any other index type, or when no phylogeny has
        the requested name.
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        # ``basestring`` only exists on Python 2; fall back to ``str`` so the
        # lookup by name also works on Python 3.
        try:
            string_types = basestring
        except NameError:
            string_types = str
        if not isinstance(index, string_types):
            raise KeyError("can't use %s as an index" % type(index))
        for tree in self.phylogenies:
            if tree.name == index:
                return tree
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate through the phylogenetic trees in this object."""
        return iter(self.phylogenies)

    def __len__(self):
        """Number of phylogenetic trees in this object."""
        return len(self.phylogenies)

    def __str__(self):
        """Return the class name wrapping the string form of each tree."""
        return '%s([%s])' % (self.__class__.__name__,
                             ',\n'.join(map(str, self.phylogenies)))
96
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    An instance normally carries either a ``value`` or a non-empty list of
    ``children``, but not both. That convention is not enforced here.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        self.tag, self.namespace, self.value = tag, namespace, value
        # Falsy arguments are replaced by fresh containers.
        self.attributes = attributes if attributes else {}
        self.children = children if children else []

    def __iter__(self):
        """Iterate over the child nodes, if there are any."""
        return iter(self.children)
128
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        type
            stored unchanged as the ``type`` attribute
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)
    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None,
                 ):
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        The root clade, ``rooted`` flag, name and id are copied from ``tree``.
        Keyword arguments are then written straight into the new object's
        attribute dictionary.
        """
        phy = cls(
            root=Clade.from_clade(tree.root),
            rooted=tree.rooted,
            name=tree.name,
            # Wrap the id only when one is present. A conditional expression
            # avoids the ``and/or`` pitfall of yielding None for a falsy Id.
            id=Id(str(tree.id)) if tree.id is not None else None)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is converted with `Clade.from_clade` and wrapped by
        `Clade.to_phylogeny`, which receives the keyword arguments and sets
        them as attributes of the resulting phylogeny.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        Keyword arguments become the container's ``attributes``.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Returns an empty MultipleSeqAlignment when the tree holds no
        `Sequence` whose ``mol_seq.is_aligned`` is true.
        """
        def is_aligned_seq(elem):
            return bool(isinstance(elem, Sequence)
                        and elem.mol_seq.is_aligned)

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        # Take the first hit with for/else rather than the Python 2-only
        # ``seqs.next()``; this form works on every Python version.
        for first_seq in seqs:
            break
        else:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        # ``seqs`` has already been advanced past the first sequence.
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute

    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: `Clade.confidence`, `Clade.taxonomy`
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value, or clear it with None.

        Plain int/float values are wrapped in a `Confidence`. Raises
        ValueError for any other non-Confidence value, or when several
        confidences already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendants, unless
    overridden in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list of Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects
    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None,
                 ):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Branch length, name, confidence, width and color are copied, and the
        sub-clades are converted recursively. Keyword arguments are written
        into the attribute dictionary of the top-level clade only.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        new_clade.color = (BranchColor(
            clade.color.red, clade.color.green, clade.color.blue)
            if clade.color else None)
        new_clade.__dict__.update(kwargs)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The clade becomes the root and its ``date`` is reused. Keyword
        arguments are set as attributes of the new phylogeny.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class

    def _get_confidence(self):
        """Return the only confidence, or None if there is none.

        Raises AttributeError when more than one confidence exists.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value, or clear it with None.

        Plain int/float values are wrapped in a `Confidence`. Raises
        ValueError for any other non-Confidence value, or when several
        confidences already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # The message names this class's own attribute.
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        Raises AttributeError when more than one taxonomy exists.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy value.

        Raises ValueError if ``value`` is not a `Taxonomy`, or when several
        taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # The message names this class's own attribute.
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)
422
# PhyloXML wrapper for a special BaseTree attribute

class BranchColor(PhyloElement, BaseTree.BranchColor):
    """`BaseTree.BranchColor` that is also a `PhyloElement`."""

    def __init__(self, *args, **kwargs):
        # Construction is delegated unchanged to the BaseTree implementation.
        BaseTree.BranchColor.__init__(self, *args, **kwargs)
429
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: In ``UniProtKB:P17304``, the Accession instance attribute ``value``
    is 'P17304' and the ``source`` attribute is 'UniProtKB'.
    """

    def __init__(self, value, source):
        self.source = source
        self.value = value

    def __str__(self):
        """Return the accession formatted as ``source:value``."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
446
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    It is recommended to annotate by using the optional 'ref' attribute.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        type
            stored unchanged as the ``type`` attribute
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link
    """
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        # A ref that does not start with a match of re_ref only warns.
        _check_str(ref, self.re_ref.match)
        self.ref, self.source = ref, source
        self.evidence, self.type = evidence, type
        self.desc, self.confidence, self.uri = desc, confidence, uri
        self.properties = properties if properties else []
488
class BinaryCharacters(PhyloElement):
    """The names and/or counts of binary characters present, gained, and lost
    at the root of a clade.

    The four ``*_count`` arguments and ``type`` are stored as given; the
    ``gained``, ``lost``, ``present`` and ``absent`` collections default to
    empty lists.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        self.type = type
        # Counts
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Collections: falsy arguments become fresh lists
        self.gained = gained if gained else []
        self.lost = lost if lost else []
        self.present = present if present else []
        self.absent = absent if absent else []
509
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    For example, this could be used to describe multiple parents of a clade.

    :Parameters:
        type : str
        id_ref_0 : str
        id_ref_1 : str
        distance : str
        confidence : Confidence
    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        self.type = type
        self.id_ref_0, self.id_ref_1 = id_ref_0, id_ref_1
        self.distance = distance
        self.confidence = confidence
530
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    Comparisons and arithmetic operate on ``value``. Arithmetic returns the
    plain result of the underlying operation, not a new Confidence.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'
    """

    def __init__(self, value, type='unknown'):
        self.value = value
        self.type = type

    # Comparison operators

    def __hash__(self):
        """Return an identity-based hash for this object.

        NOTE(review): ``__eq__`` compares by ``value``, so two instances that
        compare equal (or an instance and an equal number) usually hash
        differently. This is kept as-is because ``value`` is mutable and
        existing dict/set usage may depend on identity hashing -- confirm
        before changing.
        """
        return id(self)

    def __eq__(self, other):
        if isinstance(other, Confidence):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        if isinstance(other, Confidence):
            return self.value != other.value
        return self.value != other

    # Ordering -- see functools.total_ordering in Py2.7

    def __lt__(self, other):
        if isinstance(other, Confidence):
            return self.value < other.value
        return self.value < other

    def __le__(self, other):
        return self < other or self == other

    def __gt__(self, other):
        return not (self <= other)

    def __ge__(self, other):
        # Defined through __lt__, like __le__ and __gt__ above.
        return not (self < other)

    # Arithmetic operators, including reverse
    #
    # These use the operators themselves, not a direct call to the operand's
    # special method: a direct call such as ``(3).__floordiv__(2.0)`` returns
    # NotImplemented for mixed int/float operands instead of a number.

    def __add__(self, other):
        return self.value + other

    def __radd__(self, other):
        return other + self.value

    def __sub__(self, other):
        return self.value - other

    def __rsub__(self, other):
        return other - self.value

    def __mul__(self, other):
        return self.value * other

    def __rmul__(self, other):
        return other * self.value

    def __div__(self, other):
        """Classic ``/`` division (only invoked by Python 2)."""
        return self.value / other

    def __rdiv__(self, other):
        return other / self.value

    def __truediv__(self, other):
        """Rational-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        return self.value / other

    def __rtruediv__(self, other):
        return other / self.value

    def __floordiv__(self, other):
        """Floor division, i.e. the ``//`` operator."""
        return self.value // other

    def __rfloordiv__(self, other):
        return other // self.value

    def __mod__(self, other):
        return self.value % other

    def __rmod__(self, other):
        return other % self.value

    def __divmod__(self, other):
        return divmod(self.value, other)

    def __rdivmod__(self, other):
        return divmod(other, self.value)

    def __pow__(self, other, modulo=None):
        if modulo is not None:
            return pow(self.value, other, modulo)
        return pow(self.value, other)

    def __rpow__(self, other):
        return pow(other, self.value)

    # Unary arithmetic operations: -, +, abs()

    def __neg__(self):
        return -self.value

    def __pos__(self):
        return self.value

    def __abs__(self):
        return abs(self.value)

    # Explicit coercion to numeric types: int, long, float

    def __float__(self):
        return float(self.value)

    def __int__(self):
        return int(self.value)

    def __long__(self):
        # Only reached through Python 2's long(); ``long`` is not defined on
        # Python 3, where nothing calls this hook.
        return long(self.value)
673
class Date(PhyloElement):
    """A date associated with a clade/node.

    Its value can be numerical by using the 'value' element and/or free text
    with the 'desc' element (e.g. 'Silurian'). If a numerical value is used, it
    is recommended to employ the 'unit' attribute.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        self.value, self.unit, self.desc = value, unit, desc
        self.minimum, self.maximum = minimum, maximum

    def __str__(self):
        """Return 'value unit', else the description, else the class name."""
        if self.unit and self.value is not None:
            text = '%s %s' % (self.value, self.unit)
        elif self.desc is not None:
            text = self.desc
        else:
            text = self.__class__.__name__
        return text
709
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Intended for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions
    """

    def __init__(self, desc=None, points=None, polygons=None):
        self.desc = desc
        # Falsy collections are replaced by fresh lists.
        self.points = points if points else []
        self.polygons = polygons if polygons else []
728
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list of ProteinDomain objects
            the domains within this protein
    """

    def __init__(self, length=None, domains=None):
        self.length = length
        # Default to an empty list, like every other collection attribute in
        # this module, so the domains can always be iterated.
        self.domains = domains or []
742
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.

    Only instance attributes count as dictionary entries; methods and class
    attributes such as ``ok_type`` are not exposed as keys.
    """
    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
                   'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        # A type outside ok_type only produces a warning.
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    # ``dict.items()``/``values()`` are used instead of the Python 2-only
    # ``iteritems()``/``itervalues()`` so these work on Python 2 and 3.

    def items(self):
        """Return (name, value) pairs for attributes that are not None."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """Return the names of attributes that are not None."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """Return the attribute values that are not None."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        return len(self.values())

    def __getitem__(self, key):
        """Return the value stored under ``key``.

        Raises KeyError if ``key`` is not an instance attribute or its value
        is None.
        """
        # Look in the instance dictionary, as keys() does, so that methods
        # and class attributes do not masquerade as entries.
        if key not in self.__dict__:
            raise KeyError(key)
        val = self.__dict__[key]
        if val is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return val

    def __setitem__(self, key, val):
        setattr(self, key, val)

    def __delitem__(self, key):
        # Deleting resets the attribute to None and keeps the attribute.
        setattr(self, key, None)

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, key):
        return self.__dict__.get(key) is not None
794
class Id(PhyloElement):
    """A general-purpose identifier element.

    Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
    along with the value itself.
    """

    def __init__(self, value, provider=None):
        self.provider = provider
        self.value = value

    def __str__(self):
        """Return ``provider:value``, or just the value without a provider."""
        if self.provider is None:
            return self.value
        return '%s:%s' % (self.provider, self.value)
810
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)
    """
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # A value that does not start with a match of re_value only warns.
        _check_str(value, self.re_value.match)
        self.is_aligned = is_aligned
        self.value = value

    def __str__(self):
        """Return the stored sequence value."""
        return self.value
831
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        self.geodetic_datum = geodetic_datum
        self.lat, self.long = lat, long
        self.alt, self.alt_unit = alt, alt_unit
857
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        self.points = points if points else []

    def __str__(self):
        """Return the class name wrapping the string form of each point."""
        joined = ',\n'.join([str(pt) for pt in self.points])
        return '%s([%s])' % (self.__class__.__name__, joined)
870
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch of
            a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows a property to be attached specifically to one element (on
            the xml-level)
    """
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
                         'parent_branch', 'other'))
    ok_datatype = set((
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
        'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
        'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
        'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger'))

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        # Validation only warns; the values are stored regardless.
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        self.value, self.ref = value, ref
        self.applies_to, self.datatype = applies_to, datatype
        self.unit, self.id_ref = unit, id_ref
921
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value
            stored as ``value``; used as the ``id`` of the SeqFeature built by
            `to_seqfeature`
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's ``id`` becomes ``value``, its non-fuzzy start and end
        become the location, and the 'confidence' qualifier (if any) becomes
        ``confidence``.
        """
        # Build through ``cls`` so subclasses get instances of their own
        # type, as the other alternate constructors in this module do.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning ``start``..``end`` with id ``value``.

        The confidence is stored under the 'confidence' qualifier.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
962
963 964 -class Reference(PhyloElement):
965 """Literature reference for a clade. 966 967 NB: Whenever possible, use the ``doi`` attribute instead of the free-text 968 ``desc`` element. 969 """ 970 re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+') 971
972 - def __init__(self, doi=None, desc=None):
973 _check_str(doi, self.re_doi.match) 974 self.doi = doi 975 self.desc = desc
976
class Sequence(PhyloElement):
    """A molecular sequence (Protein, DNA, RNA) associated with a node.

    One intended use for ``id_ref`` is to link a sequence to a taxonomy (via
    the taxonomy's ``id_source``) in case of multiple sequences and taxonomies
    per node.

    :Parameters:
        type : {'dna', 'rna', 'protein'}
            type of molecule this sequence represents
        id_ref : string
            reference to another resource
        id_source : string
            source for the reference
        symbol : string
            short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
        accession : Accession
            accession code for this sequence.
        name : string
            full name of the sequence, e.g. 'muscle Actin'
        location
            location of a sequence on a genome/chromosome.
        mol_seq : MolSeq
            the molecular sequence itself
        uri : Uri
            link
        annotations : list of Annotation objects
            annotations on this sequence
        domain_architecture : DomainArchitecture
            protein domains on this sequence
        other : list of Other objects
            non-phyloXML elements
    """
    # Maps the phyloXML 'type' attribute to the matching Biopython alphabet.
    alphabets = {'dna': Alphabet.generic_dna,
                 'rna': Alphabet.generic_rna,
                 'protein': Alphabet.generic_protein}
    # The pattern is used with ``match``, which only anchors the start of the
    # string; the trailing \Z makes the whole symbol subject to the
    # 1-10 non-whitespace character limit instead of just its prefix.
    re_symbol = re.compile(r'\S{1,10}\Z')

    def __init__(self,
            # Attributes
            type=None, id_ref=None, id_source=None,
            # Child nodes
            symbol=None, accession=None, name=None, location=None,
            mol_seq=None, uri=None, domain_architecture=None,
            # Collections
            annotations=None, other=None,
            ):
        """Store the given values, warning on a non-compliant type or symbol.

        A PhyloXMLWarning is issued (no exception is raised) if ``type`` is
        not a key of ``alphabets`` or ``symbol`` does not match ``re_symbol``.
        ``annotations`` and ``other`` default to new empty lists.
        """
        _check_str(type, self.alphabets.__contains__)
        _check_str(symbol, self.re_symbol.match)
        self.type = type
        self.id_ref = id_ref
        self.id_source = id_source
        self.symbol = symbol
        self.accession = accession
        self.name = name
        self.location = location
        self.mol_seq = mol_seq
        self.uri = uri
        self.domain_architecture = domain_architecture
        self.annotations = annotations or []
        self.other = other or []

    @classmethod
    def from_seqrecord(cls, record, is_aligned=None):
        """Create a new PhyloXML Sequence from a SeqRecord object.

        :Parameters:
            record : SeqRecord
                the record to convert; its id, name, description, seq,
                annotations and features are read
            is_aligned : bool
                whether the sequence is aligned. If None (the default), this
                is taken to be true exactly when the record's alphabet is an
                ``Alphabet.Gapped`` instance.

        The keys read from ``record.annotations`` mirror the dictionary
        produced by `to_seqrecord`.
        """
        if is_aligned is None:
            is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
        params = {
            'accession': Accession(record.id, ''),
            'symbol': record.name,
            'name': record.description,
            'mol_seq': MolSeq(str(record.seq), is_aligned),
        }
        # Leave 'type' unset when the alphabet is none of the known kinds
        if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
            params['type'] = 'dna'
        elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
            params['type'] = 'rna'
        elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
            params['type'] = 'protein'

        # Unpack record.annotations
        for key in ('id_ref', 'id_source', 'location'):
            if key in record.annotations:
                params[key] = record.annotations[key]
        if isinstance(record.annotations.get('uri'), dict):
            params['uri'] = Uri(**record.annotations['uri'])
        # Build a Sequence.annotation object
        if record.annotations.get('annotations'):
            params['annotations'] = []
            for annot in record.annotations['annotations']:
                ann_args = {}
                for key in ('ref', 'source', 'evidence', 'type', 'desc'):
                    if key in annot:
                        ann_args[key] = annot[key]
                if isinstance(annot.get('confidence'), list):
                    ann_args['confidence'] = Confidence(
                            *annot['confidence'])
                if isinstance(annot.get('properties'), list):
                    # Entries that are not dicts are silently skipped
                    ann_args['properties'] = [Property(**prop)
                                              for prop in annot['properties']
                                              if isinstance(prop, dict)]
                params['annotations'].append(Annotation(**ann_args))

        # Unpack record.features
        if record.features:
            params['domain_architecture'] = DomainArchitecture(
                    length=len(record.seq),
                    domains=[ProteinDomain.from_seqfeature(feat)
                             for feat in record.features])

        # Instantiate via cls so that subclasses get instances of themselves
        return cls(**params)

    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        ``self.mol_seq`` must be set, since its ``value`` is used as the
        sequence data. Attributes that are None are left out of the result,
        so the SeqRecord defaults apply to a missing id, name or description.

        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype': prop.datatype,
                                                 'unit': prop.unit,
                                                 'id_ref': prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            # items() exists on both Python 2 and 3, unlike iteritems()
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        # str(None) would yield the bogus identifier 'None'; keep None so
        # clean_dict drops the key and SeqRecord falls back to its default id
        if self.accession is not None:
            seq_id = str(self.accession)
        else:
            seq_id = None

        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                **clean_dict({
                    'id':    seq_id,
                    'name':  self.symbol,
                    'description': self.name,
                    # 'dbxrefs': None,
                    }))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
                'id_ref':       self.id_ref,
                'id_source':    self.id_source,
                'location':     self.location,
                'uri':          self.uri and clean_dict({
                                    'value': self.uri.value,
                                    'desc': self.uri.desc,
                                    'type': self.uri.type,
                                    }),
                'annotations':  self.annotations and [
                    clean_dict({
                        'ref':          ann.ref,
                        'source':       ann.source,
                        'evidence':     ann.evidence,
                        'type':         ann.type,
                        'confidence':   ann.confidence and [
                                            ann.confidence.value,
                                            ann.confidence.type],
                        'properties':   [clean_dict({
                                            'value': prop.value,
                                            'ref': prop.ref,
                                            'applies_to': prop.applies_to,
                                            'datatype': prop.datatype,
                                            'unit': prop.unit,
                                            'id_ref': prop.id_ref })
                                         for prop in ann.properties],
                        }) for ann in self.annotations],
                })
        return seqrec

    def get_alphabet(self):
        """Return the Biopython alphabet matching this sequence.

        The base alphabet is looked up from ``self.type``, falling back to
        ``Alphabet.generic_alphabet`` for a missing or unrecognized type. It
        is wrapped in ``Alphabet.Gapped`` when ``mol_seq`` is set and flagged
        as aligned.
        """
        alph = self.alphabets.get(self.type, Alphabet.generic_alphabet)
        if self.mol_seq and self.mol_seq.is_aligned:
            return Alphabet.Gapped(alph)
        return alph
1170
class SequenceRelation(PhyloElement):
    """A typed relationship between a pair of sequences.

    An orthology between two sequences, for instance, is expressed by setting
    the 'type' attribute to 'orthology'.

    :Parameters:
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        type : restricted string
            describe the type of relationship
        confidence : Confidence
            confidence value for this relation
    """
    # Relationship types accepted without a warning
    ok_type = set([
        'orthology',
        'one_to_one_orthology',
        'super_orthology',
        'paralogy',
        'ultra_paralogy',
        'xenology',
        'unknown',
        'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        """Store the relation, warning if ``type`` is not in ``ok_type``."""
        _check_str(type, lambda value: value in self.ok_type)
        # Required attributes
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        # Optional attributes
        self.distance = distance
        self.confidence = confidence
1201
class Taxonomy(PhyloElement):
    """Describe taxonomic information for a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            keep the authority, such as 'J. G. Cooper, 1863', associated with
            the 'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements
    """
    # The pattern is used with ``match``, which only anchors the start of the
    # string; the trailing \Z makes the whole code subject to the 2-10
    # character limit instead of just its prefix.
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}\Z')
    # Taxonomic ranks accepted without a warning
    ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
        'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
        'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
        'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
        'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
        'unknown', 'other'))

    def __init__(self,
            # Attributes
            id_source=None,
            # Child nodes
            id=None, code=None, scientific_name=None, authority=None,
            rank=None, uri=None,
            # Collections
            common_names=None, synonyms=None, other=None,
            ):
        """Store the given values, warning on a non-compliant code or rank.

        A PhyloXMLWarning is issued (no exception is raised) if ``code`` does
        not match ``re_code`` or ``rank`` is not in ``ok_rank``. The three
        collection arguments default to new empty lists.
        """
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Show an identifying attribute, or the class name if there is none.

        The first attribute that is not None is used, in the order: code,
        scientific_name, rank, id (converted with ``str``).
        """
        for label in (self.code, self.scientific_name, self.rank):
            if label is not None:
                return label
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1277
class Uri(PhyloElement):
    """A uniform resource identifier.

    Usually this will be a URL. A link to an image on a website, for example,
    might carry a ``type`` of 'image' and a ``desc`` of 'image of a California
    sea hare'.
    """

    def __init__(self, value, desc=None, type=None):
        """Store the identifier with its optional description and type."""
        self.value, self.desc, self.type = value, desc, type

    def __str__(self):
        """Return the URI value, or this object's repr if the value is empty."""
        return self.value or repr(self)
1295