Package Bio :: Package Phylo :: Module CDAOIO
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Phylo.CDAOIO

  1  # Copyright (C) 2013 by Ben Morris (ben@bendmorris.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox 
  3  # and Bio.Phylo.Newick, copyright 2009 by Eric Talevich. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license. Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """I/O function wrappers for the RDF/CDAO file format. 
 10   
 11  This is an RDF format that conforms to the Comparative Data Analysis Ontology (CDAO). 
 12  See: http://www.evolutionaryontology.org/cdao 
 13   
 14  This module requires the RDFLib Python package, v3.2.1 or later (https://github.com/RDFLib/rdflib) 
 15   
 16  The CDAOIO.Parser, in addition to parsing text files, can also parse directly 
 17  from a triple store that implements the Redland storage interface; similarly, 
 18  the CDAOIO.Writer can store triples in a triple store instead of serializing 
 19  them to a file. 
 20  """ 
 21   
 22  __docformat__ = "restructuredtext en" 
 23   
 24  from Bio._py3k import StringIO 
 25   
 26  from Bio.Phylo import CDAO 
 27  from ._cdao_owl import cdao_elements, cdao_namespaces, resolve_uri 
 28  import os 
class CDAOError(Exception):
    """Signal that construction of a CDAO object cannot continue.

    This module also raises it at import time when RDFLib is missing or
    older than the minimum supported version.
    """
# RDFLib is a hard requirement of this module; fail at import time with a
# CDAOError if it is missing or too old.
try:
    import rdflib
except ImportError:
    raise CDAOError('Support for CDAO tree format requires RDFlib.')

rdfver = rdflib.__version__
# Compare the complete major-version component rather than only its first
# character, so that a release such as "10.0.0" is not mistaken for "1.x".
if rdfver.split('.')[0] in ("1", "2") or rdfver in ("3.0.0", "3.1.0", "3.2.0"):
    raise CDAOError('Support for CDAO tree format requires RDFlib v3.2.1 or later.')

# Prefix -> namespace URI map: the generic RDF/OWL prefixes plus the
# CDAO-specific ones imported from ._cdao_owl.
RDF_NAMESPACES = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
}
RDF_NAMESPACES.update(cdao_namespaces)
# pad node ids with zeroes until they're at least this length
ZEROES = 8
def qUri(x):
    """Resolve *x* by delegating to resolve_uri with the module's RDF_NAMESPACES.

    Callers in this module pass prefixed names such as 'cdao:has_Root'.
    """
    namespaces = RDF_NAMESPACES
    return resolve_uri(x, namespaces=namespaces)
53
def format_label(x):
    """Return the label *x* with every underscore replaced by a space."""
    pieces = x.split('_')
    return ' '.join(pieces)
56
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a CDAO file handle.

    All keyword arguments are forwarded unchanged to ``Parser.parse``.

    :returns: generator of Bio.Phylo.CDAO.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
67
def write(trees, handle, plain=False, **kwargs):
    """Write trees in CDAO format to the given file handle.

    ``plain`` and any further keyword arguments are forwarded to
    ``Writer.write``.

    :returns: whatever ``Writer.write`` returns (intended to be the number
        of trees written).
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
75
# ---------------------------------------------------------
# Input

class Parser(object):
    """Parse a CDAO tree given a file handle."""

    def __init__(self, handle=None):
        """Remember *handle* and reset all parsing state."""
        self.handle = handle
        self.graph = None
        self.node_info = None
        self.children = {}
        self.rooted = False

    @classmethod
    def from_string(cls, treetext):
        """Return a Parser reading from an in-memory stream of *treetext*."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, **kwargs):
        """Parse the text stream this object was initialized with."""
        self.parse_handle_to_graph(**kwargs)
        return self.parse_graph()

    def _default_base_uri(self):
        """Return a ``file://`` URI for the handle's path, or None if unnamed."""
        # In-memory handles (e.g. the StringIO made by from_string) have no
        # ``name``; they get no default base URI instead of an AttributeError.
        name = getattr(self.handle, 'name', None)
        if name is None:
            return None
        # Windows style slashes cannot be used in an RDF URI
        return "file://" + os.path.abspath(name).replace("\\", "/")

    def parse_handle_to_graph(self, rooted=False,
                              parse_format='turtle', context=None, **kwargs):
        """Parse self.handle into the RDF graph self.graph.

        :param rooted: stored on the parser and passed to every CDAO.Tree
            that parse_graph creates.
        :param parse_format: format name handed to ``rdflib.Graph.parse``.
        :param context: forwarded to parse_graph.
        :param kwargs: may contain ``base_uri``, used as the public ID of the
            parsed document; otherwise one is derived from the handle's name.
        :returns: the generator produced by parse_graph.
        """
        if self.graph is None:
            self.graph = rdflib.Graph()
        graph = self.graph

        for k, v in RDF_NAMESPACES.items():
            graph.bind(k, v)

        self.rooted = rooted

        if 'base_uri' in kwargs:
            base_uri = kwargs['base_uri']
        else:
            base_uri = self._default_base_uri()

        graph.parse(file=self.handle, publicID=base_uri, format=parse_format)

        return self.parse_graph(graph, context=context)

    def parse_graph(self, graph=None, context=None):
        """Generator that yields CDAO.Tree instances from an RDF model."""
        if graph is None:
            graph = self.graph

        # look up branch lengths/TUs for all nodes
        self.get_node_info(graph, context=context)

        for root_node in self.tree_roots:
            clade = self.parse_children(root_node)

            yield CDAO.Tree(root=clade, rooted=self.rooted)

    def new_clade(self, node):
        """Return a CDAO.Clade object for a given named node."""
        result = self.node_info[node]

        kwargs = {}
        if 'branch_length' in result:
            kwargs['branch_length'] = result['branch_length']
        if 'label' in result:
            # same underscore -> space convention the Writer applies to labels
            kwargs['name'] = format_label(result['label'])
        if 'confidence' in result:
            kwargs['confidence'] = result['confidence']

        return CDAO.Clade(**kwargs)

    def get_node_info(self, graph, context=None):
        """Create a dictionary containing information about all nodes in the tree.

        Fills self.obj_info (the predicates of interest for every subject),
        self.nodes, self.tree_roots, self.node_info (branch length, label and
        confidence per node) and self.children (parent -> list of children).
        *context* is accepted but not used here.
        """
        self.node_info = {}
        self.obj_info = {}
        self.children = {}
        self.nodes = set()
        self.tree_roots = set()

        assignments = {
            qUri('cdao:has_Parent'): 'parent',
            qUri('cdao:belongs_to_Edge_as_Child'): 'edge',
            qUri('cdao:has_Annotation'): 'annotation',
            qUri('cdao:has_Value'): 'value',
            qUri('cdao:represents_TU'): 'tu',
            qUri('rdfs:label'): 'label',
            qUri('cdao:has_Support_Value'): 'confidence',
        }

        # resolve the constant URIs once instead of once per triple
        rdf_type = qUri('rdf:type')
        has_root = qUri('cdao:has_Root')
        node_types = (qUri('cdao:AncestralNode'), qUri('cdao:TerminalNode'))

        for s, v, o in graph:
            # process each RDF triple in the graph sequentially

            s, v, o = str(s), str(v), str(o)

            if s not in self.obj_info:
                self.obj_info[s] = {}
            this = self.obj_info[s]

            if v in assignments:
                # if the predicate is one we care about, store information for later
                this[assignments[v]] = o

            if v == rdf_type:
                if o in node_types:
                    # this is a tree node; store it in set of all nodes
                    self.nodes.add(s)
            if v == has_root:
                # this is a tree; store its root in set of all tree roots
                self.tree_roots.add(o)

        for node in self.nodes:
            # for each node, look up all information needed to create a CDAO.Clade
            self.node_info[node] = {}
            node_info = self.node_info[node]

            obj = self.obj_info[node]
            if 'edge' in obj:
                # if this object points to an edge, we need a branch length from
                # the annotation on that edge
                edge = self.obj_info[obj['edge']]
                if 'annotation' in edge:
                    annotation = self.obj_info[edge['annotation']]
                    if 'value' in annotation:
                        node_info['branch_length'] = float(annotation['value'])

            if 'tu' in obj:
                # if this object points to a TU, we need the label of that TU
                tu = self.obj_info[obj['tu']]
                if 'label' in tu:
                    node_info['label'] = tu['label']

            if 'confidence' in obj:
                # the support value is attached to the node itself; copy it so
                # that new_clade can hand it to the clade
                support = obj['confidence']
                try:
                    support = float(support)
                except ValueError:
                    # keep a non-numeric support value verbatim rather than
                    # failing the whole parse
                    pass
                node_info['confidence'] = support

            if 'parent' in obj:
                # store this node as a child of its parent, if it has one,
                # so that the tree can be traversed from parent to children
                parent = obj['parent']
                if parent not in self.children:
                    self.children[parent] = []
                self.children[parent].append(node)

    def parse_children(self, node):
        """Return a CDAO.Clade for *node* with its descendants attached.

        Calls itself recursively for each child, traversing the entire tree
        and creating a nested structure of CDAO.Clade objects.
        """
        clade = self.new_clade(node)

        children = self.children.get(node, [])
        clade.clades = [self.parse_children(child_node) for child_node in children]

        return clade
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    # prefix -> namespace URI map used for the @prefix header and for
    # abbreviating URIs in the emitted statements
    prefixes = RDF_NAMESPACES

    def __init__(self, trees):
        """Store *trees* and reset the counters used to number emitted resources."""
        self.trees = trees

        self.node_counter = 0
        self.edge_counter = 0
        self.tu_counter = 0
        self.tree_counter = 0

    def write(self, handle, tree_uri='', record_complete_ancestry=False,
              rooted=False, **kwargs):
        """Write this instance's trees to a file handle.

        :param handle: object with a ``write`` method receiving the text.
        :param tree_uri: if non-empty, written as the ``@base`` of the
            document (a trailing '/' is appended when missing).
        :param record_complete_ancestry: if true, every node also gets a
            ``cdao:has_Ancestor`` statement for each of its ancestors.
        :param rooted: selects ``cdao:RootedTree`` over ``cdao:UnrootedTree``
            as the type of each tree.
        :param kwargs: accepted and ignored.
        :returns: number of trees written by this call.
        """
        self.rooted = rooted
        self.record_complete_ancestry = record_complete_ancestry

        if tree_uri and not tree_uri.endswith('/'):
            tree_uri += '/'

        if tree_uri:
            # a Turtle @base directive must be terminated by '.'
            handle.write('@base <%s> .\n' % tree_uri)
        for k, v in self.prefixes.items():
            handle.write('@prefix %s: <%s> .\n' % (k, v))

        handle.write('<%s> a owl:Ontology .\n' % self.prefixes['cdao'])

        count = 0
        for tree in self.trees:
            self.tree_counter += 1
            self.tree_uri = 'tree%s'

            first_clade = tree.clade
            for stmt in self.process_clade(first_clade, root=tree):
                self.add_stmt_to_handle(handle, stmt)
            count += 1

        return count

    def add_stmt_to_handle(self, handle, stmt):
        """Serialize one statement (an iterable of terms) as a line on *handle*."""
        # apply URI prefixes
        stmt_strings = []
        for part in stmt:
            if isinstance(part, rdflib.URIRef):
                node_uri = str(part)
                changed = False
                for prefix, uri in self.prefixes.items():
                    if node_uri.startswith(uri):
                        node_uri = node_uri.replace(uri, '%s:' % prefix, 1)
                        if node_uri == 'rdf:type':
                            node_uri = 'a'
                        changed = True
                if changed or ':' in node_uri:
                    stmt_strings.append(node_uri)
                else:
                    # bare relative reference: wrap it in angle brackets
                    stmt_strings.append('<%s>' % node_uri)

            elif isinstance(part, rdflib.Literal):
                stmt_strings.append(part.n3())

            else:
                stmt_strings.append(str(part))

        handle.write('%s .\n' % ' '.join(stmt_strings))

    def process_clade(self, clade, parent=None, root=False):
        """Recursively generate triples describing a tree of clades.

        Sets ``clade.uri`` and ``clade.ancestors`` as a side effect.

        :param clade: the clade to describe.
        :param parent: the already processed parent clade, or None for the
            first clade of a tree.
        :param root: the tree object when *clade* is its first clade,
            otherwise False.
        :returns: generator of (subject, predicate, object) tuples.
        """
        self.node_counter += 1
        clade.uri = 'node%s' % str(self.node_counter).zfill(ZEROES)
        if parent:
            clade.ancestors = parent.ancestors + [parent.uri]
        else:
            clade.ancestors = []

        def nUri(s):
            # reference relative to the document base
            return rdflib.URIRef(s)

        def pUri(s):
            # prefixed name resolved through qUri
            return rdflib.URIRef(qUri(s))

        tree_id = nUri('')
        node_ref = nUri(clade.uri)
        xsd_decimal = rdflib.URIRef('http://www.w3.org/2001/XMLSchema#decimal')

        statements = []

        if root is not False:
            # create a cdao:RootedTree with reference to the tree root
            tree_type = pUri('cdao:RootedTree') if self.rooted else pUri('cdao:UnrootedTree')

            statements += [
                (tree_id, pUri('rdf:type'), tree_type),
                (tree_id, pUri('cdao:has_Root'), node_ref),
            ]

            for predicate, obj in getattr(root, 'attributes', []):
                statements.append((tree_id, predicate, obj))

        if clade.name:
            # create TU
            self.tu_counter += 1
            tu_uri = 'tu%s' % str(self.tu_counter).zfill(ZEROES)
            tu_ref = nUri(tu_uri)

            statements += [
                (tu_ref, pUri('rdf:type'), pUri('cdao:TU')),
                (node_ref, pUri('cdao:represents_TU'), tu_ref),
                (tu_ref, pUri('rdfs:label'), rdflib.Literal(format_label(clade.name))),
            ]

            for predicate, obj in getattr(clade, 'tu_attributes', []):
                yield (tu_ref, predicate, obj)

        # create this node
        node_type = 'cdao:TerminalNode' if clade.is_terminal() else 'cdao:AncestralNode'
        statements += [
            (node_ref, pUri('rdf:type'), pUri(node_type)),
            (node_ref, pUri('cdao:belongs_to_Tree'), tree_id),
        ]

        if parent is not None:
            # create edge from the parent node to this node
            self.edge_counter += 1
            edge_uri = 'edge%s' % str(self.edge_counter).zfill(ZEROES)
            edge_ref = nUri(edge_uri)
            parent_ref = nUri(parent.uri)

            statements += [
                (edge_ref, pUri('rdf:type'), pUri('cdao:DirectedEdge')),
                (edge_ref, pUri('cdao:belongs_to_Tree'), tree_id),
                (edge_ref, pUri('cdao:has_Parent_Node'), parent_ref),
                (edge_ref, pUri('cdao:has_Child_Node'), node_ref),
                (node_ref, pUri('cdao:belongs_to_Edge_as_Child'), edge_ref),
                (node_ref, pUri('cdao:has_Parent'), parent_ref),
                (parent_ref, pUri('cdao:belongs_to_Edge_as_Parent'), edge_ref),
            ]

            if getattr(clade, 'confidence', None) is not None:
                # same datatype form as the branch length literal below
                confidence = rdflib.Literal(clade.confidence, datatype=xsd_decimal)

                statements += [(node_ref, pUri('cdao:has_Support_Value'), confidence)]

            if self.record_complete_ancestry and len(clade.ancestors) > 0:
                statements += [(node_ref, pUri('cdao:has_Ancestor'), nUri(ancestor))
                               for ancestor in clade.ancestors]

            if clade.branch_length is not None:
                # add branch length; the annotation hangs off the edge, so it
                # is only emitted when an edge exists
                edge_ann_uri = 'edge_annotation%s' % str(self.edge_counter).zfill(ZEROES)
                edge_ann_ref = nUri(edge_ann_uri)

                branch_length = rdflib.Literal(clade.branch_length, datatype=xsd_decimal)
                statements += [
                    (edge_ann_ref, pUri('rdf:type'), pUri('cdao:EdgeLength')),
                    (edge_ref, pUri('cdao:has_Annotation'), edge_ann_ref),
                    (edge_ann_ref, pUri('cdao:has_Value'), branch_length),
                ]

            for predicate, obj in getattr(clade, 'edge_attributes', []):
                yield (edge_ref, predicate, obj)

        for stmt in statements:
            yield stmt

        for predicate, obj in getattr(clade, 'attributes', []):
            yield (node_ref, predicate, obj)

        if not clade.is_terminal():
            for new_clade in clade.clades:
                for stmt in self.process_clade(new_clade, parent=clade, root=False):
                    yield stmt
414