Package Bio :: Package Phylo :: Module NewickIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.NewickIO

  1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license. Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """I/O function wrappers for the Newick file format. 
  9   
 10  See: http://evolution.genetics.washington.edu/phylip/newick_doc.html 
 11  """ 
 12  __docformat__ = "restructuredtext en" 
 13   
 14  import re 
 15  from Bio._py3k import StringIO 
 16   
 17  from Bio.Phylo import Newick 
class NewickError(Exception):
    """Exception raised when Newick object construction cannot continue.

    Behaves exactly like the built-in `Exception`; it exists only so callers
    can catch Newick-specific failures by type.
    """
# Regular expressions for the tokens of a Newick string, each paired with a
# descriptive name.  Order matters: the patterns are joined into a single
# alternation below and the regex engine tries them from first to last.
tokens = [
    (r"\(", 'open parens'),
    (r"\)", 'close parens'),
    (r"[^\s\(\)\[\]\'\:\;\,]+", 'unquoted node label'),
    # The optional sign lets a negative (or explicitly positive) length such
    # as ":-0.1" be read as one token.  Without it the ':' matched no pattern
    # and "-0.1" was picked up as an unquoted node label instead.
    (r"\:[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?", 'edge length'),
    (r"\,", 'comma'),
    (r"\[(\\.|[^\]])*\]", 'comment'),
    (r"\'(\\.|[^\'])*\'", 'quoted node label'),
    (r"\;", 'semicolon'),
    (r"\n", 'newline'),
]
# One pattern that matches any single token.
tokenizer = re.compile('(%s)' % '|'.join(token[0] for token in tokens))
# Token name -> compiled pattern for that token alone.
token_dict = dict((name, re.compile(token)) for (token, name) in tokens)
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a Newick file handle.

    All keyword arguments are forwarded unchanged to `Parser.parse`.

    :returns: generator of Bio.Phylo.Newick.Tree objects.
    """
    newick_parser = Parser(handle)
    return newick_parser.parse(**kwargs)
49
def write(trees, handle, plain=False, **kwargs):
    """Write trees in Newick format to the given file handle.

    `plain` and any further keyword arguments are forwarded unchanged to
    `Writer.write`.

    :returns: number of trees written.
    """
    tree_writer = Writer(trees)
    return tree_writer.write(handle, plain=plain, **kwargs)
57
58 59 # --------------------------------------------------------- 60 # Input 61 62 -def _parse_confidence(text):
63 if text.isdigit(): 64 return int(text) 65 # NB: Could make this more consistent by treating as a percentage 66 # return int(text) / 100. 67 try: 68 return float(text) 69 # NB: This should be in [0.0, 1.0], but who knows what people will do 70 # assert 0 <= current_clade.confidence <= 1 71 except ValueError: 72 return None
73
74 75 -def _format_comment(text):
76 return '[%s]' % (text.replace('[', '\\[').replace(']', '\\]'))
77
def _get_comment(clade):
    """Return the clade's comment in bracketed form, or '' if there is none.

    A clade with no `comment` attribute, or with an empty one, yields ''.
    """
    if not hasattr(clade, 'comment') or not clade.comment:
        return ''
    return _format_comment(str(clade.comment))
83
class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in `Bio.Nexus.Trees`.
    """

    def __init__(self, handle):
        """Remember the handle (an iterable of text lines) to read from."""
        self.handle = handle

    @classmethod
    def from_string(cls, treetext):
        """Create a parser that reads from the given Newick string."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, values_are_confidence=False, comments_are_confidence=False, rooted=False):
        """Parse the text stream this object was initialized with.

        Lines are accumulated (trailing whitespace removed) until the buffer
        ends with ';', at which point one tree is parsed and yielded.

        :param values_are_confidence: store the number after ':' as the
            clade's confidence instead of its branch length.
        :param comments_are_confidence: try to read each bracketed comment
            as a numeric confidence value.
        :param rooted: passed on as the `rooted` flag of every tree.
        :returns: generator of Newick.Tree objects.
        :raises NewickError: if the first line starts with a byte order mark
            character.
        """
        self.values_are_confidence = values_are_confidence
        self.comments_are_confidence = comments_are_confidence
        self.rooted = rooted
        buf = ''
        bom_checked = False
        bom_prefixes = ("\xef", "\xff", "\xfe", "\x00")
        for line in self.handle:
            if not bom_checked:
                # check for unicode byte order marks on first line only,
                # these lead to parsing errors (on Python 2)
                if line.startswith(bom_prefixes):
                    raise NewickError("The file or stream you attempted to parse includes "
                                      "unicode byte order marks. You must convert it to "
                                      "ASCII before it can be parsed.")
                bom_checked = True
            buf += line.rstrip()
            if buf.endswith(';'):
                yield self._parse_tree(buf)
                buf = ''
        if buf:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(buf)

    def _parse_tree(self, text):
        """Parse the text representation of one tree into a Tree object.

        :raises NewickError: on unbalanced parentheses or on any token
            following the terminating ';'.
        """
        matches = re.finditer(tokenizer, text.strip())

        new_clade = self.new_clade
        root_clade = new_clade()
        current_clade = root_clade

        lp_count = 0
        rp_count = 0
        for match in matches:
            token = match.group()

            if token.startswith("'"):
                # quoted label; strip the quotes and use it as the clade name
                current_clade.name = token[1:-1]

            elif token.startswith('['):
                # comment
                current_clade.comment = token[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    current_clade.confidence = _parse_confidence(current_clade.comment)

            elif token == '(':
                # start a new clade, which is a child of the current clade
                current_clade = new_clade(current_clade)
                lp_count += 1

            elif token == ',':
                # if the current clade is the root, then the external
                # parentheses are missing and a new root should be created
                if current_clade is root_clade:
                    root_clade = new_clade()
                    current_clade.parent = root_clade
                # start a new child clade at the same level as the current clade
                parent = self.process_clade(current_clade)
                current_clade = new_clade(parent)

            elif token == ')':
                # done adding children for this parent clade
                parent = self.process_clade(current_clade)
                if parent is None:
                    raise NewickError('Parenthesis mismatch.')
                current_clade = parent
                rp_count += 1

            elif token == ';':
                break

            elif token.startswith(':'):
                # branch length or confidence
                value = float(token[1:])
                if self.values_are_confidence:
                    current_clade.confidence = value
                else:
                    current_clade.branch_length = value

            elif token == '\n':
                pass

            else:
                # unquoted node label
                current_clade.name = token

        if lp_count != rp_count:
            raise NewickError('Number of open/close parentheses do not match.')

        # if ; token broke out of for loop, there should be no remaining tokens
        next_token = next(matches, None)
        if next_token is not None:
            raise NewickError('Text after semicolon in Newick tree: %s'
                              % next_token.group())

        self.process_clade(current_clade)
        self.process_clade(root_clade)
        return Newick.Tree(root=root_clade, rooted=self.rooted)

    def new_clade(self, parent=None):
        """Returns a new Newick.Clade, optionally with a temporary reference
        to its parent clade."""
        clade = Newick.Clade()
        # Identity test: whether a parent was supplied must not depend on the
        # boolean value of the parent clade object.
        if parent is not None:
            clade.parent = parent
        return clade

    def process_clade(self, clade):
        """Final processing of a parsed clade. Removes the node's parent and
        returns it.

        When neither values nor comments are being read as confidence and the
        clade has no confidence yet, a numeric name is moved into the
        confidence attribute.  Returns None if the clade has no temporary
        parent reference.
        """
        if (clade.name
                and not (self.values_are_confidence or self.comments_are_confidence)
                and clade.confidence is None):
            clade.confidence = _parse_confidence(clade.name)
            if clade.confidence is not None:
                clade.name = None

        if hasattr(clade, 'parent'):
            parent = clade.parent
            parent.clades.append(clade)
            del clade.parent
            return parent
        return None
232
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    def __init__(self, trees):
        """Remember the iterable of trees to be written."""
        self.trees = trees

    def write(self, handle, **kwargs):
        """Write this instance's trees to a file handle.

        Each tree is written on its own line.  Keyword arguments are passed
        on to `to_strings`.

        :returns: number of trees written.
        """
        count = 0
        for treestr in self.to_strings(**kwargs):
            handle.write(treestr + '\n')
            count += 1
        return count

    def to_strings(self, confidence_as_branch_length=False,
                   branch_length_only=False, plain=False,
                   plain_newick=True, ladderize=None, max_confidence=1.0,
                   format_confidence='%1.2f', format_branch_length='%1.5f'):
        """Return an iterable of PAUP-compatible tree lines.

        :param confidence_as_branch_length: write each internal clade's
            confidence where the branch length would go; terminal clades get
            `max_confidence`.
        :param branch_length_only: write branch lengths and no confidence.
        :param plain: write neither branch lengths nor confidence; ignored
            when either of the two options above is set.
        :param plain_newick: if true, yield only the Newick string; otherwise
            prefix it with 'tree <name> =' and the weight/rooted tags.
        :param ladderize: one of 'left', 'LEFT', 'right', 'RIGHT' to call
            `tree.ladderize` before writing; any other value skips that call.
        :param max_confidence: value written for terminal clades when
            `confidence_as_branch_length` is set.
        :param format_confidence: %-format string for confidence values.
        :param format_branch_length: %-format string for branch lengths.
        """
        # If there's a conflict in the arguments, we override plain=True
        if confidence_as_branch_length or branch_length_only:
            plain = False
        make_info_string = self._info_factory(
            plain, confidence_as_branch_length, branch_length_only,
            max_confidence, format_confidence, format_branch_length)

        def newickize(clade):
            """Convert a node tree to a Newick tree string, recursively."""
            label = clade.name or ''
            if label:
                unquoted_label = re.match(token_dict['unquoted node label'], label)
                # Quote the label unless the unquoted pattern covers all of it
                if (not unquoted_label) or (unquoted_label.end() < len(label)):
                    label = "'%s'" % label.replace('\\', '\\\\').replace("'", "\\'")

            if clade.is_terminal():
                return label + make_info_string(clade, terminal=True)
            subtrees = (newickize(sub) for sub in clade)
            return '(%s)%s' % (','.join(subtrees),
                               label + make_info_string(clade))

        # Convert each tree to a string
        for tree in self.trees:
            if ladderize in ('left', 'LEFT', 'right', 'RIGHT'):
                # Nexus compatibility shim, kind of
                tree.ladderize(reverse=(ladderize in ('right', 'RIGHT')))
            rawtree = newickize(tree.root) + ';'
            if plain_newick:
                yield rawtree
                continue
            # Nexus-style (?) notation before the raw Newick tree
            treeline = ['tree', (tree.name or 'a_tree'), '=']
            if tree.weight != 1:
                treeline.append('[&W%s]' % round(float(tree.weight), 3))
            if tree.rooted:
                treeline.append('[&R]')
            treeline.append(rawtree)
            yield ' '.join(treeline)

    def _info_factory(self, plain, confidence_as_branch_length,
                      branch_length_only, max_confidence, format_confidence,
                      format_branch_length):
        """Return a function that creates a nicely formatted node tag.

        The returned function takes `(clade, terminal=False)` and returns the
        text that follows the clade's label: optional confidence and/or
        branch length, then the bracketed comment if the clade has one.
        """
        if plain:
            # Plain tree only. That's easy.
            def make_info_string(clade, terminal=False):
                return _get_comment(clade)

        elif confidence_as_branch_length:
            # Support as branchlengths (eg. PAUP), ignore actual branchlengths
            def make_info_string(clade, terminal=False):
                if terminal:
                    # terminal branches have 100% support
                    return (':' + format_confidence % max_confidence) + _get_comment(clade)
                confidence = getattr(clade, 'confidence', None)
                if confidence is None:
                    # No support value to report for this clade; formatting
                    # None with a float format would raise TypeError.
                    return _get_comment(clade)
                return (':' + format_confidence % confidence) + _get_comment(clade)

        elif branch_length_only:
            # write only branchlengths, ignore support
            def make_info_string(clade, terminal=False):
                # A missing branch length is written as 0.0, matching the
                # combined support/branch-length formatter below.
                return (':' + format_branch_length % (clade.branch_length or 0.0)
                        ) + _get_comment(clade)

        else:
            # write support and branchlengths (e.g. .con tree of mrbayes)
            def make_info_string(clade, terminal=False):
                if (terminal or
                        not hasattr(clade, 'confidence') or
                        clade.confidence is None):
                    return (':' + format_branch_length
                            ) % (clade.branch_length or 0.0) + _get_comment(clade)
                return (format_confidence + ':' + format_branch_length
                        ) % (clade.confidence, clade.branch_length or 0.0) + _get_comment(clade)

        return make_info_string