Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   7  """Nexus class. Parse the contents of a NEXUS file. 
   8   
   9  Based upon 'NEXUS: An extensible file format for systematic information' 
  10  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  11  """ 
  12  from __future__ import print_function 
  13   
  14  from Bio._py3k import zip 
  15  from Bio._py3k import range 
  16  from Bio._py3k import basestring 
  17   
  18  from functools import reduce 
  19  import copy 
  20  import math 
  21  import random 
  22  import sys 
  23   
  24  from Bio import File 
  25  from Bio.Alphabet import IUPAC 
  26  from Bio.Data import IUPACData 
  27  from Bio.Seq import Seq 
  28   
  29  from .Trees import Tree 
  30   
  31  __docformat__ = "restructuredtext en" 
  32   
  33  INTERLEAVE = 70 
  34  SPECIAL_COMMANDS = ['charstatelabels', 'charlabels', 'taxlabels', 'taxset', 
  35                      'charset', 'charpartition', 'taxpartition', 'matrix', 
  36                      'tree', 'utree', 'translate', 'codonposset', 'title'] 
  37  KNOWN_NEXUS_BLOCKS = ['trees', 'data', 'characters', 'taxa', 'sets', 'codons'] 
  38  PUNCTUATION = '()[]{}/\,;:=*\'"`+-<>' 
  39  MRBAYESSAFE = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  40  WHITESPACE = ' \t\n' 
  41  # SPECIALCOMMENTS = ['!','&','%','/','\\','@'] # original list of special comments 
  42  SPECIALCOMMENTS = ['&']  # supported special comment ('tree' command), all others are ignored 
  43  CHARSET = 'chars' 
  44  TAXSET = 'taxa' 
  45  CODONPOSITIONS = 'codonpositions' 
  46  DEFAULTNEXUS = '#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
  47   
  48   
49 -class NexusError(Exception):
50 pass
51 52
53 -class CharBuffer(object):
54 """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE). 55 56 This class is not intended for public use (any more). 57 """
58 - def __init__(self, string):
59 if string: 60 self.buffer = list(string) 61 else: 62 self.buffer = []
63
64 - def peek(self):
65 if self.buffer: 66 return self.buffer[0] 67 else: 68 return None
69
70 - def peek_nonwhitespace(self):
71 b = ''.join(self.buffer).strip() 72 if b: 73 return b[0] 74 else: 75 return None
76
77 - def __next__(self):
78 if self.buffer: 79 return self.buffer.pop(0) 80 else: 81 return None
82 83 if sys.version_info[0] < 3:
84 - def next(self):
85 """Deprecated Python 2 style alias for Python 3 style __next__ method.""" 86 return self.__next__()
87
88 - def next_nonwhitespace(self):
89 while True: 90 p = next(self) 91 if p is None: 92 break 93 if p not in WHITESPACE: 94 return p 95 return None
96
97 - def skip_whitespace(self):
98 while self.buffer[0] in WHITESPACE: 99 self.buffer = self.buffer[1:]
100
101 - def next_until(self, target):
102 for t in target: 103 try: 104 pos = self.buffer.index(t) 105 except ValueError: 106 pass 107 else: 108 found = ''.join(self.buffer[:pos]) 109 self.buffer = self.buffer[pos:] 110 return found 111 else: 112 return None
113
114 - def peek_word(self, word):
115 return ''.join(self.buffer[:len(word)]) == word
116
117 - def next_word(self):
118 """Return the next NEXUS word from a string. 119 120 This deals with single and double quotes, whitespace and punctuation. 121 """ 122 123 word = [] 124 quoted = False 125 first = self.next_nonwhitespace() # get first character 126 if not first: # return empty if only whitespace left 127 return None 128 word.append(first) 129 if first == "'": # word starts with a quote 130 quoted = "'" 131 elif first == '"': 132 quoted = '"' 133 elif first in PUNCTUATION: # if it's punctuation, return immediately 134 return first 135 while True: 136 c = self.peek() 137 if c == quoted: # a quote? 138 word.append(next(self)) # store quote 139 if self.peek() == quoted: # double quote 140 next(self) # skip second quote 141 elif quoted: # second single quote ends word 142 break 143 elif quoted: 144 word.append(next(self)) # if quoted, then add anything 145 elif not c or c in PUNCTUATION or c in WHITESPACE: 146 # if not quoted and special character, stop 147 break 148 else: 149 word.append(next(self)) # standard character 150 return ''.join(word)
151
152 - def rest(self):
153 """Return the rest of the string without parsing.""" 154 return ''.join(self.buffer)
155 156
157 -class StepMatrix(object):
158 """Calculate a stepmatrix for weighted parsimony. 159 160 See Wheeler (1990), Cladistics 6:269-275. 161 """ 162
163 - def __init__(self, symbols, gap):
164 self.data = {} 165 self.symbols = sorted(symbols) 166 if gap: 167 self.symbols.append(gap) 168 for x in self.symbols: 169 for y in [s for s in self.symbols if s != x]: 170 self.set(x, y, 0)
171
172 - def set(self, x, y, value):
173 if x > y: 174 x, y = y, x 175 self.data[x + y] = value
176
177 - def add(self, x, y, value):
178 if x > y: 179 x, y = y, x 180 self.data[x + y] += value
181
182 - def sum(self):
183 return reduce(lambda x, y: x + y, self.data.values())
184
185 - def transformation(self):
186 total = self.sum() 187 if total != 0: 188 for k in self.data: 189 self.data[k] = self.data[k] / float(total) 190 return self
191
192 - def weighting(self):
193 for k in self.data: 194 if self.data[k] != 0: 195 self.data[k] = -math.log(self.data[k]) 196 return self
197
198 - def smprint(self, name='your_name_here'):
199 matrix = 'usertype %s stepmatrix=%d\n' % (name, len(self.symbols)) 200 matrix += ' %s\n' % ' '.join(self.symbols) 201 for x in self.symbols: 202 matrix += '[%s]'.ljust(8) % x 203 for y in self.symbols: 204 if x == y: 205 matrix += ' . ' 206 else: 207 if x > y: 208 x1, y1 = y, x 209 else: 210 x1, y1 = x, y 211 if self.data[x1 + y1] == 0: 212 matrix += 'inf. ' 213 else: 214 matrix += '%2.2f'.ljust(10) % (self.data[x1 + y1]) 215 matrix += '\n' 216 matrix += ';\n' 217 return matrix
218 219
220 -def safename(name, mrbayes=False):
221 """Return a taxon identifier according to NEXUS standard. 222 223 Wrap quotes around names with punctuation or whitespace, and double 224 single quotes. 225 226 mrbayes=True: write names without quotes, whitespace or punctuation 227 for the mrbayes software package. 228 """ 229 if mrbayes: 230 safe = name.replace(' ', '_') 231 safe = ''.join(c for c in safe if c in MRBAYESSAFE) 232 else: 233 safe = name.replace("'", "''") 234 if set(safe).intersection(set(WHITESPACE + PUNCTUATION)): 235 safe = "'" + safe + "'" 236 return safe
237 238
239 -def quotestrip(word):
240 """Remove quotes and/or double quotes around identifiers.""" 241 if not word: 242 return None 243 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 244 word = word[1:-1] 245 return word
246 247
248 -def get_start_end(sequence, skiplist=None):
249 """Return position of first and last character which is not in skiplist. 250 251 Skiplist defaults to ['-','?']. 252 """ 253 if skiplist is None: 254 skiplist = ["-", "?"] 255 256 length = len(sequence) 257 if length == 0: 258 return None, None 259 end = length - 1 260 while end >= 0 and (sequence[end] in skiplist): 261 end -= 1 262 start = 0 263 while start < length and (sequence[start] in skiplist): 264 start += 1 265 if start == length and end == -1: # empty sequence 266 return -1, -1 267 else: 268 return start, end
269 270
271 -def _sort_keys_by_values(p):
272 """Returns a sorted list of keys of p sorted by values of p.""" 273 return sorted((pn for pn in p if p[pn]), key=lambda pn: p[pn])
274 275
276 -def _make_unique(l):
277 """Check that all values in list are unique and return a pruned and sorted list.""" 278 return sorted(set(l))
279 280
281 -def _unique_label(previous_labels, label):
282 """Returns a unique name if label is already in previous_labels.""" 283 while label in previous_labels: 284 if label.split('.')[-1].startswith('copy'): 285 label = '.'.join(label.split('.')[:-1]) \ 286 + '.copy' + str(eval('0' + label.split('.')[-1][4:]) + 1) 287 else: 288 label += '.copy' 289 return label
290 291
292 -def _seqmatrix2strmatrix(matrix):
293 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 294 return dict((t, str(matrix[t])) for t in matrix)
295 296
297 -def _compact4nexus(orig_list):
298 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 299 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 300 """ 301 302 if not orig_list: 303 return '' 304 orig_list = sorted(set(orig_list)) 305 shortlist = [] 306 clist = orig_list[:] 307 clist.append(clist[-1] + .5) # dummy value makes it easier 308 while len(clist) > 1: 309 step = 1 310 for i, x in enumerate(clist): 311 if x == clist[0] + i * step: # are we still in the right step? 312 continue 313 elif i == 1 and len(clist) > 3 and clist[i + 1] - x == x - clist[0]: 314 # second element, and possibly at least 3 elements to link, 315 # and the next one is in the right step 316 step = x - clist[0] 317 else: # pattern broke, add all values before current position to new list 318 sub = clist[:i] 319 if len(sub) == 1: 320 shortlist.append(str(sub[0] + 1)) 321 else: 322 if step == 1: 323 shortlist.append('%d-%d' % (sub[0] + 1, sub[-1] + 1)) 324 else: 325 shortlist.append('%d-%d\\%d' % (sub[0] + 1, sub[-1] + 1, step)) 326 clist = clist[i:] 327 break 328 return ' '.join(shortlist)
329 330
331 -def combine(matrices):
332 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 333 334 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 335 Character sets, character partitions and taxon sets are prefixed, readjusted 336 and present in the combined matrix. 337 """ 338 339 if not matrices: 340 return None 341 name = matrices[0][0] 342 combined = copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 343 mixed_datatypes = (len(set(n[1].datatype for n in matrices)) > 1) 344 if mixed_datatypes: 345 # dealing with mixed matrices is application specific. 346 # You take care of that yourself! 347 combined.datatype = 'None' 348 # raise NexusError('Matrices must be of same datatype') 349 combined.charlabels = None 350 combined.statelabels = None 351 combined.interleave = False 352 combined.translate = None 353 354 # rename taxon sets and character sets and name them with prefix 355 for cn, cs in combined.charsets.items(): 356 combined.charsets['%s.%s' % (name, cn)] = cs 357 del combined.charsets[cn] 358 for tn, ts in combined.taxsets.items(): 359 combined.taxsets['%s.%s' % (name, tn)] = ts 360 del combined.taxsets[tn] 361 # previous partitions usually don't make much sense in combined matrix 362 # just initiate one new partition parted by single matrices 363 combined.charpartitions = {'combined': {name: list(range(combined.nchar))}} 364 for n, m in matrices[1:]: # add all other matrices 365 both = [t for t in combined.taxlabels if t in m.taxlabels] 366 combined_only = [t for t in combined.taxlabels if t not in both] 367 m_only = [t for t in m.taxlabels if t not in both] 368 for t in both: 369 # concatenate sequences and unify gap and missing character symbols 370 combined.matrix[t] += Seq(str(m.matrix[t]).replace(m.gap, combined.gap).replace(m.missing, combined.missing), combined.alphabet) 371 # replace date of missing taxa with symbol for missing data 372 for t in combined_only: 373 combined.matrix[t] += Seq(combined.missing * m.nchar, combined.alphabet) 374 for t in m_only: 375 combined.matrix[t] = Seq(combined.missing * combined.nchar, combined.alphabet) + \ 376 Seq(str(m.matrix[t]).replace(m.gap, combined.gap).replace(m.missing, combined.missing), combined.alphabet) 377 combined.taxlabels.extend(m_only) # new taxon list 378 for cn, cs in m.charsets.items(): # adjust character sets for new matrix 379 combined.charsets['%s.%s' % (n, cn)] = [x + combined.nchar for x in cs] 380 if m.taxsets: 381 if not combined.taxsets: 382 combined.taxsets = {} 383 # update taxon sets 384 combined.taxsets.update(dict(('%s.%s' % (n, tn), ts) 385 for tn, ts in m.taxsets.items())) 386 # update new charpartition 387 combined.charpartitions['combined'][n] = list(range(combined.nchar, combined.nchar + m.nchar)) 388 # update charlabels 389 if m.charlabels: 390 if not combined.charlabels: 391 combined.charlabels = {} 392 combined.charlabels.update(dict((combined.nchar + i, label) 393 for (i, label) in m.charlabels.items())) 394 combined.nchar += m.nchar # update nchar and ntax 395 combined.ntax += len(m_only) 396 397 # some prefer partitions, some charsets: 398 # make separate charset for ecah initial dataset 399 for c in combined.charpartitions['combined']: 400 combined.charsets[c] = combined.charpartitions['combined'][c] 401 402 return combined
403 404
405 -def _kill_comments_and_break_lines(text):
406 """Delete []-delimited comments out of a file and break into lines separated by ';'. 407 408 stripped_text=_kill_comments_and_break_lines(text): 409 Nested and multiline comments are allowed. [ and ] symbols within single 410 or double quotes are ignored, newline ends a quote, all symbols with quotes are 411 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 412 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 413 Quotes inside special [& and [\ are treated as normal characters, 414 but no nesting inside these special comments allowed (like [& [\ ]]). 415 ';' ist deleted from end of line. 416 417 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 418 """ 419 contents = iter(text) 420 newtext = [] 421 newline = [] 422 quotelevel = '' 423 speciallevel = False 424 commlevel = 0 425 # Parse with one character look ahead (for special comments) 426 t2 = next(contents) 427 while True: 428 t = t2 429 try: 430 t2 = next(contents) 431 except StopIteration: 432 t2 = None 433 if t is None: 434 break 435 if t == quotelevel and not (commlevel or speciallevel): 436 # matching quote ends quotation 437 quotelevel = '' 438 elif not quotelevel and not (commlevel or speciallevel) and (t == '"' or t == "'"): 439 # single or double quote starts quotation 440 quotelevel = t 441 elif not quotelevel and t == '[': 442 # opening bracket outside a quote 443 if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel: 444 speciallevel = True 445 else: 446 commlevel += 1 447 elif not quotelevel and t == ']': 448 # closing bracket ioutside a quote 449 if speciallevel: 450 speciallevel = False 451 else: 452 commlevel -= 1 453 if commlevel < 0: 454 raise NexusError('Nexus formatting error: unmatched ]') 455 continue 456 if commlevel == 0: 457 # copy if we're not in comment 458 if t == ';' and not quotelevel: 459 newtext.append(''.join(newline)) 460 newline = [] 461 else: 462 newline.append(t) 463 # level of comments should be 0 at the end of the file 464 if newline: 465 newtext.append('\n'.join(newline)) 466 if commlevel > 0: 467 raise NexusError('Nexus formatting error: unmatched [') 468 return newtext
469 470
471 -def _adjust_lines(lines):
472 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 473 474 list_of_commandlines=_adjust_lines(input_text) 475 Lines are adjusted so that no linebreaks occur within a commandline 476 (except matrix command line) 477 """ 478 formatted_lines = [] 479 for l in lines: 480 # Convert line endings 481 l = l.replace('\r\n', '\n').replace('\r', '\n').strip() 482 if l.lower().startswith('matrix'): 483 formatted_lines.append(l) 484 else: 485 l = l.replace('\n', ' ') 486 if l: 487 formatted_lines.append(l) 488 return formatted_lines
489 490
491 -def _replace_parenthesized_ambigs(seq, rev_ambig_values):
492 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 493 494 opening = seq.find('(') 495 while opening > -1: 496 closing = seq.find(')') 497 if closing < 0: 498 raise NexusError('Missing closing parenthesis in: ' + seq) 499 elif closing < opening: 500 raise NexusError('Missing opening parenthesis in: ' + seq) 501 ambig = ''.join(sorted(seq[opening + 1:closing])) 502 ambig_code = rev_ambig_values[ambig.upper()] 503 if ambig != ambig.upper(): 504 ambig_code = ambig_code.lower() 505 seq = seq[:opening] + ambig_code + seq[closing + 1:] 506 opening = seq.find('(') 507 return seq
508 509
510 -class Commandline(object):
511 """Represent a commandline as command and options.""" 512
513 - def __init__(self, line, title):
514 self.options = {} 515 options = [] 516 self.command = None 517 try: 518 # Assume matrix (all other command lines have been stripped of \n) 519 self.command, options = line.strip().split('\n', 1) 520 except ValueError: # Not matrix 521 # self.command,options=line.split(' ',1) # no: could be tab or spaces (translate...) 522 self.command = line.split()[0] 523 options = ' '.join(line.split()[1:]) 524 self.command = self.command.strip().lower() 525 if self.command in SPECIAL_COMMANDS: 526 # special command that need newlines and order of options preserved 527 self.options = options.strip() 528 else: 529 if len(options) > 0: 530 try: 531 options = options.replace('=', ' = ').split() 532 valued_indices = [(n - 1, n, n + 1) for n in range(len(options)) 533 if options[n] == '=' and n != 0 and n != len((options))] 534 indices = [] 535 for sl in valued_indices: 536 indices.extend(sl) 537 token_indices = [n for n in range(len(options)) if n not in indices] 538 for opt in valued_indices: 539 # self.options[options[opt[0]].lower()] = options[opt[2]].lower() 540 self.options[options[opt[0]].lower()] = options[opt[2]] 541 for token in token_indices: 542 self.options[options[token].lower()] = None 543 except ValueError: 544 raise NexusError('Incorrect formatting in line: %s' % line)
545 546
547 -class Block(object):
548 """Represent a NEXUS block with block name and list of commandlines."""
549 - def __init__(self, title=None):
550 self.title = title 551 self.commandlines = []
552 553
554 -class Nexus(object):
555
556 - def __init__(self, input=None):
557 self.ntax = 0 # number of taxa 558 self.nchar = 0 # number of characters 559 self.unaltered_taxlabels = [] # taxlabels as the appear in the input file (incl. duplicates, etc.) 560 self.taxlabels = [] # labels for taxa, ordered by their id 561 self.charlabels = None # ... and for characters 562 self.statelabels = None # ... and for states 563 self.datatype = 'dna' # (standard), dna, rna, nucleotide, protein 564 self.respectcase = False # case sensitivity 565 self.missing = '?' # symbol for missing characters 566 self.gap = '-' # symbol for gap 567 self.symbols = None # set of symbols 568 self.equate = None # set of symbol synonyms 569 self.matchchar = None # matching char for matrix representation 570 self.labels = None # left, right, no 571 self.transpose = False # whether matrix is transposed 572 self.interleave = False # whether matrix is interleaved 573 self.tokens = False # unsupported 574 self.eliminate = None # unsupported 575 self.matrix = None # ... 576 self.unknown_blocks = [] # blocks we don't care about 577 self.taxsets = {} 578 self.charsets = {} 579 self.charpartitions = {} 580 self.taxpartitions = {} 581 self.trees = [] # list of Trees (instances of Tree class) 582 self.translate = None # Dict to translate taxon <-> taxon numbers 583 self.structured = [] # structured input representation 584 self.set = {} # dict of the set command to set various options 585 self.options = {} # dict of the options command in the data block 586 self.codonposset = None # name of the charpartition that defines codon positions 587 588 # some defaults 589 self.options['gapmode'] = 'missing' 590 591 if input: 592 self.read(input) 593 else: 594 self.read(DEFAULTNEXUS)
595
596 - def get_original_taxon_order(self):
597 """Included for backwards compatibility (DEPRECATED).""" 598 return self.taxlabels
599
600 - def set_original_taxon_order(self, value):
601 """Included for backwards compatibility (DEPRECATED).""" 602 self.taxlabels = value
603 604 original_taxon_order = property(get_original_taxon_order, set_original_taxon_order) 605
606 - def read(self, input):
607 """Read and parse NEXUS input (a filename, file-handle, or string).""" 608 609 # 1. Assume we have the name of a file in the execution dir or a 610 # file-like object. 611 # Note we need to add parsing of the path to dir/filename 612 try: 613 with File.as_handle(input, 'rU') as fp: 614 file_contents = fp.read() 615 self.filename = getattr(fp, 'name', 'Unknown_nexus_file') 616 except (TypeError, IOError, AttributeError): 617 # 2 Assume we have a string from a fh.read() 618 if isinstance(input, basestring): 619 file_contents = input 620 self.filename = 'input_string' 621 else: 622 print(input.strip()[:50]) 623 raise NexusError('Unrecognized input: %s ...' % input[:100]) 624 file_contents = file_contents.strip() 625 if file_contents.startswith('#NEXUS'): 626 file_contents = file_contents[6:] 627 commandlines = _get_command_lines(file_contents) 628 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 629 for i, cl in enumerate(commandlines): 630 try: 631 if cl[:6].upper() == '#NEXUS': 632 commandlines[i] = cl[6:].strip() 633 except: 634 pass 635 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 636 nexus_block_gen = self._get_nexus_block(commandlines) 637 while True: 638 try: 639 title, contents = next(nexus_block_gen) 640 except StopIteration: 641 break 642 if title in KNOWN_NEXUS_BLOCKS: 643 self._parse_nexus_block(title, contents) 644 else: 645 self._unknown_nexus_block(title, contents)
646
647 - def _get_nexus_block(self, file_contents):
648 """Generator for looping through Nexus blocks.""" 649 inblock = False 650 blocklines = [] 651 while file_contents: 652 cl = file_contents.pop(0) 653 if cl.lower().startswith('begin'): 654 if not inblock: 655 inblock = True 656 title = cl.split()[1].lower() 657 else: 658 raise NexusError('Illegal block nesting in block %s' % title) 659 elif cl.lower().startswith('end'): 660 if inblock: 661 inblock = False 662 yield title, blocklines 663 blocklines = [] 664 else: 665 raise NexusError('Unmatched \'end\'.') 666 elif inblock: 667 blocklines.append(cl)
668
669 - def _unknown_nexus_block(self, title, contents):
670 block = Block() 671 block.commandlines.append(contents) 672 block.title = title 673 self.unknown_blocks.append(block)
674
675 - def _parse_nexus_block(self, title, contents):
676 """Parse a known Nexus Block (PRIVATE).""" 677 # attached the structered block representation 678 self._apply_block_structure(title, contents) 679 # now check for taxa,characters,data blocks. If this stuff is defined more than once 680 # the later occurences will override the previous ones. 681 block = self.structured[-1] 682 for line in block.commandlines: 683 try: 684 getattr(self, '_' + line.command)(line.options) 685 except AttributeError: 686 raise NexusError('Unknown command: %s ' % line.command)
687
688 - def _title(self, options):
689 pass
690 693
694 - def _dimensions(self, options):
695 if 'ntax' in options: 696 self.ntax = eval(options['ntax']) 697 if 'nchar' in options: 698 self.nchar = eval(options['nchar'])
699
700 - def _format(self, options):
701 # print options 702 # we first need to test respectcase, then symbols (which depends on respectcase) 703 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 704 # dicts for ambiguous values and alphabet 705 if 'respectcase' in options: 706 self.respectcase = True 707 # adjust symbols to for respectcase 708 if 'symbols' in options: 709 self.symbols = options['symbols'] 710 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 711 (self.symbols.startswith("'") and self.symbols.endswith("'")): 712 self.symbols = self.symbols[1:-1].replace(' ', '') 713 if not self.respectcase: 714 self.symbols = self.symbols.lower() + self.symbols.upper() 715 self.symbols = list(set(self.symbols)) 716 if 'datatype' in options: 717 self.datatype = options['datatype'].lower() 718 if self.datatype == 'dna' or self.datatype == 'nucleotide': 719 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 720 self.ambiguous_values = IUPACData.ambiguous_dna_values.copy() 721 self.unambiguous_letters = IUPACData.unambiguous_dna_letters 722 elif self.datatype == 'rna': 723 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 724 self.ambiguous_values = IUPACData.ambiguous_rna_values.copy() 725 self.unambiguous_letters = IUPACData.unambiguous_rna_letters 726 elif self.datatype == 'protein': 727 # TODO - Should this not be ExtendedIUPACProtein? 728 self.alphabet = IUPAC.IUPACProtein() # fresh instance 729 self.ambiguous_values = {'B': 'DN', 'Z': 'EQ', 'X': IUPACData.protein_letters} 730 # that's how PAUP handles it 731 self.unambiguous_letters = IUPACData.protein_letters + '*' # stop-codon 732 elif self.datatype == 'standard': 733 raise NexusError('Datatype standard is not yet supported.') 734 # self.alphabet = None 735 # self.ambiguous_values = {} 736 # if not self.symbols: 737 # self.symbols = '01' # if nothing else defined, then 0 and 1 are the default states 738 # self.unambiguous_letters = self.symbols 739 else: 740 raise NexusError('Unsupported datatype: ' + self.datatype) 741 self.valid_characters = ''.join(self.ambiguous_values) + self.unambiguous_letters 742 if not self.respectcase: 743 self.valid_characters = self.valid_characters.lower() + self.valid_characters.upper() 744 # we have to sort the reverse ambig coding dict key characters: 745 # to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 746 rev = dict((i[1], i[0]) for i in self.ambiguous_values.items() if i[0] != 'X') 747 self.rev_ambiguous_values = {} 748 for (k, v) in rev.items(): 749 key = sorted(c for c in k) 750 self.rev_ambiguous_values[''.join(key)] = v 751 # overwrite symbols for datype rna,dna,nucleotide 752 if self.datatype in ['dna', 'rna', 'nucleotide']: 753 self.symbols = self.alphabet.letters 754 if self.missing not in self.ambiguous_values: 755 self.ambiguous_values[self.missing] = self.unambiguous_letters + self.gap 756 self.ambiguous_values[self.gap] = self.gap 757 elif self.datatype == 'standard': 758 if not self.symbols: 759 self.symbols = ['1', '0'] 760 if 'missing' in options: 761 self.missing = options['missing'][0] 762 if 'gap' in options: 763 self.gap = options['gap'][0] 764 if 'equate' in options: 765 self.equate = options['equate'] 766 if 'matchchar' in options: 767 self.matchchar = options['matchchar'][0] 768 if 'labels' in options: 769 self.labels = options['labels'] 770 if 'transpose' in options: 771 raise NexusError('TRANSPOSE is not supported!') 772 self.transpose = True 773 if 'interleave' in options: 774 if options['interleave'] is None or options['interleave'].lower() == 'yes': 775 self.interleave = True 776 if 'tokens' in options: 777 self.tokens = True 778 if 'notokens' in options: 779 self.tokens = False
780
781 - def _set(self, options):
782 self.set = options
783
784 - def _options(self, options):
785 self.options = options
786
787 - def _eliminate(self, options):
788 self.eliminate = options
789
790 - def _taxlabels(self, options):
791 """Get taxon labels (PRIVATE). 792 793 As the taxon names are already in the matrix, this is superfluous 794 except for transpose matrices, which are currently unsupported anyway. 795 Thus, we ignore the taxlabels command to make handling of duplicate 796 taxon names easier. 797 """ 798 pass
799 # self.taxlabels = [] 800 # opts = CharBuffer(options) 801 # while True: 802 # taxon = quotestrip(opts.next_word()) 803 # if not taxon: 804 # break 805 # self.taxlabels.append(taxon) 806
807 - def _check_taxlabels(self, taxon):
808 """Check for presence of taxon in self.taxlabels.""" 809 # According to NEXUS standard, underscores shall be treated as spaces..., 810 # so checking for identity is more difficult 811 nextaxa = dict((t.replace(' ', '_'), t) for t in self.taxlabels) 812 nexid = taxon.replace(' ', '_') 813 return nextaxa.get(nexid)
814
815 - def _charlabels(self, options):
816 self.charlabels = {} 817 opts = CharBuffer(options) 818 while True: 819 # get id and state 820 w = opts.next_word() 821 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 822 break 823 identifier = self._resolve(w, set_type=CHARSET) 824 state = quotestrip(opts.next_word()) 825 self.charlabels[identifier] = state 826 # check for comma or end of command 827 c = opts.next_nonwhitespace() 828 if c is None: 829 break 830 elif c != ',': 831 raise NexusError('Missing \',\' in line %s.' % options)
832
833 - def _charstatelabels(self, options):
834 # warning: charstatelabels supports only charlabels-syntax! 835 self._charlabels(options)
836
837 - def _statelabels(self, options):
838 # self.charlabels = options 839 # print 'Command statelabels is not supported and will be ignored.' 840 pass
841
842 - def _matrix(self, options):
843 if not self.ntax or not self.nchar: 844 raise NexusError('Dimensions must be specified before matrix!') 845 self.matrix = {} 846 taxcount = 0 847 first_matrix_block = True 848 849 # eliminate empty lines and leading/trailing whitespace 850 lines = [l.strip() for l in options.split('\n') if l.strip() != ''] 851 lineiter = iter(lines) 852 while True: 853 try: 854 l = next(lineiter) 855 except StopIteration: 856 if taxcount < self.ntax: 857 raise NexusError('Not enough taxa in matrix.') 858 elif taxcount > self.ntax: 859 raise NexusError('Too many taxa in matrix.') 860 else: 861 break 862 # count the taxa and check for interleaved matrix 863 taxcount += 1 864 # print taxcount 865 if taxcount > self.ntax: 866 if not self.interleave: 867 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 868 else: 869 taxcount = 1 870 first_matrix_block = False 871 # get taxon name and sequence 872 linechars = CharBuffer(l) 873 id = quotestrip(linechars.next_word()) 874 l = linechars.rest().strip() 875 chars = '' 876 if self.interleave: 877 # interleaved matrix 878 # print 'In interleave' 879 if l: 880 chars = ''.join(l.split()) 881 else: 882 chars = ''.join(next(lineiter).split()) 883 else: 884 # non-interleaved matrix 885 chars = ''.join(l.split()) 886 while len(chars) < self.nchar: 887 l = next(lineiter) 888 chars += ''.join(l.split()) 889 iupac_seq = Seq(_replace_parenthesized_ambigs(chars, self.rev_ambiguous_values), self.alphabet) 890 # first taxon has the reference sequence if matchhar is used 891 if taxcount == 1: 892 refseq = iupac_seq 893 else: 894 if self.matchchar: 895 while True: 896 p = str(iupac_seq).find(self.matchchar) 897 if p == -1: 898 break 899 iupac_seq = Seq(str(iupac_seq)[:p] + refseq[p] + str(iupac_seq)[p + 1:], self.alphabet) 900 # check for invalid characters 901 for c in str(iupac_seq): 902 if c not in self.valid_characters and c != self.gap and c != self.missing: 903 raise NexusError("Taxon %s: Illegal character %s in sequence %s " 904 "(check dimensions/interleaving)" % (id, c, iupac_seq)) 905 # add sequence to matrix 906 if first_matrix_block: 907 self.unaltered_taxlabels.append(id) 908 id = _unique_label(list(self.matrix.keys()), id) 909 self.matrix[id] = iupac_seq 910 self.taxlabels.append(id) 911 else: 912 # taxon names need to be in the same order in each interleaved block 913 id = _unique_label(self.taxlabels[:taxcount - 1], id) 914 taxon_present = self._check_taxlabels(id) 915 if taxon_present: 916 self.matrix[taxon_present] += iupac_seq 917 else: 918 raise NexusError("Taxon %s not in first block of interleaved " 919 "matrix. Check matrix dimensions and interleave." % id) 920 # check all sequences for length according to nchar 921 for taxon in self.matrix: 922 if len(self.matrix[taxon]) != self.nchar: 923 raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s' 924 % (self.nchar, len(self.matrix[taxon]), taxon)) 925 # check that taxlabels is identical with matrix.keys. If not, it's a problem 926 matrixkeys = sorted(self.matrix) 927 taxlabelssort = sorted(self.taxlabels[:]) 928 assert matrixkeys == taxlabelssort, \ 929 "ERROR: TAXLABELS must be identical with MATRIX. " + \ 930 "Please Report this as a bug, and send in data file."
931
932 - def _translate(self, options):
933 self.translate = {} 934 opts = CharBuffer(options) 935 while True: 936 try: 937 # get id and state 938 identifier = int(opts.next_word()) 939 label = quotestrip(opts.next_word()) 940 self.translate[identifier] = label 941 # check for comma or end of command 942 c = opts.next_nonwhitespace() 943 if c is None: 944 break 945 elif c != ',': 946 raise NexusError('Missing \',\' in line %s.' % options) 947 except NexusError: 948 raise 949 except: 950 raise NexusError('Format error in line %s.' % options)
951
952 - def _utree(self, options):
953 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 954 self._tree(options)
955
956 - def _tree(self, options):
957 opts = CharBuffer(options) 958 if opts.peek_nonwhitespace() == '*': 959 # a star can be used to make it the default tree in some software packages 960 dummy = opts.next_nonwhitespace() 961 name = opts.next_word() 962 if opts.next_nonwhitespace() != '=': 963 raise NexusError('Syntax error in tree description: %s' 964 % options[:50]) 965 rooted = False 966 weight = 1.0 967 while opts.peek_nonwhitespace() == '[': 968 opts.next_nonwhitespace() # discard opening bracket 969 symbol = next(opts) 970 if symbol != '&': 971 raise NexusError('Illegal special comment [%s...] in tree description: %s' 972 % (symbol, options[:50])) 973 special = next(opts) 974 value = opts.next_until(']') 975 next(opts) # discard closing bracket 976 if special == 'R': 977 rooted = True 978 elif special == 'U': 979 rooted = False 980 elif special == 'W': 981 weight = float(value) 982 tree = Tree(name=name, weight=weight, rooted=rooted, tree=opts.rest().strip()) 983 # if there's an active translation table, translate 984 if self.translate: 985 for n in tree.get_terminals(): 986 try: 987 tree.node(n).data.taxon = safename(self.translate[int(tree.node(n).data.taxon)]) 988 except (ValueError, KeyError): 989 raise NexusError('Unable to substitute %s using \'translate\' data.' 990 % tree.node(n).data.taxon) 991 self.trees.append(tree)
992
993 - def _apply_block_structure(self, title, lines):
994 block = Block('') 995 block.title = title 996 for line in lines: 997 block.commandlines.append(Commandline(line, title)) 998 self.structured.append(block)
999
1000 - def _taxset(self, options):
1001 name, taxa = self._get_indices(options, set_type=TAXSET) 1002 self.taxsets[name] = _make_unique(taxa)
1003
1004 - def _charset(self, options):
1005 name, sites = self._get_indices(options, set_type=CHARSET) 1006 self.charsets[name] = _make_unique(sites)
1007
1008 - def _taxpartition(self, options):
1009 taxpartition = {} 1010 quotelevel = False 1011 opts = CharBuffer(options) 1012 name = self._name_n_vector(opts) 1013 if not name: 1014 raise NexusError('Formatting error in taxpartition: %s ' % options) 1015 # now collect thesubbpartitions and parse them 1016 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1017 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 1018 sub = '' 1019 while True: 1020 w = next(opts) 1021 if w is None or (w == ',' and not quotelevel): 1022 subname, subindices = self._get_indices(sub, set_type=TAXSET, separator=':') 1023 taxpartition[subname] = _make_unique(subindices) 1024 sub = '' 1025 if w is None: 1026 break 1027 else: 1028 if w == "'": 1029 quotelevel = not quotelevel 1030 sub += w 1031 self.taxpartitions[name] = taxpartition
1032
1033 - def _codonposset(self, options):
1034 """Read codon positions from a codons block as written from McClade. 1035 1036 Here codonposset is just a fancy name for a character partition with 1037 the name CodonPositions and the partitions N,1,2,3 1038 """ 1039 1040 prev_partitions = list(self.charpartitions.keys()) 1041 self._charpartition(options) 1042 # mcclade calls it CodonPositions, but you never know... 1043 codonname = [n for n in self.charpartitions if n not in prev_partitions] 1044 if codonname == [] or len(codonname) > 1: 1045 raise NexusError('Formatting Error in codonposset: %s ' % options) 1046 else: 1047 self.codonposset = codonname[0]
1048
1049 - def _codeset(self, options):
1050 pass
1051
1052 - def _charpartition(self, options):
1053 charpartition = {} 1054 quotelevel = False 1055 opts = CharBuffer(options) 1056 name = self._name_n_vector(opts) 1057 if not name: 1058 raise NexusError('Formatting error in charpartition: %s ' % options) 1059 # now collect thesubbpartitions and parse them 1060 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1061 sub = '' 1062 while True: 1063 w = next(opts) 1064 if w is None or (w == ',' and not quotelevel): 1065 subname, subindices = self._get_indices(sub, set_type=CHARSET, separator=':') 1066 charpartition[subname] = _make_unique(subindices) 1067 sub = '' 1068 if w is None: 1069 break 1070 else: 1071 if w == "'": 1072 quotelevel = not quotelevel 1073 sub += w 1074 self.charpartitions[name] = charpartition
1075
1076 - def _get_indices(self, options, set_type=CHARSET, separator='='):
1077 """Parse the taxset/charset specification (PRIVATE). 1078 1079 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1080 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1081 """ 1082 opts = CharBuffer(options) 1083 name = self._name_n_vector(opts, separator=separator) 1084 indices = self._parse_list(opts, set_type=set_type) 1085 if indices is None: 1086 raise NexusError('Formatting error in line: %s ' % options) 1087 return name, indices
1088
1089 - def _name_n_vector(self, opts, separator='='):
1090 """Extract name and check that it's not in vector format.""" 1091 rest = opts.rest() 1092 name = opts.next_word() 1093 # we ignore * before names 1094 if name == '*': 1095 name = opts.next_word() 1096 if not name: 1097 raise NexusError('Formatting error in line: %s ' % rest) 1098 name = quotestrip(name) 1099 if opts.peek_nonwhitespace == '(': 1100 open = opts.next_nonwhitespace() 1101 qualifier = open.next_word() 1102 close = opts.next_nonwhitespace() 1103 if qualifier.lower() == 'vector': 1104 raise NexusError('Unsupported VECTOR format in line %s' 1105 % (opts)) 1106 elif qualifier.lower() != 'standard': 1107 raise NexusError('Unknown qualifier %s in line %s' 1108 % (qualifier, opts)) 1109 if opts.next_nonwhitespace() != separator: 1110 raise NexusError('Formatting error in line: %s ' % rest) 1111 return name
1112
1113 - def _parse_list(self, options_buffer, set_type):
1114 """Parse a NEXUS list (PRIVATE). 1115 1116 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1117 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1118 """ 1119 plain_list = [] 1120 if options_buffer.peek_nonwhitespace(): 1121 try: 1122 # capture all possible exceptions and treat them as formatting 1123 # errors, if they are not NexusError 1124 while True: 1125 identifier = options_buffer.next_word() # next list element 1126 if not identifier: # end of list? 1127 break 1128 start = self._resolve(identifier, set_type=set_type) 1129 if options_buffer.peek_nonwhitespace() == '-': # followd by - 1130 end = start 1131 step = 1 1132 # get hyphen and end of range 1133 hyphen = options_buffer.next_nonwhitespace() 1134 end = self._resolve(options_buffer.next_word(), set_type=set_type) 1135 if set_type == CHARSET: 1136 if options_buffer.peek_nonwhitespace() == '\\': # followd by \ 1137 backslash = options_buffer.next_nonwhitespace() 1138 step = int(options_buffer.next_word()) # get backslash and step 1139 plain_list.extend(range(start, end + 1, step)) 1140 else: 1141 if isinstance(start, list) or isinstance(end, list): 1142 raise NexusError('Name if character sets not allowed in range definition: %s' 1143 % identifier) 1144 start = self.taxlabels.index(start) 1145 end = self.taxlabels.index(end) 1146 taxrange = self.taxlabels[start:end + 1] 1147 plain_list.extend(taxrange) 1148 else: 1149 if isinstance(start, list): # start was the name of charset or taxset 1150 plain_list.extend(start) 1151 else: # start was an ordinary identifier 1152 plain_list.append(start) 1153 except NexusError: 1154 raise 1155 except: 1156 return None 1157 return plain_list
1158
1159 - def _resolve(self, identifier, set_type=None):
1160 """Translate identifier in list into character/taxon index. 1161 1162 Characters (which are referred to by their index in Nexus.py): 1163 Plain numbers are returned minus 1 (Nexus indices to python indices) 1164 Text identifiers are translated into their indices (if plain character identifiers), 1165 the first hit in charlabels is returned (charlabels don't need to be unique) 1166 or the range of indices is returned (if names of character sets). 1167 Taxa (which are referred to by their unique name in Nexus.py): 1168 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1169 Names are returned unchanged (if plain taxon identifiers), or the names in 1170 the corresponding taxon set is returned. 1171 """ 1172 identifier = quotestrip(identifier) 1173 if not set_type: 1174 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1175 if set_type == CHARSET: 1176 try: 1177 n = int(identifier) 1178 except ValueError: 1179 if self.charlabels and identifier in self.charlabels.values(): 1180 for k in self.charlabels: 1181 if self.charlabels[k] == identifier: 1182 return k 1183 elif self.charsets and identifier in self.charsets: 1184 return self.charsets[identifier] 1185 else: 1186 raise NexusError('Unknown character identifier: %s' 1187 % identifier) 1188 else: 1189 if n <= self.nchar: 1190 return n - 1 1191 else: 1192 raise NexusError('Illegal character identifier: %d>nchar (=%d).' 1193 % (identifier, self.nchar)) 1194 elif set_type == TAXSET: 1195 try: 1196 n = int(identifier) 1197 except ValueError: 1198 taxlabels_id = self._check_taxlabels(identifier) 1199 if taxlabels_id: 1200 return taxlabels_id 1201 elif self.taxsets and identifier in self.taxsets: 1202 return self.taxsets[identifier] 1203 else: 1204 raise NexusError('Unknown taxon identifier: %s' 1205 % identifier) 1206 else: 1207 if n > 0 and n <= self.ntax: 1208 return self.taxlabels[n - 1] 1209 else: 1210 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' 1211 % (identifier, self.ntax)) 1212 else: 1213 raise NexusError('Unknown set specification: %s.' % set_type)
1214
1215 - def _stateset(self, options):
1216 # Not implemented 1217 pass
1218
1219 - def _changeset(self, options):
1220 # Not implemented 1221 pass
1222
1223 - def _treeset(self, options):
1224 # Not implemented 1225 pass
1226
1227 - def _treepartition(self, options):
1228 # Not implemented 1229 pass
1230
1231 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, 1232 interleave=False, exclude=[], delete=[], 1233 charpartition=None, comment='', mrbayes=False):
1234 """Writes a nexus file for each partition in charpartition. 1235 1236 Only non-excluded characters and non-deleted taxa are included, 1237 just the data block is written. 1238 """ 1239 1240 if not matrix: 1241 matrix = self.matrix 1242 if not matrix: 1243 return 1244 if not filename: 1245 filename = self.filename 1246 if charpartition: 1247 pfilenames = {} 1248 for p in charpartition: 1249 total_exclude = [] + exclude 1250 total_exclude.extend(c for c in range(self.nchar) if c not in charpartition[p]) 1251 total_exclude = _make_unique(total_exclude) 1252 pcomment = comment + '\nPartition: ' + p + '\n' 1253 dot = filename.rfind('.') 1254 if dot > 0: 1255 pfilename = filename[:dot] + '_' + p + '.data' 1256 else: 1257 pfilename = filename + '_' + p 1258 pfilenames[p] = pfilename 1259 self.write_nexus_data(filename=pfilename, matrix=matrix, blocksize=blocksize, 1260 interleave=interleave, exclude=total_exclude, delete=delete, 1261 comment=pcomment, append_sets=False, mrbayes=mrbayes) 1262 return pfilenames 1263 else: 1264 fn = self.filename + '.data' 1265 self.write_nexus_data(filename=fn, matrix=matrix, blocksize=blocksize, 1266 interleave=interleave, exclude=exclude, delete=delete, 1267 comment=comment, append_sets=False, mrbayes=mrbayes) 1268 return fn
1269
1270 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[], 1271 blocksize=None, interleave=False, interleave_by_partition=False, 1272 comment=None, omit_NEXUS=False, append_sets=True, mrbayes=False, 1273 codons_block=True):
1274 """Writes a nexus file with data and sets block to a file or handle. 1275 1276 Character sets and partitions are appended by default, and are 1277 adjusted according to excluded characters (i.e. character sets 1278 still point to the same sites (not necessarily same positions), 1279 without including the deleted characters. 1280 1281 - filename - Either a filename as a string (which will be opened, 1282 written to and closed), or a handle object (which will 1283 be written to but NOT closed). 1284 - interleave_by_partition - Optional name of partition (string) 1285 - omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1286 start of the file is omitted. 1287 1288 Returns the filename/handle used to write the data. 1289 """ 1290 if not matrix: 1291 matrix = self.matrix 1292 if not matrix: 1293 return 1294 if not filename: 1295 filename = self.filename 1296 if [t for t in delete if not self._check_taxlabels(t)]: 1297 raise NexusError('Unknown taxa: %s' 1298 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1299 if interleave_by_partition: 1300 if interleave_by_partition not in self.charpartitions: 1301 raise NexusError('Unknown partition: %r' % interleave_by_partition) 1302 else: 1303 partition = self.charpartitions[interleave_by_partition] 1304 # we need to sort the partition names by starting position before we exclude characters 1305 names = _sort_keys_by_values(partition) 1306 newpartition = {} 1307 for p in partition: 1308 newpartition[p] = [c for c in partition[p] if c not in exclude] 1309 # how many taxa and how many characters are left? 1310 undelete = [taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1311 cropped_matrix = _seqmatrix2strmatrix(self.crop_matrix(matrix, exclude=exclude, delete=delete)) 1312 ntax_adjusted = len(undelete) 1313 nchar_adjusted = len(cropped_matrix[undelete[0]]) 1314 if not undelete or (undelete and undelete[0] == ''): 1315 return 1316 1317 with File.as_handle(filename, mode='w') as fh: 1318 if not omit_NEXUS: 1319 fh.write('#NEXUS\n') 1320 if comment: 1321 fh.write('[' + comment + ']\n') 1322 fh.write('begin data;\n') 1323 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1324 fh.write('\tformat datatype=' + self.datatype) 1325 if self.respectcase: 1326 fh.write(' respectcase') 1327 if self.missing: 1328 fh.write(' missing=' + self.missing) 1329 if self.gap: 1330 fh.write(' gap=' + self.gap) 1331 if self.matchchar: 1332 fh.write(' matchchar=' + self.matchchar) 1333 if self.labels: 1334 fh.write(' labels=' + self.labels) 1335 if self.equate: 1336 fh.write(' equate=' + self.equate) 1337 if interleave or interleave_by_partition: 1338 fh.write(' interleave') 1339 fh.write(';\n') 1340 # if self.taxlabels: 1341 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1342 if self.charlabels: 1343 newcharlabels = self._adjust_charlabels(exclude=exclude) 1344 clkeys = sorted(newcharlabels) 1345 fh.write('charlabels ' 1346 + ', '.join("%s %s" % (k + 1, safename(newcharlabels[k])) for k in clkeys) 1347 + ';\n') 1348 fh.write('matrix\n') 1349 if not blocksize: 1350 if interleave: 1351 blocksize = 70 1352 else: 1353 blocksize = self.nchar 1354 # delete deleted taxa and ecxclude excluded characters... 1355 namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete) 1356 if interleave_by_partition: 1357 # interleave by partitions, but adjust partitions with regard to excluded characters 1358 seek = 0 1359 for p in names: 1360 fh.write('[%s: %s]\n' % (interleave_by_partition, p)) 1361 if len(newpartition[p]) > 0: 1362 for taxon in undelete: 1363 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1364 fh.write(cropped_matrix[taxon][seek:seek + len(newpartition[p])] + '\n') 1365 fh.write('\n') 1366 else: 1367 fh.write('[empty]\n\n') 1368 seek += len(newpartition[p]) 1369 elif interleave: 1370 for seek in range(0, nchar_adjusted, blocksize): 1371 for taxon in undelete: 1372 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1373 fh.write(cropped_matrix[taxon][seek:seek + blocksize] + '\n') 1374 fh.write('\n') 1375 else: 1376 for taxon in undelete: 1377 if blocksize < nchar_adjusted: 1378 fh.write(safename(taxon, mrbayes=mrbayes) + '\n') 1379 else: 1380 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1381 taxon_seq = cropped_matrix[taxon] 1382 for seek in range(0, nchar_adjusted, blocksize): 1383 fh.write(taxon_seq[seek:seek + blocksize] + '\n') 1384 del taxon_seq 1385 fh.write(';\nend;\n') 1386 if append_sets: 1387 if codons_block: 1388 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, include_codons=False)) 1389 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, codons_only=True)) 1390 else: 1391 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes)) 1392 return filename
1393
1394 - def append_sets(self, exclude=[], delete=[], mrbayes=False, include_codons=True, codons_only=False):
1395 """Returns a sets block.""" 1396 if not self.charsets and not self.taxsets and not self.charpartitions: 1397 return '' 1398 if codons_only: 1399 setsb = ['\nbegin codons'] 1400 else: 1401 setsb = ['\nbegin sets'] 1402 # - now if characters have been excluded, the character sets need to be adjusted, 1403 # so that they still point to the right character positions 1404 # calculate a list of offsets: for each deleted character, the following character position 1405 # in the new file will have an additional offset of -1 1406 offset = 0 1407 offlist = [] 1408 for c in range(self.nchar): 1409 if c in exclude: 1410 offset += 1 1411 offlist.append(-1) # dummy value as these character positions are excluded 1412 else: 1413 offlist.append(c - offset) 1414 # now adjust each of the character sets 1415 if not codons_only: 1416 for n, ns in self.charsets.items(): 1417 cset = [offlist[c] for c in ns if c not in exclude] 1418 if cset: 1419 setsb.append('charset %s = %s' % (safename(n), _compact4nexus(cset))) 1420 for n, s in self.taxsets.items(): 1421 tset = [safename(t, mrbayes=mrbayes) for t in s if t not in delete] 1422 if tset: 1423 setsb.append('taxset %s = %s' % (safename(n), ' '.join(tset))) 1424 for n, p in self.charpartitions.items(): 1425 if not include_codons and n == CODONPOSITIONS: 1426 continue 1427 elif codons_only and n != CODONPOSITIONS: 1428 continue 1429 # as characters have been excluded, the partitions must be adjusted 1430 # if a partition is empty, it will be omitted from the charpartition command 1431 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1432 names = _sort_keys_by_values(p) 1433 newpartition = {} 1434 for sn in names: 1435 nsp = [offlist[c] for c in p[sn] if c not in exclude] 1436 if nsp: 1437 newpartition[sn] = nsp 1438 if newpartition: 1439 if include_codons and n == CODONPOSITIONS: 1440 command = 'codonposset' 1441 else: 1442 command = 'charpartition' 1443 setsb.append('%s %s = %s' % (command, safename(n), 1444 ', '.join('%s: %s' % (sn, _compact4nexus(newpartition[sn])) 1445 for sn in names if sn in newpartition))) 1446 # now write charpartititions, much easier than charpartitions 1447 for n, p in self.taxpartitions.items(): 1448 names = _sort_keys_by_values(p) 1449 newpartition = {} 1450 for sn in names: 1451 nsp = [t for t in p[sn] if t not in delete] 1452 if nsp: 1453 newpartition[sn] = nsp 1454 if newpartition: 1455 setsb.append('taxpartition %s = %s' % (safename(n), 1456 ', '.join('%s: %s' % (safename(sn), 1457 ' '.join(safename(x) for x in newpartition[sn])) 1458 for sn in names if sn in newpartition))) 1459 # add 'end' and return everything 1460 setsb.append('end;\n') 1461 if len(setsb) == 2: # begin and end only 1462 return '' 1463 else: 1464 return ';\n'.join(setsb)
1465
1466 - def export_fasta(self, filename=None, width=70):
1467 """Writes matrix into a fasta file.""" 1468 if not filename: 1469 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1470 filename = '.'.join(self.filename.split('.')[:-1]) + '.fas' 1471 else: 1472 filename = self.filename + '.fas' 1473 with open(filename, 'w') as fh: 1474 for taxon in self.taxlabels: 1475 fh.write('>' + safename(taxon) + '\n') 1476 for i in range(0, len(str(self.matrix[taxon])), width): 1477 fh.write(str(self.matrix[taxon])[i:i + width] + '\n') 1478 return filename
1479
1480 - def export_phylip(self, filename=None):
1481 """Writes matrix into a PHYLIP file. 1482 1483 Note that this writes a relaxed PHYLIP format file, where the names 1484 are not truncated, nor checked for invalid characters.""" 1485 if not filename: 1486 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1487 filename = '.'.join(self.filename.split('.')[:-1]) + '.phy' 1488 else: 1489 filename = self.filename + '.phy' 1490 with open(filename, 'w') as fh: 1491 fh.write('%d %d\n' % (self.ntax, self.nchar)) 1492 for taxon in self.taxlabels: 1493 fh.write('%s %s\n' % (safename(taxon), str(self.matrix[taxon]))) 1494 return filename
1495
1496 - def constant(self, matrix=None, delete=[], exclude=[]):
1497 """Return a list with all constant characters.""" 1498 if not matrix: 1499 matrix = self.matrix 1500 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1501 if not undelete: 1502 return None 1503 elif len(undelete) == 1: 1504 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1505 # get the first sequence and expand all ambiguous values 1506 constant = [(x, self.ambiguous_values.get(n.upper(), n.upper())) for 1507 x, n in enumerate(str(matrix[undelete[0]])) if x not in exclude] 1508 1509 for taxon in undelete[1:]: 1510 newconstant = [] 1511 for site in constant: 1512 # print '%d (paup=%d)' % (site[0],site[0]+1), 1513 seqsite = matrix[taxon][site[0]].upper() 1514 # print seqsite,'checked against',site[1],'\t', 1515 if seqsite == self.missing \ 1516 or (seqsite == self.gap and self.options['gapmode'].lower() == 'missing') \ 1517 or seqsite == site[1]: 1518 # missing or same as before -> ok 1519 newconstant.append(site) 1520 elif seqsite in site[1] \ 1521 or site[1] == self.missing \ 1522 or (self.options['gapmode'].lower() == 'missing' and site[1] == self.gap): 1523 # subset of an ambig or only missing in previous -> take subset 1524 newconstant.append((site[0], self.ambiguous_values.get(seqsite, seqsite))) 1525 elif seqsite in self.ambiguous_values: 1526 # is it an ambig: check the intersection with prev. values 1527 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1])) 1528 if intersect: 1529 newconstant.append((site[0], ''.join(intersect))) 1530 # print 'ok' 1531 # else: 1532 # print 'failed' 1533 # else: 1534 # print 'failed' 1535 constant = newconstant 1536 cpos = [s[0] for s in constant] 1537 return cpos
1538
1539 - def cstatus(self, site, delete=[], narrow=True):
1540 """Summarize character. 1541 1542 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1543 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1544 """ 1545 undelete = [t for t in self.taxlabels if t not in delete] 1546 if not undelete: 1547 return None 1548 cstatus = [] 1549 for t in undelete: 1550 c = self.matrix[t][