Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   7  """Nexus class. Parse the contents of a NEXUS file. 
   8   
   9  Based upon 'NEXUS: An extensible file format for systematic information' 
  10  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  11  """ 
  12  from __future__ import print_function 
  13   
  14  from Bio._py3k import zip 
  15  from Bio._py3k import range 
  16  from Bio._py3k import basestring 
  17   
  18  from functools import reduce 
  19  import copy 
  20  import math 
  21  import random 
  22  import sys 
  23   
  24  from Bio import File 
  25  from Bio.Alphabet import IUPAC 
  26  from Bio.Data import IUPACData 
  27  from Bio.Seq import Seq 
  28   
  29  from .Trees import Tree 
  30   
  31  INTERLEAVE = 70 
  32  SPECIAL_COMMANDS = ['charstatelabels', 'charlabels', 'taxlabels', 'taxset', 
  33                      'charset', 'charpartition', 'taxpartition', 'matrix', 
  34                      'tree', 'utree', 'translate', 'codonposset', 'title'] 
  35  KNOWN_NEXUS_BLOCKS = ['trees', 'data', 'characters', 'taxa', 'sets', 'codons'] 
  36  PUNCTUATION = '()[]{}/\,;:=*\'"`+-<>' 
  37  MRBAYESSAFE = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  38  WHITESPACE = ' \t\n' 
  39  #SPECIALCOMMENTS = ['!','&','%','/','\\','@'] #original list of special comments 
  40  SPECIALCOMMENTS = ['&'] # supported special comment ('tree' command), all others are ignored 
  41  CHARSET = 'chars' 
  42  TAXSET = 'taxa' 
  43  CODONPOSITIONS = 'codonpositions' 
  44  DEFAULTNEXUS = '#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
  45   
  46   
47 -class NexusError(Exception):
48 pass
49 50
51 -class CharBuffer(object):
52 """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE). 53 54 This class is not intended for public use (any more). 55 """
56 - def __init__(self, string):
57 if string: 58 self.buffer = list(string) 59 else: 60 self.buffer = []
61
62 - def peek(self):
63 if self.buffer: 64 return self.buffer[0] 65 else: 66 return None
67
68 - def peek_nonwhitespace(self):
69 b = ''.join(self.buffer).strip() 70 if b: 71 return b[0] 72 else: 73 return None
74
75 - def __next__(self):
76 if self.buffer: 77 return self.buffer.pop(0) 78 else: 79 return None
80 81 if sys.version_info[0] < 3:
82 - def next(self):
83 """Deprecated Python 2 style alias for Python 3 style __next__ method.""" 84 return self.__next__()
85
86 - def next_nonwhitespace(self):
87 while True: 88 p = next(self) 89 if p is None: 90 break 91 if p not in WHITESPACE: 92 return p 93 return None
94
95 - def skip_whitespace(self):
96 while self.buffer[0] in WHITESPACE: 97 self.buffer = self.buffer[1:]
98
99 - def next_until(self, target):
100 for t in target: 101 try: 102 pos = self.buffer.index(t) 103 except ValueError: 104 pass 105 else: 106 found = ''.join(self.buffer[:pos]) 107 self.buffer = self.buffer[pos:] 108 return found 109 else: 110 return None
111
112 - def peek_word(self, word):
113 return ''.join(self.buffer[:len(word)]) == word
114
115 - def next_word(self):
116 """Return the next NEXUS word from a string. 117 118 This deals with single and double quotes, whitespace and punctuation. 119 """ 120 121 word = [] 122 quoted = False 123 first = self.next_nonwhitespace() # get first character 124 if not first: # return empty if only whitespace left 125 return None 126 word.append(first) 127 if first == "'": # word starts with a quote 128 quoted = "'" 129 elif first == '"': 130 quoted = '"' 131 elif first in PUNCTUATION: # if it's punctuation, return immediately 132 return first 133 while True: 134 c = self.peek() 135 if c == quoted: # a quote? 136 word.append(next(self)) # store quote 137 if self.peek() == quoted: # double quote 138 skip = next(self) # skip second quote 139 elif quoted: # second single quote ends word 140 break 141 elif quoted: 142 word.append(next(self)) # if quoted, then add anything 143 elif not c or c in PUNCTUATION or c in WHITESPACE: 144 # if not quoted and special character, stop 145 break 146 else: 147 word.append(next(self)) # standard character 148 return ''.join(word)
149
150 - def rest(self):
151 """Return the rest of the string without parsing.""" 152 return ''.join(self.buffer)
153 154
155 -class StepMatrix(object):
156 """Calculate a stepmatrix for weighted parsimony. 157 158 See Wheeler (1990), Cladistics 6:269-275. 159 """ 160
161 - def __init__(self, symbols, gap):
162 self.data = {} 163 self.symbols = sorted(symbols) 164 if gap: 165 self.symbols.append(gap) 166 for x in self.symbols: 167 for y in [s for s in self.symbols if s != x]: 168 self.set(x, y, 0)
169
170 - def set(self, x, y, value):
171 if x > y: 172 x, y = y, x 173 self.data[x + y] = value
174
175 - def add(self, x, y, value):
176 if x > y: 177 x, y = y, x 178 self.data[x + y] += value
179
180 - def sum(self):
181 return reduce(lambda x, y:x+y, self.data.values())
182
183 - def transformation(self):
184 total = self.sum() 185 if total != 0: 186 for k in self.data: 187 self.data[k] = self.data[k] / float(total) 188 return self
189
190 - def weighting(self):
191 for k in self.data: 192 if self.data[k] != 0: 193 self.data[k] = -math.log(self.data[k]) 194 return self
195
196 - def smprint(self, name='your_name_here'):
197 matrix = 'usertype %s stepmatrix=%d\n' % (name, len(self.symbols)) 198 matrix += ' %s\n' % ' '.join(self.symbols) 199 for x in self.symbols: 200 matrix += '[%s]'.ljust(8) % x 201 for y in self.symbols: 202 if x == y: 203 matrix += ' . ' 204 else: 205 if x > y: 206 x1, y1 = y, x 207 else: 208 x1, y1 = x, y 209 if self.data[x1 + y1] == 0: 210 matrix += 'inf. ' 211 else: 212 matrix += '%2.2f'.ljust(10) % (self.data[x1 + y1]) 213 matrix += '\n' 214 matrix += ';\n' 215 return matrix
216 217
218 -def safename(name, mrbayes=False):
219 """Return a taxon identifier according to NEXUS standard. 220 221 Wrap quotes around names with punctuation or whitespace, and double 222 single quotes. 223 224 mrbayes=True: write names without quotes, whitespace or punctuation 225 for the mrbayes software package. 226 """ 227 if mrbayes: 228 safe = name.replace(' ', '_') 229 safe = ''.join(c for c in safe if c in MRBAYESSAFE) 230 else: 231 safe = name.replace("'", "''") 232 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)): 233 safe = "'" + safe + "'" 234 return safe
235 236
237 -def quotestrip(word):
238 """Remove quotes and/or double quotes around identifiers.""" 239 if not word: 240 return None 241 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 242 word = word[1:-1] 243 return word
244 245
246 -def get_start_end(sequence, skiplist=['-', '?']):
247 """Return position of first and last character which is not in skiplist. 248 249 Skiplist defaults to ['-','?']).""" 250 251 length = len(sequence) 252 if length == 0: 253 return None, None 254 end = length - 1 255 while end >= 0 and (sequence[end] in skiplist): 256 end -= 1 257 start = 0 258 while start < length and (sequence[start] in skiplist): 259 start += 1 260 if start == length and end == -1: # empty sequence 261 return -1, -1 262 else: 263 return start, end
264 265
266 -def _sort_keys_by_values(p):
267 """Returns a sorted list of keys of p sorted by values of p.""" 268 return sorted((pn for pn in p if p[pn]), key = lambda pn: p[pn])
269 270
271 -def _make_unique(l):
272 """Check that all values in list are unique and return a pruned and sorted list.""" 273 return sorted(set(l))
274 275
276 -def _unique_label(previous_labels, label):
277 """Returns a unique name if label is already in previous_labels.""" 278 while label in previous_labels: 279 if label.split('.')[-1].startswith('copy'): 280 label = '.'.join(label.split('.')[:-1]) \ 281 + '.copy' + str(eval('0'+label.split('.')[-1][4:])+1) 282 else: 283 label += '.copy' 284 return label
285 286
287 -def _seqmatrix2strmatrix(matrix):
288 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 289 return dict((t, str(matrix[t])) for t in matrix)
290 291
292 -def _compact4nexus(orig_list):
293 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 294 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 295 """ 296 297 if not orig_list: 298 return '' 299 orig_list = sorted(set(orig_list)) 300 shortlist = [] 301 clist = orig_list[:] 302 clist.append(clist[-1] + .5) # dummy value makes it easier 303 while len(clist) > 1: 304 step = 1 305 for i, x in enumerate(clist): 306 if x == clist[0] + i*step: # are we still in the right step? 307 continue 308 elif i == 1 and len(clist) > 3 and clist[i+1] - x == x - clist[0]: 309 # second element, and possibly at least 3 elements to link, 310 # and the next one is in the right step 311 step = x - clist[0] 312 else: # pattern broke, add all values before current position to new list 313 sub = clist[:i] 314 if len(sub) == 1: 315 shortlist.append(str(sub[0]+1)) 316 else: 317 if step == 1: 318 shortlist.append('%d-%d' % (sub[0]+1, sub[-1]+1)) 319 else: 320 shortlist.append('%d-%d\\%d' % (sub[0]+1, sub[-1]+1, step)) 321 clist = clist[i:] 322 break 323 return ' '.join(shortlist)
324 325
326 -def combine(matrices):
327 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 328 329 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 330 Character sets, character partitions and taxon sets are prefixed, readjusted 331 and present in the combined matrix. 332 """ 333 334 if not matrices: 335 return None 336 name = matrices[0][0] 337 combined = copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 338 mixed_datatypes = (len(set(n[1].datatype for n in matrices)) > 1) 339 if mixed_datatypes: 340 # dealing with mixed matrices is application specific. 341 # You take care of that yourself! 342 combined.datatype = 'None' 343 # raise NexusError('Matrices must be of same datatype') 344 combined.charlabels = None 345 combined.statelabels = None 346 combined.interleave = False 347 combined.translate = None 348 349 # rename taxon sets and character sets and name them with prefix 350 for cn, cs in combined.charsets.items(): 351 combined.charsets['%s.%s' % (name, cn)]=cs 352 del combined.charsets[cn] 353 for tn, ts in combined.taxsets.items(): 354 combined.taxsets['%s.%s' % (name, tn)]=ts 355 del combined.taxsets[tn] 356 # previous partitions usually don't make much sense in combined matrix 357 # just initiate one new partition parted by single matrices 358 combined.charpartitions = {'combined':{name:list(range(combined.nchar))}} 359 for n, m in matrices[1:]: # add all other matrices 360 both = [t for t in combined.taxlabels if t in m.taxlabels] 361 combined_only = [t for t in combined.taxlabels if t not in both] 362 m_only = [t for t in m.taxlabels if t not in both] 363 for t in both: 364 # concatenate sequences and unify gap and missing character symbols 365 combined.matrix[t] += Seq(str(m.matrix[t]).replace(m.gap, combined.gap).replace(m.missing, combined.missing), combined.alphabet) 366 # replace date of missing taxa with symbol for missing data 367 for t in combined_only: 368 combined.matrix[t] += Seq(combined.missing*m.nchar, combined.alphabet) 369 for t in m_only: 370 combined.matrix[t] = Seq(combined.missing*combined.nchar, combined.alphabet) + \ 371 Seq(str(m.matrix[t]).replace(m.gap, combined.gap).replace(m.missing, combined.missing), combined.alphabet) 372 combined.taxlabels.extend(m_only) # new taxon list 373 for cn, cs in m.charsets.items(): # adjust character sets for new matrix 374 combined.charsets['%s.%s' % (n, cn)] = [x+combined.nchar for x in cs] 375 if m.taxsets: 376 if not combined.taxsets: 377 combined.taxsets = {} 378 # update taxon sets 379 combined.taxsets.update(dict(('%s.%s' % (n, tn), ts) 380 for tn, ts in m.taxsets.items())) 381 # update new charpartition 382 combined.charpartitions['combined'][n] = list(range(combined.nchar, combined.nchar+m.nchar)) 383 # update charlabels 384 if m.charlabels: 385 if not combined.charlabels: 386 combined.charlabels = {} 387 combined.charlabels.update(dict((combined.nchar + i, label) 388 for (i, label) in m.charlabels.items())) 389 combined.nchar += m.nchar # update nchar and ntax 390 combined.ntax += len(m_only) 391 392 # some prefer partitions, some charsets: 393 # make separate charset for ecah initial dataset 394 for c in combined.charpartitions['combined']: 395 combined.charsets[c] = combined.charpartitions['combined'][c] 396 397 return combined
398 399
400 -def _kill_comments_and_break_lines(text):
401 """Delete []-delimited comments out of a file and break into lines separated by ';'. 402 403 stripped_text=_kill_comments_and_break_lines(text): 404 Nested and multiline comments are allowed. [ and ] symbols within single 405 or double quotes are ignored, newline ends a quote, all symbols with quotes are 406 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 407 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 408 Quotes inside special [& and [\ are treated as normal characters, 409 but no nesting inside these special comments allowed (like [& [\ ]]). 410 ';' ist deleted from end of line. 411 412 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 413 """ 414 contents = iter(text) 415 newtext = [] 416 newline = [] 417 quotelevel = '' 418 speciallevel = False 419 commlevel = 0 420 #Parse with one character look ahead (for special comments) 421 t2 = next(contents) 422 while True: 423 t = t2 424 try: 425 t2 = next(contents) 426 except StopIteration: 427 t2 = None 428 if t is None: 429 break 430 if t == quotelevel and not (commlevel or speciallevel): 431 # matching quote ends quotation 432 quotelevel = '' 433 elif not quotelevel and not (commlevel or speciallevel) and (t == '"' or t == "'"): 434 # single or double quote starts quotation 435 quotelevel=t 436 elif not quotelevel and t == '[': 437 # opening bracket outside a quote 438 if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel: 439 speciallevel = True 440 else: 441 commlevel += 1 442 elif not quotelevel and t == ']': 443 # closing bracket ioutside a quote 444 if speciallevel: 445 speciallevel = False 446 else: 447 commlevel -= 1 448 if commlevel < 0: 449 raise NexusError('Nexus formatting error: unmatched ]') 450 continue 451 if commlevel == 0: 452 # copy if we're not in comment 453 if t == ';' and not quotelevel: 454 newtext.append(''.join(newline)) 455 newline=[] 456 else: 457 newline.append(t) 458 #level of comments should be 0 at the end of the file 459 if newline: 460 newtext.append('\n'.join(newline)) 461 if commlevel > 0: 462 raise NexusError('Nexus formatting error: unmatched [') 463 return newtext
464 465
466 -def _adjust_lines(lines):
467 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 468 469 list_of_commandlines=_adjust_lines(input_text) 470 Lines are adjusted so that no linebreaks occur within a commandline 471 (except matrix command line) 472 """ 473 formatted_lines = [] 474 for l in lines: 475 #Convert line endings 476 l = l.replace('\r\n', '\n').replace('\r', '\n').strip() 477 if l.lower().startswith('matrix'): 478 formatted_lines.append(l) 479 else: 480 l = l.replace('\n', ' ') 481 if l: 482 formatted_lines.append(l) 483 return formatted_lines
484 485
486 -def _replace_parenthesized_ambigs(seq, rev_ambig_values):
487 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 488 489 opening = seq.find('(') 490 while opening > -1: 491 closing = seq.find(')') 492 if closing < 0: 493 raise NexusError('Missing closing parenthesis in: '+seq) 494 elif closing < opening: 495 raise NexusError('Missing opening parenthesis in: '+seq) 496 ambig = ''.join(sorted(seq[opening+1:closing])) 497 ambig_code = rev_ambig_values[ambig.upper()] 498 if ambig != ambig.upper(): 499 ambig_code = ambig_code.lower() 500 seq = seq[:opening] + ambig_code + seq[closing+1:] 501 opening = seq.find('(') 502 return seq
503 504
505 -class Commandline(object):
506 """Represent a commandline as command and options.""" 507
508 - def __init__(self, line, title):
509 self.options = {} 510 options = [] 511 self.command = None 512 try: 513 #Assume matrix (all other command lines have been stripped of \n) 514 self.command, options = line.strip().split('\n', 1) 515 except ValueError: # Not matrix 516 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 517 self.command = line.split()[0] 518 options=' '.join(line.split()[1:]) 519 self.command = self.command.strip().lower() 520 if self.command in SPECIAL_COMMANDS: 521 # special command that need newlines and order of options preserved 522 self.options = options.strip() 523 else: 524 if len(options) > 0: 525 try: 526 options = options.replace('=', ' = ').split() 527 valued_indices = [(n-1, n, n+1) for n in range(len(options)) 528 if options[n] == '=' and n != 0 and n != len((options))] 529 indices = [] 530 for sl in valued_indices: 531 indices.extend(sl) 532 token_indices = [n for n in range(len(options)) if n not in indices] 533 for opt in valued_indices: 534 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 535 self.options[options[opt[0]].lower()] = options[opt[2]] 536 for token in token_indices: 537 self.options[options[token].lower()] = None 538 except ValueError: 539 raise NexusError('Incorrect formatting in line: %s' % line)
540 541
542 -class Block(object):
543 """Represent a NEXUS block with block name and list of commandlines."""
544 - def __init__(self, title=None):
545 self.title = title 546 self.commandlines = []
547 548
549 -class Nexus(object):
550
551 - def __init__(self, input=None):
552 self.ntax = 0 # number of taxa 553 self.nchar = 0 # number of characters 554 self.unaltered_taxlabels = [] # taxlabels as the appear in the input file (incl. duplicates, etc.) 555 self.taxlabels = [] # labels for taxa, ordered by their id 556 self.charlabels = None # ... and for characters 557 self.statelabels = None # ... and for states 558 self.datatype = 'dna' # (standard), dna, rna, nucleotide, protein 559 self.respectcase = False # case sensitivity 560 self.missing = '?' # symbol for missing characters 561 self.gap = '-' # symbol for gap 562 self.symbols = None # set of symbols 563 self.equate = None # set of symbol synonyms 564 self.matchchar = None # matching char for matrix representation 565 self.labels = None # left, right, no 566 self.transpose = False # whether matrix is transposed 567 self.interleave = False # whether matrix is interleaved 568 self.tokens = False # unsupported 569 self.eliminate = None # unsupported 570 self.matrix = None # ... 571 self.unknown_blocks = [] # blocks we don't care about 572 self.taxsets = {} 573 self.charsets = {} 574 self.charpartitions = {} 575 self.taxpartitions = {} 576 self.trees = [] # list of Trees (instances of Tree class) 577 self.translate = None # Dict to translate taxon <-> taxon numbers 578 self.structured = [] # structured input representation 579 self.set = {} # dict of the set command to set various options 580 self.options = {} # dict of the options command in the data block 581 self.codonposset = None # name of the charpartition that defines codon positions 582 583 # some defaults 584 self.options['gapmode'] = 'missing' 585 586 if input: 587 self.read(input) 588 else: 589 self.read(DEFAULTNEXUS)
590
591 - def get_original_taxon_order(self):
592 """Included for backwards compatibility (DEPRECATED).""" 593 return self.taxlabels
594
595 - def set_original_taxon_order(self, value):
596 """Included for backwards compatibility (DEPRECATED).""" 597 self.taxlabels = value
598 599 original_taxon_order = property(get_original_taxon_order, set_original_taxon_order) 600
601 - def read(self, input):
602 """Read and parse NEXUS input (a filename, file-handle, or string).""" 603 604 # 1. Assume we have the name of a file in the execution dir or a 605 # file-like object. 606 # Note we need to add parsing of the path to dir/filename 607 try: 608 with File.as_handle(input, 'rU') as fp: 609 file_contents = fp.read() 610 self.filename = getattr(fp, 'name', 'Unknown_nexus_file') 611 except (TypeError, IOError, AttributeError): 612 #2 Assume we have a string from a fh.read() 613 if isinstance(input, basestring): 614 file_contents = input 615 self.filename = 'input_string' 616 else: 617 print(input.strip()[:50]) 618 raise NexusError('Unrecognized input: %s ...' % input[:100]) 619 file_contents = file_contents.strip() 620 if file_contents.startswith('#NEXUS'): 621 file_contents = file_contents[6:] 622 commandlines = _get_command_lines(file_contents) 623 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 624 for i, cl in enumerate(commandlines): 625 try: 626 if cl[:6].upper() == '#NEXUS': 627 commandlines[i] = cl[6:].strip() 628 except: 629 pass 630 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 631 nexus_block_gen = self._get_nexus_block(commandlines) 632 while True: 633 try: 634 title, contents = next(nexus_block_gen) 635 except StopIteration: 636 break 637 if title in KNOWN_NEXUS_BLOCKS: 638 self._parse_nexus_block(title, contents) 639 else: 640 self._unknown_nexus_block(title, contents)
641
642 - def _get_nexus_block(self, file_contents):
643 """Generator for looping through Nexus blocks.""" 644 inblock = False 645 blocklines = [] 646 while file_contents: 647 cl = file_contents.pop(0) 648 if cl.lower().startswith('begin'): 649 if not inblock: 650 inblock = True 651 title = cl.split()[1].lower() 652 else: 653 raise NexusError('Illegal block nesting in block %s' % title) 654 elif cl.lower().startswith('end'): 655 if inblock: 656 inblock = False 657 yield title, blocklines 658 blocklines = [] 659 else: 660 raise NexusError('Unmatched \'end\'.') 661 elif inblock: 662 blocklines.append(cl)
663
664 - def _unknown_nexus_block(self, title, contents):
665 block = Block() 666 block.commandlines.append(contents) 667 block.title = title 668 self.unknown_blocks.append(block)
669
670 - def _parse_nexus_block(self, title, contents):
671 """Parse a known Nexus Block (PRIVATE).""" 672 # attached the structered block representation 673 self._apply_block_structure(title, contents) 674 #now check for taxa,characters,data blocks. If this stuff is defined more than once 675 #the later occurences will override the previous ones. 676 block = self.structured[-1] 677 for line in block.commandlines: 678 try: 679 getattr(self, '_' + line.command)(line.options) 680 except AttributeError: 681 raise NexusError('Unknown command: %s ' % line.command)
682
683 - def _title(self, options):
684 pass
685 688
689 - def _dimensions(self, options):
690 if 'ntax' in options: 691 self.ntax = eval(options['ntax']) 692 if 'nchar' in options: 693 self.nchar = eval(options['nchar'])
694
695 - def _format(self, options):
696 # print options 697 # we first need to test respectcase, then symbols (which depends on respectcase) 698 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 699 # dicts for ambiguous values and alphabet 700 if 'respectcase' in options: 701 self.respectcase = True 702 # adjust symbols to for respectcase 703 if 'symbols' in options: 704 self.symbols = options['symbols'] 705 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 706 (self.symbold.startswith("'") and self.symbols.endswith("'")): 707 self.symbols = self.symbols[1:-1].replace(' ', '') 708 if not self.respectcase: 709 self.symbols = self.symbols.lower() + self.symbols.upper() 710 self.symbols = list(set(self.symbols)) 711 if 'datatype' in options: 712 self.datatype = options['datatype'].lower() 713 if self.datatype == 'dna' or self.datatype == 'nucleotide': 714 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 715 self.ambiguous_values = IUPACData.ambiguous_dna_values.copy() 716 self.unambiguous_letters = IUPACData.unambiguous_dna_letters 717 elif self.datatype == 'rna': 718 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 719 self.ambiguous_values = IUPACData.ambiguous_rna_values.copy() 720 self.unambiguous_letters = IUPACData.unambiguous_rna_letters 721 elif self.datatype == 'protein': 722 #TODO - Should this not be ExtendedIUPACProtein? 723 self.alphabet = IUPAC.IUPACProtein() # fresh instance 724 self.ambiguous_values = {'B':'DN', 'Z':'EQ', 'X':IUPACData.protein_letters} 725 # that's how PAUP handles it 726 self.unambiguous_letters = IUPACData.protein_letters + '*' # stop-codon 727 elif self.datatype == 'standard': 728 raise NexusError('Datatype standard is not yet supported.') 729 #self.alphabet = None 730 #self.ambiguous_values = {} 731 #if not self.symbols: 732 # self.symbols = '01' # if nothing else defined, then 0 and 1 are the default states 733 #self.unambiguous_letters = self.symbols 734 else: 735 raise NexusError('Unsupported datatype: ' + self.datatype) 736 self.valid_characters = ''.join(self.ambiguous_values) + self.unambiguous_letters 737 if not self.respectcase: 738 self.valid_characters = self.valid_characters.lower() + self.valid_characters.upper() 739 #we have to sort the reverse ambig coding dict key characters: 740 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 741 rev=dict((i[1], i[0]) for i in self.ambiguous_values.items() if i[0]!='X') 742 self.rev_ambiguous_values = {} 743 for (k, v) in rev.items(): 744 key = sorted(c for c in k) 745 self.rev_ambiguous_values[''.join(key)] = v 746 #overwrite symbols for datype rna,dna,nucleotide 747 if self.datatype in ['dna', 'rna', 'nucleotide']: 748 self.symbols = self.alphabet.letters 749 if self.missing not in self.ambiguous_values: 750 self.ambiguous_values[self.missing] = self.unambiguous_letters+self.gap 751 self.ambiguous_values[self.gap] = self.gap 752 elif self.datatype == 'standard': 753 if not self.symbols: 754 self.symbols = ['1', '0'] 755 if 'missing' in options: 756 self.missing = options['missing'][0] 757 if 'gap' in options: 758 self.gap = options['gap'][0] 759 if 'equate' in options: 760 self.equate = options['equate'] 761 if 'matchchar' in options: 762 self.matchchar = options['matchchar'][0] 763 if 'labels' in options: 764 self.labels = options['labels'] 765 if 'transpose' in options: 766 raise NexusError('TRANSPOSE is not supported!') 767 self.transpose = True 768 if 'interleave' in options: 769 if options['interleave'] is None or options['interleave'].lower() == 'yes': 770 self.interleave = True 771 if 'tokens' in options: 772 self.tokens = True 773 if 'notokens' in options: 774 self.tokens = False
775
776 - def _set(self, options):
777 self.set = options
778
779 - def _options(self, options):
780 self.options = options
781
782 - def _eliminate(self, options):
783 self.eliminate = options
784
785 - def _taxlabels(self, options):
786 """Get taxon labels (PRIVATE). 787 788 As the taxon names are already in the matrix, this is superfluous 789 except for transpose matrices, which are currently unsupported anyway. 790 Thus, we ignore the taxlabels command to make handling of duplicate 791 taxon names easier. 792 """ 793 pass
794 #self.taxlabels = [] 795 #opts = CharBuffer(options) 796 #while True: 797 # taxon = quotestrip(opts.next_word()) 798 # if not taxon: 799 # break 800 # self.taxlabels.append(taxon) 801
802 - def _check_taxlabels(self, taxon):
803 """Check for presence of taxon in self.taxlabels.""" 804 # According to NEXUS standard, underscores shall be treated as spaces..., 805 # so checking for identity is more difficult 806 nextaxa = dict((t.replace(' ', '_'), t) for t in self.taxlabels) 807 nexid = taxon.replace(' ', '_') 808 return nextaxa.get(nexid)
809
810 - def _charlabels(self, options):
811 self.charlabels = {} 812 opts = CharBuffer(options) 813 while True: 814 # get id and state 815 w = opts.next_word() 816 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 817 break 818 identifier = self._resolve(w, set_type=CHARSET) 819 state = quotestrip(opts.next_word()) 820 self.charlabels[identifier] = state 821 # check for comma or end of command 822 c = opts.next_nonwhitespace() 823 if c is None: 824 break 825 elif c != ',': 826 raise NexusError('Missing \',\' in line %s.' % options)
827
828 - def _charstatelabels(self, options):
829 # warning: charstatelabels supports only charlabels-syntax! 830 self._charlabels(options)
831
832 - def _statelabels(self, options):
833 #self.charlabels = options 834 #print 'Command statelabels is not supported and will be ignored.' 835 pass
836
837 - def _matrix(self, options):
838 if not self.ntax or not self.nchar: 839 raise NexusError('Dimensions must be specified before matrix!') 840 self.matrix = {} 841 taxcount = 0 842 first_matrix_block = True 843 844 #eliminate empty lines and leading/trailing whitespace 845 lines = [l.strip() for l in options.split('\n') if l.strip() != ''] 846 lineiter = iter(lines) 847 while True: 848 try: 849 l = next(lineiter) 850 except StopIteration: 851 if taxcount < self.ntax: 852 raise NexusError('Not enough taxa in matrix.') 853 elif taxcount > self.ntax: 854 raise NexusError('Too many taxa in matrix.') 855 else: 856 break 857 # count the taxa and check for interleaved matrix 858 taxcount += 1 859 ##print taxcount 860 if taxcount > self.ntax: 861 if not self.interleave: 862 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 863 else: 864 taxcount = 1 865 first_matrix_block = False 866 #get taxon name and sequence 867 linechars = CharBuffer(l) 868 id = quotestrip(linechars.next_word()) 869 l = linechars.rest().strip() 870 chars = '' 871 if self.interleave: 872 #interleaved matrix 873 #print 'In interleave' 874 if l: 875 chars = ''.join(l.split()) 876 else: 877 chars = ''.join(next(lineiter).split()) 878 else: 879 #non-interleaved matrix 880 chars = ''.join(l.split()) 881 while len(chars)<self.nchar: 882 l = next(lineiter) 883 chars += ''.join(l.split()) 884 iupac_seq = Seq(_replace_parenthesized_ambigs(chars, self.rev_ambiguous_values), self.alphabet) 885 #first taxon has the reference sequence if matchhar is used 886 if taxcount == 1: 887 refseq = iupac_seq 888 else: 889 if self.matchchar: 890 while True: 891 p = str(iupac_seq).find(self.matchchar) 892 if p == -1: 893 break 894 iupac_seq = Seq(str(iupac_seq)[:p]+refseq[p]+str(iupac_seq)[p+1:], self.alphabet) 895 #check for invalid characters 896 for i, c in enumerate(str(iupac_seq)): 897 if c not in self.valid_characters and c != self.gap and c != self.missing: 898 raise NexusError("Taxon %s: Illegal character %s in sequence %s " 899 "(check dimensions/interleaving)" % (id, c, iupac_seq)) 900 #add sequence to matrix 901 if first_matrix_block: 902 self.unaltered_taxlabels.append(id) 903 id = _unique_label(list(self.matrix.keys()), id) 904 self.matrix[id] = iupac_seq 905 self.taxlabels.append(id) 906 else: 907 # taxon names need to be in the same order in each interleaved block 908 id = _unique_label(self.taxlabels[:taxcount-1], id) 909 taxon_present = self._check_taxlabels(id) 910 if taxon_present: 911 self.matrix[taxon_present] += iupac_seq 912 else: 913 raise NexusError("Taxon %s not in first block of interleaved " 914 "matrix. Check matrix dimensions and interleave." % id) 915 #check all sequences for length according to nchar 916 for taxon in self.matrix: 917 if len(self.matrix[taxon]) != self.nchar: 918 raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s' 919 % (self.nchar, len(self.matrix[taxon]), taxon)) 920 #check that taxlabels is identical with matrix.keys. If not, it's a problem 921 matrixkeys = sorted(self.matrix) 922 taxlabelssort = sorted(self.taxlabels[:]) 923 assert matrixkeys == taxlabelssort, \ 924 "ERROR: TAXLABELS must be identical with MATRIX. " + \ 925 "Please Report this as a bug, and send in data file."
926
927 - def _translate(self, options):
928 self.translate = {} 929 opts = CharBuffer(options) 930 while True: 931 try: 932 # get id and state 933 identifier = int(opts.next_word()) 934 label = quotestrip(opts.next_word()) 935 self.translate[identifier] = label 936 # check for comma or end of command 937 c = opts.next_nonwhitespace() 938 if c is None: 939 break 940 elif c != ',': 941 raise NexusError('Missing \',\' in line %s.' % options) 942 except NexusError: 943 raise 944 except: 945 raise NexusError('Format error in line %s.' % options)
946
947 - def _utree(self, options):
948 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 949 self._tree(options)
950
951 - def _tree(self, options):
952 opts = CharBuffer(options) 953 if opts.peek_nonwhitespace() == '*': 954 # a star can be used to make it the default tree in some software packages 955 dummy = opts.next_nonwhitespace() 956 name = opts.next_word() 957 if opts.next_nonwhitespace() != '=': 958 raise NexusError('Syntax error in tree description: %s' 959 % options[:50]) 960 rooted = False 961 weight = 1.0 962 while opts.peek_nonwhitespace() == '[': 963 open = opts.next_nonwhitespace() 964 symbol = next(opts) 965 if symbol != '&': 966 raise NexusError('Illegal special comment [%s...] in tree description: %s' 967 % (symbol, options[:50])) 968 special = next(opts) 969 value = opts.next_until(']') 970 closing = next(opts) 971 if special == 'R': 972 rooted = True 973 elif special == 'U': 974 rooted = False 975 elif special == 'W': 976 weight = float(value) 977 tree = Tree(name=name, weight=weight, rooted=rooted, tree=opts.rest().strip()) 978 # if there's an active translation table, translate 979 if self.translate: 980 for n in tree.get_terminals(): 981 try: 982 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 983 except (ValueError, KeyError): 984 raise NexusError('Unable to substitute %s using \'translate\' data.' 985 % tree.node(n).data.taxon) 986 self.trees.append(tree)
987
988 - def _apply_block_structure(self, title, lines):
989 block = Block('') 990 block.title = title 991 for line in lines: 992 block.commandlines.append(Commandline(line, title)) 993 self.structured.append(block)
994
995 - def _taxset(self, options):
996 name, taxa = self._get_indices(options, set_type=TAXSET) 997 self.taxsets[name] = _make_unique(taxa)
998
999 - def _charset(self, options):
1000 name, sites = self._get_indices(options, set_type=CHARSET) 1001 self.charsets[name] = _make_unique(sites)
1002
1003 - def _taxpartition(self, options):
1004 taxpartition = {} 1005 quotelevel = False 1006 opts = CharBuffer(options) 1007 name=self._name_n_vector(opts) 1008 if not name: 1009 raise NexusError('Formatting error in taxpartition: %s ' % options) 1010 # now collect thesubbpartitions and parse them 1011 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1012 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 1013 sub = '' 1014 while True: 1015 w = next(opts) 1016 if w is None or (w == ',' and not quotelevel): 1017 subname, subindices = self._get_indices(sub, set_type=TAXSET, separator=':') 1018 taxpartition[subname] = _make_unique(subindices) 1019 sub = '' 1020 if w is None: 1021 break 1022 else: 1023 if w == "'": 1024 quotelevel = not quotelevel 1025 sub += w 1026 self.taxpartitions[name] = taxpartition
1027
1028 - def _codonposset(self, options):
1029 """Read codon positions from a codons block as written from McClade. 1030 1031 Here codonposset is just a fancy name for a character partition with 1032 the name CodonPositions and the partitions N,1,2,3 1033 """ 1034 1035 prev_partitions = list(self.charpartitions.keys()) 1036 self._charpartition(options) 1037 # mcclade calls it CodonPositions, but you never know... 1038 codonname = [n for n in self.charpartitions if n not in prev_partitions] 1039 if codonname == [] or len(codonname) > 1: 1040 raise NexusError('Formatting Error in codonposset: %s ' % options) 1041 else: 1042 self.codonposset = codonname[0]
1043
1044 - def _codeset(self, options):
1045 pass
1046
1047 - def _charpartition(self, options):
1048 charpartition = {} 1049 quotelevel = False 1050 opts = CharBuffer(options) 1051 name = self._name_n_vector(opts) 1052 if not name: 1053 raise NexusError('Formatting error in charpartition: %s ' % options) 1054 # now collect thesubbpartitions and parse them 1055 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1056 sub = '' 1057 while True: 1058 w = next(opts) 1059 if w is None or (w == ',' and not quotelevel): 1060 subname, subindices = self._get_indices(sub, set_type=CHARSET, separator=':') 1061 charpartition[subname] = _make_unique(subindices) 1062 sub = '' 1063 if w is None: 1064 break 1065 else: 1066 if w == "'": 1067 quotelevel = not quotelevel 1068 sub += w 1069 self.charpartitions[name]=charpartition
1070
1071 - def _get_indices(self, options, set_type=CHARSET, separator='='):
1072 """Parse the taxset/charset specification (PRIVATE). 1073 1074 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1075 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1076 """ 1077 opts = CharBuffer(options) 1078 name = self._name_n_vector(opts, separator=separator) 1079 indices = self._parse_list(opts, set_type=set_type) 1080 if indices is None: 1081 raise NexusError('Formatting error in line: %s ' % options) 1082 return name, indices
1083
1084 - def _name_n_vector(self, opts, separator='='):
1085 """Extract name and check that it's not in vector format.""" 1086 rest = opts.rest() 1087 name = opts.next_word() 1088 # we ignore * before names 1089 if name == '*': 1090 name = opts.next_word() 1091 if not name: 1092 raise NexusError('Formatting error in line: %s ' % rest) 1093 name = quotestrip(name) 1094 if opts.peek_nonwhitespace == '(': 1095 open = opts.next_nonwhitespace() 1096 qualifier = open.next_word() 1097 close = opts.next_nonwhitespace() 1098 if qualifier.lower() == 'vector': 1099 raise NexusError('Unsupported VECTOR format in line %s' 1100 % (opts)) 1101 elif qualifier.lower() != 'standard': 1102 raise NexusError('Unknown qualifier %s in line %s' 1103 % (qualifier, opts)) 1104 if opts.next_nonwhitespace() != separator: 1105 raise NexusError('Formatting error in line: %s ' % rest) 1106 return name
1107
1108 - def _parse_list(self, options_buffer, set_type):
1109 """Parse a NEXUS list (PRIVATE). 1110 1111 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1112 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1113 """ 1114 plain_list = [] 1115 if options_buffer.peek_nonwhitespace(): 1116 try: 1117 # capture all possible exceptions and treat them as formatting 1118 # errors, if they are not NexusError 1119 while True: 1120 identifier = options_buffer.next_word() # next list element 1121 if not identifier: # end of list? 1122 break 1123 start = self._resolve(identifier, set_type=set_type) 1124 if options_buffer.peek_nonwhitespace() == '-': # followd by - 1125 end = start 1126 step = 1 1127 # get hyphen and end of range 1128 hyphen = options_buffer.next_nonwhitespace() 1129 end = self._resolve(options_buffer.next_word(), set_type=set_type) 1130 if set_type == CHARSET: 1131 if options_buffer.peek_nonwhitespace() == '\\': # followd by \ 1132 backslash = options_buffer.next_nonwhitespace() 1133 step = int(options_buffer.next_word()) # get backslash and step 1134 plain_list.extend(range(start, end+1, step)) 1135 else: 1136 if isinstance(start, list) or isinstance(end, list): 1137 raise NexusError('Name if character sets not allowed in range definition: %s' 1138 % identifier) 1139 start = self.taxlabels.index(start) 1140 end = self.taxlabels.index(end) 1141 taxrange = self.taxlabels[start:end+1] 1142 plain_list.extend(taxrange) 1143 else: 1144 if isinstance(start, list): # start was the name of charset or taxset 1145 plain_list.extend(start) 1146 else: # start was an ordinary identifier 1147 plain_list.append(start) 1148 except NexusError: 1149 raise 1150 except: 1151 return None 1152 return plain_list
1153
1154 - def _resolve(self, identifier, set_type=None):
1155 """Translate identifier in list into character/taxon index. 1156 1157 Characters (which are referred to by their index in Nexus.py): 1158 Plain numbers are returned minus 1 (Nexus indices to python indices) 1159 Text identifiers are translated into their indices (if plain character identifiers), 1160 the first hit in charlabels is returned (charlabels don't need to be unique) 1161 or the range of indices is returned (if names of character sets). 1162 Taxa (which are referred to by their unique name in Nexus.py): 1163 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1164 Names are returned unchanged (if plain taxon identifiers), or the names in 1165 the corresponding taxon set is returned. 1166 """ 1167 identifier = quotestrip(identifier) 1168 if not set_type: 1169 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1170 if set_type == CHARSET: 1171 try: 1172 n = int(identifier) 1173 except ValueError: 1174 if self.charlabels and identifier in self.charlabels.values(): 1175 for k in self.charlabels: 1176 if self.charlabels[k] == identifier: 1177 return k 1178 elif self.charsets and identifier in self.charsets: 1179 return self.charsets[identifier] 1180 else: 1181 raise NexusError('Unknown character identifier: %s' 1182 % identifier) 1183 else: 1184 if n <= self.nchar: 1185 return n-1 1186 else: 1187 raise NexusError('Illegal character identifier: %d>nchar (=%d).' 1188 % (identifier, self.nchar)) 1189 elif set_type == TAXSET: 1190 try: 1191 n = int(identifier) 1192 except ValueError: 1193 taxlabels_id = self._check_taxlabels(identifier) 1194 if taxlabels_id: 1195 return taxlabels_id 1196 elif self.taxsets and identifier in self.taxsets: 1197 return self.taxsets[identifier] 1198 else: 1199 raise NexusError('Unknown taxon identifier: %s' 1200 % identifier) 1201 else: 1202 if n > 0 and n <= self.ntax: 1203 return self.taxlabels[n-1] 1204 else: 1205 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' 1206 % (identifier, self.ntax)) 1207 else: 1208 raise NexusError('Unknown set specification: %s.'% set_type)
1209
1210 - def _stateset(self, options):
1211 #Not implemented 1212 pass
1213
1214 - def _changeset(self, options):
1215 #Not implemented 1216 pass
1217
1218 - def _treeset(self, options):
1219 #Not implemented 1220 pass
1221
1222 - def _treepartition(self, options):
1223 #Not implemented 1224 pass
1225
1226 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, 1227 interleave=False, exclude=[], delete=[], 1228 charpartition=None, comment='', mrbayes=False):
1229 """Writes a nexus file for each partition in charpartition. 1230 1231 Only non-excluded characters and non-deleted taxa are included, 1232 just the data block is written. 1233 """ 1234 1235 if not matrix: 1236 matrix = self.matrix 1237 if not matrix: 1238 return 1239 if not filename: 1240 filename = self.filename 1241 if charpartition: 1242 pfilenames = {} 1243 for p in charpartition: 1244 total_exclude = [] + exclude 1245 total_exclude.extend(c for c in range(self.nchar) if c not in charpartition[p]) 1246 total_exclude = _make_unique(total_exclude) 1247 pcomment = comment + '\nPartition: ' + p + '\n' 1248 dot = filename.rfind('.') 1249 if dot > 0: 1250 pfilename = filename[:dot] + '_' + p + '.data' 1251 else: 1252 pfilename = filename+'_'+p 1253 pfilenames[p] = pfilename 1254 self.write_nexus_data(filename=pfilename, matrix=matrix, blocksize=blocksize, 1255 interleave=interleave, exclude=total_exclude, delete=delete, 1256 comment=pcomment, append_sets=False, mrbayes=mrbayes) 1257 return pfilenames 1258 else: 1259 fn=self.filename+'.data' 1260 self.write_nexus_data(filename=fn, matrix=matrix, blocksize=blocksize, 1261 interleave=interleave, exclude=exclude, delete=delete, 1262 comment=comment, append_sets=False, mrbayes=mrbayes) 1263 return fn
1264
1265 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[], 1266 blocksize=None, interleave=False, interleave_by_partition=False, 1267 comment=None, omit_NEXUS=False, append_sets=True, mrbayes=False, 1268 codons_block=True):
1269 """Writes a nexus file with data and sets block to a file or handle. 1270 1271 Character sets and partitions are appended by default, and are 1272 adjusted according to excluded characters (i.e. character sets 1273 still point to the same sites (not necessarily same positions), 1274 without including the deleted characters. 1275 1276 filename - Either a filename as a string (which will be opened, 1277 written to and closed), or a handle object (which will 1278 be written to but NOT closed). 1279 interleave_by_partition - Optional name of partition (string) 1280 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1281 start of the file is omitted. 1282 1283 Returns the filename/handle used to write the data. 1284 """ 1285 if not matrix: 1286 matrix = self.matrix 1287 if not matrix: 1288 return 1289 if not filename: 1290 filename = self.filename 1291 if [t for t in delete if not self._check_taxlabels(t)]: 1292 raise NexusError('Unknown taxa: %s' 1293 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1294 if interleave_by_partition: 1295 if not interleave_by_partition in self.charpartitions: 1296 raise NexusError('Unknown partition: %r' % interleave_by_partition) 1297 else: 1298 partition = self.charpartitions[interleave_by_partition] 1299 # we need to sort the partition names by starting position before we exclude characters 1300 names = _sort_keys_by_values(partition) 1301 newpartition = {} 1302 for p in partition: 1303 newpartition[p] = [c for c in partition[p] if c not in exclude] 1304 # how many taxa and how many characters are left? 1305 undelete = [taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1306 cropped_matrix = _seqmatrix2strmatrix(self.crop_matrix(matrix, exclude=exclude, delete=delete)) 1307 ntax_adjusted = len(undelete) 1308 nchar_adjusted = len(cropped_matrix[undelete[0]]) 1309 if not undelete or (undelete and undelete[0] == ''): 1310 return 1311 1312 with File.as_handle(filename, mode='w') as fh: 1313 if not omit_NEXUS: 1314 fh.write('#NEXUS\n') 1315 if comment: 1316 fh.write('[' + comment + ']\n') 1317 fh.write('begin data;\n') 1318 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1319 fh.write('\tformat datatype=' + self.datatype) 1320 if self.respectcase: 1321 fh.write(' respectcase') 1322 if self.missing: 1323 fh.write(' missing=' + self.missing) 1324 if self.gap: 1325 fh.write(' gap=' + self.gap) 1326 if self.matchchar: 1327 fh.write(' matchchar=' + self.matchchar) 1328 if self.labels: 1329 fh.write(' labels=' + self.labels) 1330 if self.equate: 1331 fh.write(' equate=' + self.equate) 1332 if interleave or interleave_by_partition: 1333 fh.write(' interleave') 1334 fh.write(';\n') 1335 #if self.taxlabels: 1336 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1337 if self.charlabels: 1338 newcharlabels = self._adjust_charlabels(exclude=exclude) 1339 clkeys = sorted(newcharlabels) 1340 fh.write('charlabels ' 1341 + ', '.join("%s %s" % (k+1, safename(newcharlabels[k])) for k in clkeys) 1342 + ';\n') 1343 fh.write('matrix\n') 1344 if not blocksize: 1345 if interleave: 1346 blocksize = 70 1347 else: 1348 blocksize = self.nchar 1349 # delete deleted taxa and ecxclude excluded characters... 1350 namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete) 1351 if interleave_by_partition: 1352 # interleave by partitions, but adjust partitions with regard to excluded characters 1353 seek = 0 1354 for p in names: 1355 fh.write('[%s: %s]\n' % (interleave_by_partition, p)) 1356 if len(newpartition[p]) > 0: 1357 for taxon in undelete: 1358 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength+1)) 1359 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1360 fh.write('\n') 1361 else: 1362 fh.write('[empty]\n\n') 1363 seek += len(newpartition[p]) 1364 elif interleave: 1365 for seek in range(0, nchar_adjusted, blocksize): 1366 for taxon in undelete: 1367 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength+1)) 1368 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1369 fh.write('\n') 1370 else: 1371 for taxon in undelete: 1372 if blocksize<nchar_adjusted: 1373 fh.write(safename(taxon, mrbayes=mrbayes)+'\n') 1374 else: 1375 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength+1)) 1376 taxon_seq = cropped_matrix[taxon] 1377 for seek in range(0, nchar_adjusted, blocksize): 1378 fh.write(taxon_seq[seek:seek+blocksize]+'\n') 1379 del taxon_seq 1380 fh.write(';\nend;\n') 1381 if append_sets: 1382 if codons_block: 1383 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, include_codons=False)) 1384 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, codons_only=True)) 1385 else: 1386 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes)) 1387 return filename
1388
1389 - def append_sets(self, exclude=[], delete=[], mrbayes=False, include_codons=True, codons_only=False):
1390 """Returns a sets block.""" 1391 if not self.charsets and not self.taxsets and not self.charpartitions: 1392 return '' 1393 if codons_only: 1394 setsb = ['\nbegin codons'] 1395 else: 1396 setsb = ['\nbegin sets'] 1397 # - now if characters have been excluded, the character sets need to be adjusted, 1398 # so that they still point to the right character positions 1399 # calculate a list of offsets: for each deleted character, the following character position 1400 # in the new file will have an additional offset of -1 1401 offset = 0 1402 offlist = [] 1403 for c in range(self.nchar): 1404 if c in exclude: 1405 offset += 1 1406 offlist.append(-1) # dummy value as these character positions are excluded 1407 else: 1408 offlist.append(c-offset) 1409 # now adjust each of the character sets 1410 if not codons_only: 1411 for n, ns in self.charsets.items(): 1412 cset = [offlist[c] for c in ns if c not in exclude] 1413 if cset: 1414 setsb.append('charset %s = %s' % (safename(n), _compact4nexus(cset))) 1415 for n, s in self.taxsets.items(): 1416 tset = [safename(t, mrbayes=mrbayes) for t in s if t not in delete] 1417 if tset: 1418 setsb.append('taxset %s = %s' % (safename(n), ' '.join(tset))) 1419 for n, p in self.charpartitions.items(): 1420 if not include_codons and n == CODONPOSITIONS: 1421 continue 1422 elif codons_only and n != CODONPOSITIONS: 1423 continue 1424 # as characters have been excluded, the partitions must be adjusted 1425 # if a partition is empty, it will be omitted from the charpartition command 1426 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1427 names = _sort_keys_by_values(p) 1428 newpartition = {} 1429 for sn in names: 1430 nsp = [offlist[c] for c in p[sn] if c not in exclude] 1431 if nsp: 1432 newpartition[sn] = nsp 1433 if newpartition: 1434 if include_codons and n == CODONPOSITIONS: 1435 command = 'codonposset' 1436 else: 1437 command = 'charpartition' 1438 setsb.append('%s %s = %s' % (command, safename(n), 1439 ', '.join('%s: %s' % (sn, _compact4nexus(newpartition[sn])) 1440 for sn in names if sn in newpartition))) 1441 # now write charpartititions, much easier than charpartitions 1442 for n, p in self.taxpartitions.items(): 1443 names = _sort_keys_by_values(p) 1444 newpartition = {} 1445 for sn in names: 1446 nsp = [t for t in p[sn] if t not in delete] 1447 if nsp: 1448 newpartition[sn]=nsp 1449 if newpartition: 1450 setsb.append('taxpartition %s = %s' % (safename(n), 1451 ', '.join('%s: %s' % (safename(sn), 1452 ' '.join(safename(x) for x in newpartition[sn])) 1453 for sn in names if sn in newpartition))) 1454 # add 'end' and return everything 1455 setsb.append('end;\n') 1456 if len(setsb) == 2: # begin and end only 1457 return '' 1458 else: 1459 return ';\n'.join(setsb)
1460
1461 - def export_fasta(self, filename=None, width=70):
1462 """Writes matrix into a fasta file.""" 1463 if not filename: 1464 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1465 filename = '.'.join(self.filename.split('.')[:-1])+'.fas' 1466 else: 1467 filename = self.filename+'.fas' 1468 with open(filename, 'w') as fh: 1469 for taxon in self.taxlabels: 1470 fh.write('>' + safename(taxon) + '\n') 1471 for i in range(0, len(str(self.matrix[taxon])), width): 1472 fh.write(str(self.matrix[taxon])[i:i+width] + '\n') 1473 return filename
1474
1475 - def export_phylip(self, filename=None):
1476 """Writes matrix into a PHYLIP file. 1477 1478 Note that this writes a relaxed PHYLIP format file, where the names 1479 are not truncated, nor checked for invalid characters.""" 1480 if not filename: 1481 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1482 filename = '.'.join(self.filename.split('.')[:-1])+'.phy' 1483 else: 1484 filename = self.filename+'.phy' 1485 with open(filename, 'w') as fh: 1486 fh.write('%d %d\n' % (self.ntax, self.nchar)) 1487 for taxon in self.taxlabels: 1488 fh.write('%s %s\n' % (safename(taxon), str(self.matrix[taxon]))) 1489 return filename
1490
1491 - def constant(self, matrix=None, delete=[], exclude=[]):
1492 """Return a list with all constant characters.""" 1493 if not matrix: 1494 matrix = self.matrix 1495 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1496 if not undelete: 1497 return None 1498 elif len(undelete) == 1: 1499 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1500 # get the first sequence and expand all ambiguous values 1501 constant = [(x, self.ambiguous_values.get(n.upper(), n.upper())) for 1502 x, n in enumerate(str(matrix[undelete[0]])) if x not in exclude] 1503 1504 for taxon in undelete[1:]: 1505 newconstant = [] 1506 for site in constant: 1507 #print '%d (paup=%d)' % (site[0],site[0]+1), 1508 seqsite = matrix[taxon][site[0]].upper() 1509 #print seqsite,'checked against',site[1],'\t', 1510 if seqsite == self.missing \ 1511 or (seqsite == self.gap and self.options['gapmode'].lower() == 'missing') \ 1512 or seqsite == site[1]: 1513 # missing or same as before -> ok 1514 newconstant.append(site) 1515 elif seqsite in site[1] \ 1516 or site[1] == self.missing \ 1517 or (self.options['gapmode'].lower() == 'missing' and site[1] == self.gap): 1518 # subset of an ambig or only missing in previous -> take subset 1519 newconstant.append((site[0], self.ambiguous_values.get(seqsite, seqsite))) 1520 elif seqsite in self.ambiguous_values: 1521 # is it an ambig: check the intersection with prev. values 1522 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1])) 1523 if intersect: 1524 newconstant.append((site[0], ''.join(intersect))) 1525 # print 'ok' 1526 #else: 1527 # print 'failed' 1528 #else: 1529 # print 'failed' 1530 constant = newconstant 1531 cpos = [s[0] for s in constant] 1532 return cpos
1533
1534 - def cstatus(self, site, delete=[], narrow=True):
1535 """Summarize character. 1536 1537 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1538 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1539 """ 1540 undelete = [t for t in self.taxlabels if t not in delete] 1541 if not undelete: 1542 return None 1543 cstatus = [] 1544 for t in undelete: 1545 c = self.matrix[t][