Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   7  """Nexus class. Parse the contents of a NEXUS file. 
   8   
   9  Based upon 'NEXUS: An extensible file format for systematic information' 
  10  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  11  """ 
  12  # For with in Python/Jython 2.5 
  13  from __future__ import with_statement 
  14   
  15  import copy 
  16  import math 
  17  import random 
  18  import sys 
  19   
  20  from Bio import File 
  21  from Bio.Alphabet import IUPAC 
  22  from Bio.Data import IUPACData 
  23  from Bio.Seq import Seq 
  24   
  25  from Trees import Tree 
  26   
  27  INTERLEAVE=70 
  28  SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition', 
  29          'matrix','tree', 'utree','translate','codonposset','title'] 
  30  KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons'] 
  31  PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' 
  32  MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  33  WHITESPACE=' \t\n' 
  34  #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments 
  35  SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored 
  36  CHARSET='chars' 
  37  TAXSET='taxa' 
  38  CODONPOSITIONS='codonpositions' 
  39  DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
  40   
  41   
42 -class NexusError(Exception):
43 pass
44 45
46 -class CharBuffer(object):
47 """Helps reading NEXUS-words and characters from a buffer."""
48 - def __init__(self,string):
49 if string: 50 self.buffer=list(string) 51 else: 52 self.buffer=[]
53
54 - def peek(self):
55 if self.buffer: 56 return self.buffer[0] 57 else: 58 return None
59
60 - def peek_nonwhitespace(self):
61 b=''.join(self.buffer).strip() 62 if b: 63 return b[0] 64 else: 65 return None
66
67 - def next(self):
68 if self.buffer: 69 return self.buffer.pop(0) 70 else: 71 return None
72
73 - def next_nonwhitespace(self):
74 while True: 75 p=self.next() 76 if p is None: 77 break 78 if p not in WHITESPACE: 79 return p 80 return None
81
82 - def skip_whitespace(self):
83 while self.buffer[0] in WHITESPACE: 84 self.buffer=self.buffer[1:]
85
86 - def next_until(self,target):
87 for t in target: 88 try: 89 pos=self.buffer.index(t) 90 except ValueError: 91 pass 92 else: 93 found=''.join(self.buffer[:pos]) 94 self.buffer=self.buffer[pos:] 95 return found 96 else: 97 return None
98
99 - def peek_word(self,word):
100 return ''.join(self.buffer[:len(word)])==word
101
102 - def next_word(self):
103 """Return the next NEXUS word from a string. 104 105 This deals with single and double quotes, whitespace and punctuation. 106 """ 107 108 word=[] 109 quoted=False 110 first=self.next_nonwhitespace() # get first character 111 if not first: # return empty if only whitespace left 112 return None 113 word.append(first) 114 if first=="'": # word starts with a quote 115 quoted="'" 116 elif first=='"': 117 quoted='"' 118 elif first in PUNCTUATION: # if it's punctuation, return immediately 119 return first 120 while True: 121 c=self.peek() 122 if c==quoted: # a quote? 123 word.append(self.next()) # store quote 124 if self.peek()==quoted: # double quote 125 skip=self.next() # skip second quote 126 elif quoted: # second single quote ends word 127 break 128 elif quoted: 129 word.append(self.next()) # if quoted, then add anything 130 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop 131 break 132 else: 133 word.append(self.next()) # standard character 134 return ''.join(word)
135
136 - def rest(self):
137 """Return the rest of the string without parsing.""" 138 return ''.join(self.buffer)
139 140
141 -class StepMatrix(object):
142 """Calculate a stepmatrix for weighted parsimony. 143 144 See Wheeler (1990), Cladistics 6:269-275. 145 """ 146
147 - def __init__(self,symbols,gap):
148 self.data={} 149 self.symbols=[s for s in symbols] 150 self.symbols.sort() 151 if gap: 152 self.symbols.append(gap) 153 for x in self.symbols: 154 for y in [s for s in self.symbols if s!=x]: 155 self.set(x,y,0)
156
157 - def set(self,x,y,value):
158 if x>y: 159 x,y=y,x 160 self.data[x+y]=value
161
162 - def add(self,x,y,value):
163 if x>y: 164 x,y=y,x 165 self.data[x+y]+=value
166
167 - def sum(self):
168 return reduce(lambda x,y:x+y,self.data.values())
169
170 - def transformation(self):
171 total=self.sum() 172 if total!=0: 173 for k in self.data: 174 self.data[k]=self.data[k]/float(total) 175 return self
176
177 - def weighting(self):
178 for k in self.data: 179 if self.data[k]!=0: 180 self.data[k]=-math.log(self.data[k]) 181 return self
182
183 - def smprint(self,name='your_name_here'):
184 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) 185 matrix+=' %s\n' % ' '.join(self.symbols) 186 for x in self.symbols: 187 matrix+='[%s]'.ljust(8) % x 188 for y in self.symbols: 189 if x==y: 190 matrix+=' . ' 191 else: 192 if x>y: 193 x1,y1=y,x 194 else: 195 x1,y1=x,y 196 if self.data[x1+y1]==0: 197 matrix+='inf. ' 198 else: 199 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) 200 matrix+='\n' 201 matrix+=';\n' 202 return matrix
203 204
205 -def safename(name,mrbayes=False):
206 """Return a taxon identifier according to NEXUS standard. 207 208 Wrap quotes around names with punctuation or whitespace, and double 209 single quotes. 210 211 mrbayes=True: write names without quotes, whitespace or punctuation 212 for the mrbayes software package. 213 """ 214 if mrbayes: 215 safe=name.replace(' ','_') 216 safe=''.join([c for c in safe if c in MRBAYESSAFE]) 217 else: 218 safe=name.replace("'","''") 219 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)): 220 safe="'"+safe+"'" 221 return safe
222 223
224 -def quotestrip(word):
225 """Remove quotes and/or double quotes around identifiers.""" 226 if not word: 227 return None 228 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 229 word=word[1:-1] 230 return word
231 232
233 -def get_start_end(sequence, skiplist=['-','?']):
234 """Return position of first and last character which is not in skiplist. 235 236 Skiplist defaults to ['-','?']).""" 237 238 length=len(sequence) 239 if length==0: 240 return None,None 241 end=length-1 242 while end>=0 and (sequence[end] in skiplist): 243 end-=1 244 start=0 245 while start<length and (sequence[start] in skiplist): 246 start+=1 247 if start==length and end==-1: # empty sequence 248 return -1,-1 249 else: 250 return start,end
251 252
253 -def _sort_keys_by_values(p):
254 """Returns a sorted list of keys of p sorted by values of p.""" 255 startpos=[(p[pn],pn) for pn in p if p[pn]] 256 startpos.sort() 257 # parenthisis added because of py3k 258 return (zip(*startpos))[1]
259 260
261 -def _make_unique(l):
262 """Check that all values in list are unique and return a pruned and sorted list.""" 263 l=list(set(l)) 264 l.sort() 265 return l
266 267
268 -def _unique_label(previous_labels,label):
269 """Returns a unique name if label is already in previous_labels.""" 270 while label in previous_labels: 271 if label.split('.')[-1].startswith('copy'): 272 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1) 273 else: 274 label+='.copy' 275 return label
276 277
278 -def _seqmatrix2strmatrix(matrix):
279 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 280 return dict([(t, str(matrix[t])) for t in matrix])
281 282
283 -def _compact4nexus(orig_list):
284 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 285 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 286 """ 287 288 if not orig_list: 289 return '' 290 orig_list=list(set(orig_list)) 291 orig_list.sort() 292 shortlist=[] 293 clist=orig_list[:] 294 clist.append(clist[-1]+.5) # dummy value makes it easier 295 while len(clist)>1: 296 step=1 297 for i,x in enumerate(clist): 298 if x==clist[0]+i*step: # are we still in the right step? 299 continue 300 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: 301 # second element, and possibly at least 3 elements to link, 302 # and the next one is in the right step 303 step=x-clist[0] 304 else: # pattern broke, add all values before current position to new list 305 sub=clist[:i] 306 if len(sub)==1: 307 shortlist.append(str(sub[0]+1)) 308 else: 309 if step==1: 310 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) 311 else: 312 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) 313 clist=clist[i:] 314 break 315 return ' '.join(shortlist)
316 317
318 -def combine(matrices):
319 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 320 321 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 322 Character sets, character partitions and taxon sets are prefixed, readjusted 323 and present in the combined matrix. 324 """ 325 326 if not matrices: 327 return None 328 name=matrices[0][0] 329 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 330 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1) 331 if mixed_datatypes: 332 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! 333 # raise NexusError('Matrices must be of same datatype') 334 combined.charlabels=None 335 combined.statelabels=None 336 combined.interleave=False 337 combined.translate=None 338 339 # rename taxon sets and character sets and name them with prefix 340 for cn,cs in combined.charsets.iteritems(): 341 combined.charsets['%s.%s' % (name,cn)]=cs 342 del combined.charsets[cn] 343 for tn,ts in combined.taxsets.iteritems(): 344 combined.taxsets['%s.%s' % (name,tn)]=ts 345 del combined.taxsets[tn] 346 # previous partitions usually don't make much sense in combined matrix 347 # just initiate one new partition parted by single matrices 348 combined.charpartitions={'combined':{name:range(combined.nchar)}} 349 for n,m in matrices[1:]: # add all other matrices 350 both=[t for t in combined.taxlabels if t in m.taxlabels] 351 combined_only=[t for t in combined.taxlabels if t not in both] 352 m_only=[t for t in m.taxlabels if t not in both] 353 for t in both: 354 # concatenate sequences and unify gap and missing character symbols 355 combined.matrix[t]+=Seq(str(m.matrix[t]).replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 356 # replace date of missing taxa with symbol for missing data 357 for t in combined_only: 358 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) 359 for t in m_only: 360 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ 361 Seq(str(m.matrix[t]).replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 362 combined.taxlabels.extend(m_only) # new taxon list 363 for cn,cs in m.charsets.iteritems(): # adjust character sets for new matrix 364 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] 365 if m.taxsets: 366 if not combined.taxsets: 367 combined.taxsets={} 368 # update taxon sets 369 combined.taxsets.update(dict(('%s.%s' % (n,tn),ts) 370 for tn,ts in m.taxsets.iteritems())) 371 # update new charpartition 372 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) 373 # update charlabels 374 if m.charlabels: 375 if not combined.charlabels: 376 combined.charlabels={} 377 combined.charlabels.update(dict((combined.nchar+i,label) 378 for (i,label) in m.charlabels.iteritems())) 379 combined.nchar+=m.nchar # update nchar and ntax 380 combined.ntax+=len(m_only) 381 382 # some prefer partitions, some charsets: 383 # make separate charset for ecah initial dataset 384 for c in combined.charpartitions['combined']: 385 combined.charsets[c]=combined.charpartitions['combined'][c] 386 387 return combined
388 389
390 -def _kill_comments_and_break_lines(text):
391 """Delete []-delimited comments out of a file and break into lines separated by ';'. 392 393 stripped_text=_kill_comments_and_break_lines(text): 394 Nested and multiline comments are allowed. [ and ] symbols within single 395 or double quotes are ignored, newline ends a quote, all symbols with quotes are 396 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 397 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 398 Quotes inside special [& and [\ are treated as normal characters, 399 but no nesting inside these special comments allowed (like [& [\ ]]). 400 ';' ist deleted from end of line. 401 402 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 403 """ 404 contents=iter(text) 405 newtext=[] 406 newline=[] 407 quotelevel='' 408 speciallevel=False 409 commlevel=0 410 #Parse with one character look ahead (for special comments) 411 t2 = contents.next() 412 while True: 413 t = t2 414 try: 415 t2 = contents.next() 416 except StopIteration: 417 t2 = None 418 if t is None: 419 break 420 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation 421 quotelevel='' 422 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation 423 quotelevel=t 424 elif not quotelevel and t=='[': # opening bracket outside a quote 425 if t2 in SPECIALCOMMENTS and commlevel==0 and not speciallevel: 426 speciallevel=True 427 else: 428 commlevel+=1 429 elif not quotelevel and t==']': # closing bracket ioutside a quote 430 if speciallevel: 431 speciallevel=False 432 else: 433 commlevel-=1 434 if commlevel<0: 435 raise NexusError('Nexus formatting error: unmatched ]') 436 continue 437 if commlevel==0: # copy if we're not in comment 438 if t==';' and not quotelevel: 439 newtext.append(''.join(newline)) 440 newline=[] 441 else: 442 newline.append(t) 443 #level of comments should be 0 at the end of the file 444 if newline: 445 newtext.append('\n'.join(newline)) 446 if commlevel>0: 447 raise NexusError('Nexus formatting error: unmatched [') 448 return newtext
449 450
451 -def _adjust_lines(lines):
452 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 453 454 list_of_commandlines=_adjust_lines(input_text) 455 Lines are adjusted so that no linebreaks occur within a commandline 456 (except matrix command line) 457 """ 458 formatted_lines=[] 459 for l in lines: 460 #Convert line endings 461 l=l.replace('\r\n','\n').replace('\r','\n').strip() 462 if l.lower().startswith('matrix'): 463 formatted_lines.append(l) 464 else: 465 l=l.replace('\n',' ') 466 if l: 467 formatted_lines.append(l) 468 return formatted_lines
469 470
471 -def _replace_parenthesized_ambigs(seq,rev_ambig_values):
472 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 473 474 opening=seq.find('(') 475 while opening>-1: 476 closing=seq.find(')') 477 if closing<0: 478 raise NexusError('Missing closing parenthesis in: '+seq) 479 elif closing<opening: 480 raise NexusError('Missing opening parenthesis in: '+seq) 481 ambig=[x for x in seq[opening+1:closing]] 482 ambig.sort() 483 ambig=''.join(ambig) 484 ambig_code=rev_ambig_values[ambig.upper()] 485 if ambig!=ambig.upper(): 486 ambig_code=ambig_code.lower() 487 seq=seq[:opening]+ambig_code+seq[closing+1:] 488 opening=seq.find('(') 489 return seq
490 491
492 -class Commandline(object):
493 """Represent a commandline as command and options.""" 494
495 - def __init__(self, line, title):
496 self.options={} 497 options=[] 498 self.command=None 499 try: 500 #Assume matrix (all other command lines have been stripped of \n) 501 self.command, options = line.strip().split('\n', 1) 502 except ValueError: # Not matrix 503 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 504 self.command=line.split()[0] 505 options=' '.join(line.split()[1:]) 506 self.command = self.command.strip().lower() 507 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved 508 self.options=options.strip() 509 else: 510 if len(options) > 0: 511 try: 512 options = options.replace('=', ' = ').split() 513 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] 514 indices = [] 515 for sl in valued_indices: 516 indices.extend(sl) 517 token_indices = [n for n in range(len(options)) if n not in indices] 518 for opt in valued_indices: 519 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 520 self.options[options[opt[0]].lower()] = options[opt[2]] 521 for token in token_indices: 522 self.options[options[token].lower()] = None 523 except ValueError: 524 raise NexusError('Incorrect formatting in line: %s' % line)
525 526
527 -class Block(object):
528 """Represent a NEXUS block with block name and list of commandlines."""
529 - def __init__(self,title=None):
530 self.title=title 531 self.commandlines=[]
532 533
534 -class Nexus(object):
535
536 - def __init__(self, input=None):
537 self.ntax=0 # number of taxa 538 self.nchar=0 # number of characters 539 self.unaltered_taxlabels=[] # taxlabels as the appear in the input file (incl. duplicates, etc.) 540 self.taxlabels=[] # labels for taxa, ordered by their id 541 self.charlabels=None # ... and for characters 542 self.statelabels=None # ... and for states 543 self.datatype='dna' # (standard), dna, rna, nucleotide, protein 544 self.respectcase=False # case sensitivity 545 self.missing='?' # symbol for missing characters 546 self.gap='-' # symbol for gap 547 self.symbols=None # set of symbols 548 self.equate=None # set of symbol synonyms 549 self.matchchar=None # matching char for matrix representation 550 self.labels=None # left, right, no 551 self.transpose=False # whether matrix is transposed 552 self.interleave=False # whether matrix is interleaved 553 self.tokens=False # unsupported 554 self.eliminate=None # unsupported 555 self.matrix=None # ... 556 self.unknown_blocks=[] # blocks we don't care about 557 self.taxsets={} 558 self.charsets={} 559 self.charpartitions={} 560 self.taxpartitions={} 561 self.trees=[] # list of Trees (instances of Tree class) 562 self.translate=None # Dict to translate taxon <-> taxon numbers 563 self.structured=[] # structured input representation 564 self.set={} # dict of the set command to set various options 565 self.options={} # dict of the options command in the data block 566 self.codonposset=None # name of the charpartition that defines codon positions 567 568 # some defaults 569 self.options['gapmode']='missing' 570 571 if input: 572 self.read(input) 573 else: 574 self.read(DEFAULTNEXUS)
575
576 - def get_original_taxon_order(self):
577 """Included for backwards compatibility (DEPRECATED).""" 578 return self.taxlabels
579
580 - def set_original_taxon_order(self,value):
581 """Included for backwards compatibility (DEPRECATED).""" 582 self.taxlabels=value
583 584 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) 585
586 - def read(self,input):
587 """Read and parse NEXUS input (a filename, file-handle, or string).""" 588 589 # 1. Assume we have the name of a file in the execution dir or a 590 # file-like object. 591 # Note we need to add parsing of the path to dir/filename 592 try: 593 with File.as_handle(input, 'rU') as fp: 594 file_contents = fp.read() 595 self.filename = getattr(fp, 'name', 'Unknown_nexus_file') 596 except (TypeError,IOError,AttributeError): 597 #2 Assume we have a string from a fh.read() 598 if isinstance(input, basestring): 599 file_contents = input 600 self.filename='input_string' 601 else: 602 print input.strip()[:50] 603 raise NexusError('Unrecognized input: %s ...' % input[:100]) 604 file_contents=file_contents.strip() 605 if file_contents.startswith('#NEXUS'): 606 file_contents=file_contents[6:] 607 commandlines=_get_command_lines(file_contents) 608 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 609 for i,cl in enumerate(commandlines): 610 try: 611 if cl[:6].upper()=='#NEXUS': 612 commandlines[i]=cl[6:].strip() 613 except: 614 pass 615 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 616 nexus_block_gen = self._get_nexus_block(commandlines) 617 while 1: 618 try: 619 title, contents = nexus_block_gen.next() 620 except StopIteration: 621 break 622 if title in KNOWN_NEXUS_BLOCKS: 623 self._parse_nexus_block(title, contents) 624 else: 625 self._unknown_nexus_block(title, contents)
626
627 - def _get_nexus_block(self,file_contents):
628 """Generator for looping through Nexus blocks.""" 629 inblock=False 630 blocklines=[] 631 while file_contents: 632 cl=file_contents.pop(0) 633 if cl.lower().startswith('begin'): 634 if not inblock: 635 inblock=True 636 title=cl.split()[1].lower() 637 else: 638 raise NexusError('Illegal block nesting in block %s' % title) 639 elif cl.lower().startswith('end'): 640 if inblock: 641 inblock=False 642 yield title,blocklines 643 blocklines=[] 644 else: 645 raise NexusError('Unmatched \'end\'.') 646 elif inblock: 647 blocklines.append(cl)
648
649 - def _unknown_nexus_block(self,title, contents):
650 block = Block() 651 block.commandlines.append(contents) 652 block.title = title 653 self.unknown_blocks.append(block)
654
655 - def _parse_nexus_block(self,title, contents):
656 """Parse a known Nexus Block (PRIVATE).""" 657 # attached the structered block representation 658 self._apply_block_structure(title, contents) 659 #now check for taxa,characters,data blocks. If this stuff is defined more than once 660 #the later occurences will override the previous ones. 661 block=self.structured[-1] 662 for line in block.commandlines: 663 try: 664 getattr(self,'_'+line.command)(line.options) 665 except AttributeError: 666 raise 667 raise NexusError('Unknown command: %s ' % line.command)
668
669 - def _title(self,options):
670 pass
671
672 - def _dimensions(self,options):
673 if 'ntax' in options: 674 self.ntax=eval(options['ntax']) 675 if 'nchar' in options: 676 self.nchar=eval(options['nchar'])
677
678 - def _format(self,options):
679 # print options 680 # we first need to test respectcase, then symbols (which depends on respectcase) 681 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 682 # dicts for ambiguous values and alphabet 683 if 'respectcase' in options: 684 self.respectcase=True 685 # adjust symbols to for respectcase 686 if 'symbols' in options: 687 self.symbols=options['symbols'] 688 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 689 (self.symbold.startswith("'") and self.symbols.endswith("'")): 690 self.symbols=self.symbols[1:-1].replace(' ','') 691 if not self.respectcase: 692 self.symbols=self.symbols.lower()+self.symbols.upper() 693 self.symbols=list(set(self.symbols)) 694 if 'datatype' in options: 695 self.datatype=options['datatype'].lower() 696 if self.datatype=='dna' or self.datatype=='nucleotide': 697 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna) 698 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values) 699 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters) 700 elif self.datatype=='rna': 701 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna) 702 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values) 703 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters) 704 elif self.datatype=='protein': 705 self.alphabet=copy.deepcopy(IUPAC.protein) 706 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it 707 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon 708 elif self.datatype=='standard': 709 raise NexusError('Datatype standard is not yet supported.') 710 #self.alphabet=None 711 #self.ambiguous_values={} 712 #if not self.symbols: 713 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states 714 #self.unambiguous_letters=self.symbols 715 else: 716 raise NexusError('Unsupported datatype: '+self.datatype) 717 self.valid_characters=''.join(self.ambiguous_values)+self.unambiguous_letters 718 if not self.respectcase: 719 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() 720 #we have to sort the reverse ambig coding dict key characters: 721 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 722 rev=dict((i[1],i[0]) for i in self.ambiguous_values.iteritems() if i[0]!='X') 723 self.rev_ambiguous_values={} 724 for (k,v) in rev.iteritems(): 725 key=[c for c in k] 726 key.sort() 727 self.rev_ambiguous_values[''.join(key)]=v 728 #overwrite symbols for datype rna,dna,nucleotide 729 if self.datatype in ['dna','rna','nucleotide']: 730 self.symbols=self.alphabet.letters 731 if self.missing not in self.ambiguous_values: 732 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap 733 self.ambiguous_values[self.gap]=self.gap 734 elif self.datatype=='standard': 735 if not self.symbols: 736 self.symbols=['1','0'] 737 if 'missing' in options: 738 self.missing=options['missing'][0] 739 if 'gap' in options: 740 self.gap=options['gap'][0] 741 if 'equate' in options: 742 self.equate=options['equate'] 743 if 'matchchar' in options: 744 self.matchchar=options['matchchar'][0] 745 if 'labels' in options: 746 self.labels=options['labels'] 747 if 'transpose' in options: 748 raise NexusError('TRANSPOSE is not supported!') 749 self.transpose=True 750 if 'interleave' in options: 751 if options['interleave'] is None or options['interleave'].lower()=='yes': 752 self.interleave=True 753 if 'tokens' in options: 754 self.tokens=True 755 if 'notokens' in options: 756 self.tokens=False
757
758 - def _set(self,options):
759 self.set=options
760
761 - def _options(self,options):
762 self.options=options
763
764 - def _eliminate(self,options):
765 self.eliminate=options
766
767 - def _taxlabels(self,options):
768 """Get taxon labels (PRIVATE). 769 770 As the taxon names are already in the matrix, this is superfluous 771 except for transpose matrices, which are currently unsupported anyway. 772 Thus, we ignore the taxlabels command to make handling of duplicate 773 taxon names easier. 774 """ 775 pass
776 #self.taxlabels=[] 777 #opts=CharBuffer(options) 778 #while True: 779 # taxon=quotestrip(opts.next_word()) 780 # if not taxon: 781 # break 782 # self.taxlabels.append(taxon) 783
784 - def _check_taxlabels(self,taxon):
785 """Check for presence of taxon in self.taxlabels.""" 786 # According to NEXUS standard, underscores shall be treated as spaces..., 787 # so checking for identity is more difficult 788 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) 789 nexid=taxon.replace(' ','_') 790 return nextaxa.get(nexid)
791
792 - def _charlabels(self,options):
793 self.charlabels={} 794 opts=CharBuffer(options) 795 while True: 796 try: 797 # get id and state 798 w=opts.next_word() 799 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 800 break 801 identifier=self._resolve(w,set_type=CHARSET) 802 state=quotestrip(opts.next_word()) 803 self.charlabels[identifier]=state 804 # check for comma or end of command 805 c=opts.next_nonwhitespace() 806 if c is None: 807 break 808 elif c!=',': 809 raise NexusError('Missing \',\' in line %s.' % options) 810 except NexusError: 811 raise 812 except: 813 raise NexusError('Format error in line %s.' % options)
814
815 - def _charstatelabels(self,options):
816 # warning: charstatelabels supports only charlabels-syntax! 817 self._charlabels(options)
818
819 - def _statelabels(self,options):
820 #self.charlabels=options 821 #print 'Command statelabels is not supported and will be ignored.' 822 pass
823
824 - def _matrix(self,options):
825 if not self.ntax or not self.nchar: 826 raise NexusError('Dimensions must be specified before matrix!') 827 self.matrix={} 828 taxcount=0 829 first_matrix_block=True 830 831 #eliminate empty lines and leading/trailing whitespace 832 lines=[l.strip() for l in options.split('\n') if l.strip()!=''] 833 lineiter=iter(lines) 834 while 1: 835 try: 836 l=lineiter.next() 837 except StopIteration: 838 if taxcount<self.ntax: 839 raise NexusError('Not enough taxa in matrix.') 840 elif taxcount>self.ntax: 841 raise NexusError('Too many taxa in matrix.') 842 else: 843 break 844 # count the taxa and check for interleaved matrix 845 taxcount+=1 846 ##print taxcount 847 if taxcount>self.ntax: 848 if not self.interleave: 849 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 850 else: 851 taxcount=1 852 first_matrix_block=False 853 #get taxon name and sequence 854 linechars=CharBuffer(l) 855 id=quotestrip(linechars.next_word()) 856 l=linechars.rest().strip() 857 chars='' 858 if self.interleave: 859 #interleaved matrix 860 #print 'In interleave' 861 if l: 862 chars=''.join(l.split()) 863 else: 864 chars=''.join(lineiter.next().split()) 865 else: 866 #non-interleaved matrix 867 chars=''.join(l.split()) 868 while len(chars)<self.nchar: 869 l=lineiter.next() 870 chars+=''.join(l.split()) 871 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) 872 #first taxon has the reference sequence if matchhar is used 873 if taxcount==1: 874 refseq=iupac_seq 875 else: 876 if self.matchchar: 877 while 1: 878 p=str(iupac_seq).find(self.matchchar) 879 if p==-1: 880 break 881 iupac_seq=Seq(str(iupac_seq)[:p]+refseq[p]+str(iupac_seq)[p+1:],self.alphabet) 882 #check for invalid characters 883 for i,c in enumerate(str(iupac_seq)): 884 if c not in self.valid_characters and c!=self.gap and c!=self.missing: 885 raise NexusError( 886 ('Taxon %s: Illegal character %s in sequence %s ' + 887 '(check dimensions/interleaving)') % (id,c, iupac_seq)) 888 #add sequence to matrix 889 if first_matrix_block: 890 self.unaltered_taxlabels.append(id) 891 id=_unique_label(self.matrix.keys(),id) 892 self.matrix[id]=iupac_seq 893 self.taxlabels.append(id) 894 else: 895 # taxon names need to be in the same order in each interleaved block 896 id=_unique_label(self.taxlabels[:taxcount-1],id) 897 taxon_present=self._check_taxlabels(id) 898 if taxon_present: 899 self.matrix[taxon_present]+=iupac_seq 900 else: 901 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id) 902 #check all sequences for length according to nchar 903 for taxon in self.matrix: 904 if len(self.matrix[taxon])!=self.nchar: 905 raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s' 906 % (self.nchar, len(self.matrix[taxon]),taxon)) 907 #check that taxlabels is identical with matrix.keys. If not, it's a problem 908 matrixkeys=sorted(self.matrix) 909 taxlabelssort=self.taxlabels[:] 910 taxlabelssort.sort() 911 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
912
913 - def _translate(self,options):
914 self.translate={} 915 opts=CharBuffer(options) 916 while True: 917 try: 918 # get id and state 919 identifier=int(opts.next_word()) 920 label=quotestrip(opts.next_word()) 921 self.translate[identifier]=label 922 # check for comma or end of command 923 c=opts.next_nonwhitespace() 924 if c is None: 925 break 926 elif c!=',': 927 raise NexusError('Missing \',\' in line %s.' % options) 928 except NexusError: 929 raise 930 except: 931 raise NexusError('Format error in line %s.' % options)
932
933 - def _utree(self,options):
934 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 935 self._tree(options)
936
937 - def _tree(self,options):
938 opts=CharBuffer(options) 939 if opts.peek_nonwhitespace()=='*': # a star can be used to make it the default tree in some software packages 940 dummy=opts.next_nonwhitespace() 941 name=opts.next_word() 942 if opts.next_nonwhitespace()!='=': 943 raise NexusError('Syntax error in tree description: %s' 944 % options[:50]) 945 rooted=False 946 weight=1.0 947 while opts.peek_nonwhitespace()=='[': 948 open=opts.next_nonwhitespace() 949 symbol=opts.next() 950 if symbol!='&': 951 raise NexusError('Illegal special comment [%s...] in tree description: %s' 952 % (symbol, options[:50])) 953 special=opts.next() 954 value=opts.next_until(']') 955 closing=opts.next() 956 if special=='R': 957 rooted=True 958 elif special=='U': 959 rooted=False 960 elif special=='W': 961 weight=float(value) 962 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) 963 # if there's an active translation table, translate 964 if self.translate: 965 for n in tree.get_terminals(): 966 try: 967 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 968 except (ValueError,KeyError): 969 raise NexusError('Unable to substitute %s using \'translate\' data.' 970 % tree.node(n).data.taxon) 971 self.trees.append(tree)
972
973 - def _apply_block_structure(self,title,lines):
974 block=Block('') 975 block.title = title 976 for line in lines: 977 block.commandlines.append(Commandline(line, title)) 978 self.structured.append(block)
979
980 - def _taxset(self, options):
981 name,taxa=self._get_indices(options,set_type=TAXSET) 982 self.taxsets[name]=_make_unique(taxa)
983
984 - def _charset(self, options):
985 name,sites=self._get_indices(options,set_type=CHARSET) 986 self.charsets[name]=_make_unique(sites)
987
988 - def _taxpartition(self, options):
989 taxpartition={} 990 quotelevel=False 991 opts=CharBuffer(options) 992 name=self._name_n_vector(opts) 993 if not name: 994 raise NexusError('Formatting error in taxpartition: %s ' % options) 995 # now collect thesubbpartitions and parse them 996 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 997 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 998 sub='' 999 while True: 1000 w=opts.next() 1001 if w is None or (w==',' and not quotelevel): 1002 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') 1003 taxpartition[subname]=_make_unique(subindices) 1004 sub='' 1005 if w is None: 1006 break 1007 else: 1008 if w=="'": 1009 quotelevel=not quotelevel 1010 sub+=w 1011 self.taxpartitions[name]=taxpartition
1012
1013 - def _codonposset(self,options):
1014 """Read codon positions from a codons block as written from McClade. 1015 1016 Here codonposset is just a fancy name for a character partition with 1017 the name CodonPositions and the partitions N,1,2,3 1018 """ 1019 1020 prev_partitions=self.charpartitions.keys() 1021 self._charpartition(options) 1022 # mcclade calls it CodonPositions, but you never know... 1023 codonname=[n for n in self.charpartitions if n not in prev_partitions] 1024 if codonname==[] or len(codonname)>1: 1025 raise NexusError('Formatting Error in codonposset: %s ' % options) 1026 else: 1027 self.codonposset=codonname[0]
1028
1029 - def _codeset(self,options):
1030 pass
1031
1032 - def _charpartition(self, options):
1033 charpartition={} 1034 quotelevel=False 1035 opts=CharBuffer(options) 1036 name=self._name_n_vector(opts) 1037 if not name: 1038 raise NexusError('Formatting error in charpartition: %s ' % options) 1039 # now collect thesubbpartitions and parse them 1040 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1041 sub='' 1042 while True: 1043 w=opts.next() 1044 if w is None or (w==',' and not quotelevel): 1045 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') 1046 charpartition[subname]=_make_unique(subindices) 1047 sub='' 1048 if w is None: 1049 break 1050 else: 1051 if w=="'": 1052 quotelevel=not quotelevel 1053 sub+=w 1054 self.charpartitions[name]=charpartition
1055
1056 - def _get_indices(self,options,set_type=CHARSET,separator='='):
1057 """Parse the taxset/charset specification (PRIVATE). 1058 1059 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1060 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1061 """ 1062 opts=CharBuffer(options) 1063 name=self._name_n_vector(opts,separator=separator) 1064 indices=self._parse_list(opts,set_type=set_type) 1065 if indices is None: 1066 raise NexusError('Formatting error in line: %s ' % options) 1067 return name,indices
1068
1069 - def _name_n_vector(self,opts,separator='='):
1070 """Extract name and check that it's not in vector format.""" 1071 rest=opts.rest() 1072 name=opts.next_word() 1073 # we ignore * before names 1074 if name=='*': 1075 name=opts.next_word() 1076 if not name: 1077 raise NexusError('Formatting error in line: %s ' % rest) 1078 name=quotestrip(name) 1079 if opts.peek_nonwhitespace=='(': 1080 open=opts.next_nonwhitespace() 1081 qualifier=open.next_word() 1082 close=opts.next_nonwhitespace() 1083 if qualifier.lower()=='vector': 1084 raise NexusError('Unsupported VECTOR format in line %s' 1085 % (opts)) 1086 elif qualifier.lower()!='standard': 1087 raise NexusError('Unknown qualifier %s in line %s' 1088 % (qualifier, opts)) 1089 if opts.next_nonwhitespace()!=separator: 1090 raise NexusError('Formatting error in line: %s ' % rest) 1091 return name
1092
1093 - def _parse_list(self,options_buffer,set_type):
1094 """Parse a NEXUS list (PRIVATE). 1095 1096 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1097 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1098 """ 1099 plain_list=[] 1100 if options_buffer.peek_nonwhitespace(): 1101 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError 1102 while True: 1103 identifier=options_buffer.next_word() # next list element 1104 if not identifier: # end of list? 1105 break 1106 start=self._resolve(identifier,set_type=set_type) 1107 if options_buffer.peek_nonwhitespace()=='-': # followd by - 1108 end=start 1109 step=1 1110 # get hyphen and end of range 1111 hyphen=options_buffer.next_nonwhitespace() 1112 end=self._resolve(options_buffer.next_word(),set_type=set_type) 1113 if set_type==CHARSET: 1114 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ 1115 backslash=options_buffer.next_nonwhitespace() 1116 step=int(options_buffer.next_word()) # get backslash and step 1117 plain_list.extend(range(start,end+1,step)) 1118 else: 1119 if type(start)==list or type(end)==list: 1120 raise NexusError('Name if character sets not allowed in range definition: %s' 1121 % identifier) 1122 start=self.taxlabels.index(start) 1123 end=self.taxlabels.index(end) 1124 taxrange=self.taxlabels[start:end+1] 1125 plain_list.extend(taxrange) 1126 else: 1127 if type(start)==list: # start was the name of charset or taxset 1128 plain_list.extend(start) 1129 else: # start was an ordinary identifier 1130 plain_list.append(start) 1131 except NexusError: 1132 raise 1133 except: 1134 return None 1135 return plain_list
1136
1137 - def _resolve(self,identifier,set_type=None):
1138 """Translate identifier in list into character/taxon index. 1139 1140 Characters (which are referred to by their index in Nexus.py): 1141 Plain numbers are returned minus 1 (Nexus indices to python indices) 1142 Text identifiers are translated into their indices (if plain character identifiers), 1143 the first hit in charlabels is returned (charlabels don't need to be unique) 1144 or the range of indices is returned (if names of character sets). 1145 Taxa (which are referred to by their unique name in Nexus.py): 1146 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1147 Names are returned unchanged (if plain taxon identifiers), or the names in 1148 the corresponding taxon set is returned. 1149 """ 1150 identifier=quotestrip(identifier) 1151 if not set_type: 1152 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1153 if set_type==CHARSET: 1154 try: 1155 n=int(identifier) 1156 except ValueError: 1157 if self.charlabels and identifier in self.charlabels.itervalues(): 1158 for k in self.charlabels: 1159 if self.charlabels[k]==identifier: 1160 return k 1161 elif self.charsets and identifier in self.charsets: 1162 return self.charsets[identifier] 1163 else: 1164 raise NexusError('Unknown character identifier: %s' 1165 % identifier) 1166 else: 1167 if n<=self.nchar: 1168 return n-1 1169 else: 1170 raise NexusError('Illegal character identifier: %d>nchar (=%d).' 1171 % (identifier,self.nchar)) 1172 elif set_type==TAXSET: 1173 try: 1174 n=int(identifier) 1175 except ValueError: 1176 taxlabels_id=self._check_taxlabels(identifier) 1177 if taxlabels_id: 1178 return taxlabels_id 1179 elif self.taxsets and identifier in self.taxsets: 1180 return self.taxsets[identifier] 1181 else: 1182 raise NexusError('Unknown taxon identifier: %s' 1183 % identifier) 1184 else: 1185 if n>0 and n<=self.ntax: 1186 return self.taxlabels[n-1] 1187 else: 1188 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' 1189 % (identifier,self.ntax)) 1190 else: 1191 raise NexusError('Unknown set specification: %s.'% set_type)
1192
1193 - def _stateset(self, options):
1194 #Not implemented 1195 pass
1196
1197 - def _changeset(self, options):
1198 #Not implemented 1199 pass
1200
1201 - def _treeset(self, options):
1202 #Not implemented 1203 pass
1204
1205 - def _treepartition(self, options):
1206 #Not implemented 1207 pass
1208
1209 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, 1210 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1211 """Writes a nexus file for each partition in charpartition. 1212 1213 Only non-excluded characters and non-deleted taxa are included, 1214 just the data block is written. 1215 """ 1216 1217 if not matrix: 1218 matrix=self.matrix 1219 if not matrix: 1220 return 1221 if not filename: 1222 filename=self.filename 1223 if charpartition: 1224 pfilenames={} 1225 for p in charpartition: 1226 total_exclude=[]+exclude 1227 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) 1228 total_exclude=_make_unique(total_exclude) 1229 pcomment=comment+'\nPartition: '+p+'\n' 1230 dot=filename.rfind('.') 1231 if dot>0: 1232 pfilename=filename[:dot]+'_'+p+'.data' 1233 else: 1234 pfilename=filename+'_'+p 1235 pfilenames[p]=pfilename 1236 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, 1237 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, 1238 mrbayes=mrbayes) 1239 return pfilenames 1240 else: 1241 fn=self.filename+'.data' 1242 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, 1243 exclude=exclude,delete=delete,comment=comment,append_sets=False, 1244 mrbayes=mrbayes) 1245 return fn
1246
1247 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[], 1248 blocksize=None, interleave=False, interleave_by_partition=False, 1249 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False, 1250 codons_block=True):
1251 """Writes a nexus file with data and sets block to a file or handle. 1252 1253 Character sets and partitions are appended by default, and are 1254 adjusted according to excluded characters (i.e. character sets 1255 still point to the same sites (not necessarily same positions), 1256 without including the deleted characters. 1257 1258 filename - Either a filename as a string (which will be opened, 1259 written to and closed), or a handle object (which will 1260 be written to but NOT closed). 1261 interleave_by_partition - Optional name of partition (string) 1262 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1263 start of the file is omitted. 1264 1265 Returns the filename/handle used to write the data. 1266 """ 1267 if not matrix: 1268 matrix=self.matrix 1269 if not matrix: 1270 return 1271 if not filename: 1272 filename=self.filename 1273 if [t for t in delete if not self._check_taxlabels(t)]: 1274 raise NexusError('Unknown taxa: %s' 1275 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1276 if interleave_by_partition: 1277 if not interleave_by_partition in self.charpartitions: 1278 raise NexusError('Unknown partition: %r' % interleave_by_partition) 1279 else: 1280 partition=self.charpartitions[interleave_by_partition] 1281 # we need to sort the partition names by starting position before we exclude characters 1282 names=_sort_keys_by_values(partition) 1283 newpartition={} 1284 for p in partition: 1285 newpartition[p]=[c for c in partition[p] if c not in exclude] 1286 # how many taxa and how many characters are left? 1287 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1288 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) 1289 ntax_adjusted=len(undelete) 1290 nchar_adjusted=len(cropped_matrix[undelete[0]]) 1291 if not undelete or (undelete and undelete[0]==''): 1292 return 1293 1294 with File.as_handle(filename, mode='w') as fh: 1295 if not omit_NEXUS: 1296 fh.write('#NEXUS\n') 1297 if comment: 1298 fh.write('['+comment+']\n') 1299 fh.write('begin data;\n') 1300 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1301 fh.write('\tformat datatype='+self.datatype) 1302 if self.respectcase: 1303 fh.write(' respectcase') 1304 if self.missing: 1305 fh.write(' missing='+self.missing) 1306 if self.gap: 1307 fh.write(' gap='+self.gap) 1308 if self.matchchar: 1309 fh.write(' matchchar='+self.matchchar) 1310 if self.labels: 1311 fh.write(' labels='+self.labels) 1312 if self.equate: 1313 fh.write(' equate='+self.equate) 1314 if interleave or interleave_by_partition: 1315 fh.write(' interleave') 1316 fh.write(';\n') 1317 #if self.taxlabels: 1318 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1319 if self.charlabels: 1320 newcharlabels=self._adjust_charlabels(exclude=exclude) 1321 clkeys=sorted(newcharlabels) 1322 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') 1323 fh.write('matrix\n') 1324 if not blocksize: 1325 if interleave: 1326 blocksize=70 1327 else: 1328 blocksize=self.nchar 1329 # delete deleted taxa and ecxclude excluded characters... 1330 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) 1331 if interleave_by_partition: 1332 # interleave by partitions, but adjust partitions with regard to excluded characters 1333 seek=0 1334 for p in names: 1335 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) 1336 if len(newpartition[p])>0: 1337 for taxon in undelete: 1338 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1339 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1340 fh.write('\n') 1341 else: 1342 fh.write('[empty]\n\n') 1343 seek+=len(newpartition[p]) 1344 elif interleave: 1345 for seek in range(0,nchar_adjusted,blocksize): 1346 for taxon in undelete: 1347 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1348 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1349 fh.write('\n') 1350 else: 1351 for taxon in undelete: 1352 if blocksize<nchar_adjusted: 1353 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') 1354 else: 1355 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1356 taxon_seq = cropped_matrix[taxon] 1357 for seek in range(0,nchar_adjusted,blocksize): 1358 fh.write(taxon_seq[seek:seek+blocksize]+'\n') 1359 del taxon_seq 1360 fh.write(';\nend;\n') 1361 if append_sets: 1362 if codons_block: 1363 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False)) 1364 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True)) 1365 else: 1366 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) 1367 return filename
1368
1369 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1370 """Returns a sets block.""" 1371 if not self.charsets and not self.taxsets and not self.charpartitions: 1372 return '' 1373 if codons_only: 1374 setsb=['\nbegin codons'] 1375 else: 1376 setsb=['\nbegin sets'] 1377 # - now if characters have been excluded, the character sets need to be adjusted, 1378 # so that they still point to the right character positions 1379 # calculate a list of offsets: for each deleted character, the following character position 1380 # in the new file will have an additional offset of -1 1381 offset=0 1382 offlist=[] 1383 for c in range(self.nchar): 1384 if c in exclude: 1385 offset+=1 1386 offlist.append(-1) # dummy value as these character positions are excluded 1387 else: 1388 offlist.append(c-offset) 1389 # now adjust each of the character sets 1390 if not codons_only: 1391 for n,ns in self.charsets.iteritems(): 1392 cset=[offlist[c] for c in ns if c not in exclude] 1393 if cset: 1394 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 1395 for n,s in self.taxsets.iteritems(): 1396 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] 1397 if tset: 1398 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset))) 1399 for n,p in self.charpartitions.iteritems(): 1400 if not include_codons and n==CODONPOSITIONS: 1401 continue 1402 elif codons_only and n!=CODONPOSITIONS: 1403 continue 1404 # as characters have been excluded, the partitions must be adjusted 1405 # if a partition is empty, it will be omitted from the charpartition command 1406 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1407 names=_sort_keys_by_values(p) 1408 newpartition={} 1409 for sn in names: 1410 nsp=[offlist[c] for c in p[sn] if c not in exclude] 1411 if nsp: 1412 newpartition[sn]=nsp 1413 if newpartition: 1414 if include_codons and n==CODONPOSITIONS: 1415 command='codonposset' 1416 else: 1417 command='charpartition' 1418 setsb.append('%s %s = %s' % (command,safename(n), 1419 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) 1420 # now write charpartititions, much easier than charpartitions 1421 for n,p in self.taxpartitions.iteritems(): 1422 names=_sort_keys_by_values(p) 1423 newpartition={} 1424 for sn in names: 1425 nsp=[t for t in p[sn] if t not in delete] 1426 if nsp: 1427 newpartition[sn]=nsp 1428 if newpartition: 1429 setsb.append('taxpartition %s = %s' % (safename(n), 1430 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) 1431 # add 'end' and return everything 1432 setsb.append('end;\n') 1433 if len(setsb)==2: # begin and end only 1434 return '' 1435 else: 1436 return ';\n'.join(setsb)
1437
1438 - def export_fasta(self, filename=None, width=70):
1439 """Writes matrix into a fasta file.""" 1440 if not filename: 1441 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1442 filename='.'.join(self.filename.split('.')[:-1])+'.fas' 1443 else: 1444 filename=self.filename+'.fas' 1445 fh=open(filename,'w') 1446 for taxon in self.taxlabels: 1447 fh.write('>'+safename(taxon)+'\n') 1448 for i in range(0, len(str(self.matrix[taxon])), width): 1449 fh.write(str(self.matrix[taxon])[i:i+width] + '\n') 1450 fh.close() 1451 return filename
1452
1453 - def export_phylip(self, filename=None):
1454 """Writes matrix into a PHYLIP file. 1455 1456 Note that this writes a relaxed PHYLIP format file, where the names 1457 are not truncated, nor checked for invalid characters.""" 1458 if not filename: 1459 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1460 filename='.'.join(self.filename.split('.')[:-1])+'.phy' 1461 else: 1462 filename=self.filename+'.phy' 1463 fh=open(filename,'w') 1464 fh.write('%d %d\n' % (self.ntax,self.nchar)) 1465 for taxon in self.taxlabels: 1466 fh.write('%s %s\n' % (safename(taxon), str(self.matrix[taxon]))) 1467 fh.close() 1468 return filename
1469
1470 - def constant(self,matrix=None,delete=[],exclude=[]):
1471 """Return a list with all constant characters.""" 1472 if not matrix: 1473 matrix=self.matrix 1474 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1475 if not undelete: 1476 return None 1477 elif len(undelete)==1: 1478 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1479 # get the first sequence and expand all ambiguous values 1480 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 1481 x,n in enumerate(str(matrix[undelete[0]])) if x not in exclude] 1482 1483 for taxon in undelete[1:]: 1484 newconstant=[] 1485 for site in constant: 1486 #print '%d (paup=%d)' % (site[0],site[0]+1), 1487 seqsite=matrix[taxon][site[0]].upper() 1488 #print seqsite,'checked against',site[1],'\t', 1489 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 1490 # missing or same as before -> ok 1491 newconstant.append(site) 1492 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): 1493 # subset of an ambig or only missing in previous -> take subset 1494 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) 1495 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values 1496 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1])) 1497 if intersect: 1498 newconstant.append((site[0],''.join(intersect))) 1499 # print 'ok' 1500 #else: 1501 # print 'failed' 1502 #else: 1503 # print 'failed' 1504 constant=newconstant 1505 cpos=[s[0] for s in constant] 1506 return cpos
1507
1508 - def cstatus(self,site,delete=[],narrow=True):
1509 """Summarize character. 1510 1511 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1512 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1513 """ 1514 undelete=[t for t in self.taxlabels if t not in delete] 1515 if not undelete: 1516 return None 1517 cstatus=[] 1518 for t in undelete: 1519 c=self.matrix[t][site].upper() 1520 if self.options.get('gapmode')=='missing' and c==self.gap: 1521 c=self.missing 1522 if narrow and c==self.missing: 1523 if c not in cstatus: 1524 cstatus.append(c) 1525 else: 1526 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) 1527 if self.missing in cstatus and narrow and len(cstatus)>