Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for the "nexus" file format. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  See also the Bio.Nexus module (which this code calls internally), 
 12  as this offers more than just accessing the alignment or its 
 13  sequences as SeqRecord objects. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.Nexus import Nexus 
 20  from Bio.Align import MultipleSeqAlignment 
 21  from .Interfaces import AlignmentWriter 
 22  from Bio import Alphabet 
 23   
 24  __docformat__ = "restructuredtext en" 
 25   
 26  # You can get a couple of example files here: 
 27  # http://www.molecularevolution.org/resources/fileformats/ 
 28   
 29   
 30  # This is a generator function! 
31 -def NexusIterator(handle, seq_count=None):
32 """Returns SeqRecord objects from a Nexus file. 33 34 Thus uses the Bio.Nexus module to do the hard work. 35 36 You are expected to call this function via Bio.SeqIO or Bio.AlignIO 37 (and not use it directly). 38 39 NOTE - We only expect ONE alignment matrix per Nexus file, 40 meaning this iterator will only yield one MultipleSeqAlignment. 41 """ 42 n = Nexus.Nexus(handle) 43 if not n.matrix: 44 # No alignment found 45 raise StopIteration 46 47 # Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 48 # The original names and the modified names are kept in these two lists: 49 assert len(n.unaltered_taxlabels) == len(n.taxlabels) 50 51 if seq_count and seq_count != len(n.unaltered_taxlabels): 52 raise ValueError("Found %i sequences, but seq_count=%i" 53 % (len(n.unaltered_taxlabels), seq_count)) 54 55 # TODO - Can we extract any annotation too? 56 records = (SeqRecord(n.matrix[new_name], id=new_name, 57 name=old_name, description="") 58 for old_name, new_name 59 in zip(n.unaltered_taxlabels, n.taxlabels)) 60 # All done 61 yield MultipleSeqAlignment(records, n.alphabet)
62 63
64 -class NexusWriter(AlignmentWriter):
65 """Nexus alignment writer. 66 67 Note that Nexus files are only expected to hold ONE alignment 68 matrix. 69 70 You are expected to call this class via the Bio.AlignIO.write() or 71 Bio.SeqIO.write() functions. 72 """
73 - def write_file(self, alignments):
74 """Use this to write an entire file containing the given alignments. 75 76 Arguments: 77 78 - alignments - A list or iterator returning MultipleSeqAlignment objects. 79 This should hold ONE and only one alignment. 80 """ 81 align_iter = iter(alignments) # Could have been a list 82 try: 83 first_alignment = next(align_iter) 84 except StopIteration: 85 first_alignment = None 86 if first_alignment is None: 87 # Nothing to write! 88 return 0 89 90 # Check there is only one alignment... 91 try: 92 second_alignment = next(align_iter) 93 except StopIteration: 94 second_alignment = None 95 if second_alignment is not None: 96 raise ValueError("We can only write one Alignment to a Nexus file.") 97 98 # Good. Actually write the single alignment, 99 self.write_alignment(first_alignment) 100 return 1 # we only support writing one alignment!
101
102 - def write_alignment(self, alignment):
103 # Creates an empty Nexus object, adds the sequences, 104 # and then gets Nexus to prepare the output. 105 if len(alignment) == 0: 106 raise ValueError("Must have at least one sequence") 107 columns = alignment.get_alignment_length() 108 if columns == 0: 109 raise ValueError("Non-empty sequences are required") 110 minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 111 + "format datatype=%s; end;" \ 112 % self._classify_alphabet_for_nexus(alignment._alphabet) 113 n = Nexus.Nexus(minimal_record) 114 n.alphabet = alignment._alphabet 115 for record in alignment: 116 n.add_sequence(record.id, str(record.seq)) 117 # For smaller alignments, don't bother to interleave. 118 # For larger alginments, interleave to avoid very long lines 119 # in the output - something MrBayes can't handle. 120 # TODO - Default to always interleaving? 121 n.write_nexus_data(self.handle, interleave=(columns > 1000))
122
123 - def _classify_alphabet_for_nexus(self, alphabet):
124 """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE). 125 126 Raises an exception if this is not possible.""" 127 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 128 a = Alphabet._get_base_alphabet(alphabet) 129 130 if not isinstance(a, Alphabet.Alphabet): 131 raise TypeError("Invalid alphabet") 132 elif isinstance(a, Alphabet.ProteinAlphabet): 133 return "protein" 134 elif isinstance(a, Alphabet.DNAAlphabet): 135 return "dna" 136 elif isinstance(a, Alphabet.RNAAlphabet): 137 return "rna" 138 else: 139 # Must be something like NucleotideAlphabet or 140 # just the generic Alphabet (default for fasta files) 141 raise ValueError("Need a DNA, RNA or Protein alphabet")
142 143 if __name__ == "__main__": 144 from Bio._py3k import StringIO 145 print("Quick self test") 146 print("") 147 print("Repeated names without a TAXA block") 148 handle = StringIO("""#NEXUS 149 [TITLE: NoName] 150 151 begin data; 152 dimensions ntax=4 nchar=50; 153 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 154 155 matrix 156 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 157 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 158 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 159 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 160 ; 161 end; 162 """) 163 for a in NexusIterator(handle): 164 print(a) 165 for r in a: 166 print("%r %s %s" % (r.seq, r.name, r.id)) 167 print("Done") 168 169 print("") 170 print("Repeated names with a TAXA block") 171 handle = StringIO("""#NEXUS 172 [TITLE: NoName] 173 174 begin taxa 175 CYS1_DICDI 176 ALEU_HORVU 177 CATH_HUMAN 178 CYS1_DICDI; 179 end; 180 181 begin data; 182 dimensions ntax=4 nchar=50; 183 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 184 185 matrix 186 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 187 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 188 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 189 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 190 ; 191 end; 192 """) 193 for a in NexusIterator(handle): 194 print(a) 195 for r in a: 196 print("%r %s %s" % (r.seq, r.name, r.id)) 197 print("Done") 198 print("") 199 print("Reading an empty file") 200 assert 0 == len(list(NexusIterator(StringIO()))) 201 print("Done") 202 print("") 203 print("Writing...") 204 205 handle = StringIO() 206 NexusWriter(handle).write_file([a]) 207 handle.seek(0) 208 print(handle.read()) 209 210 handle = StringIO() 211 try: 212 NexusWriter(handle).write_file([a, a]) 213 assert False, "Should have rejected more than one alignment!" 214 except ValueError: 215 pass 216