Package Bio :: Package AlignIO :: Module EmbossIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.EmbossIO

  1  # Copyright 2008-2013 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "emboss" alignment output from EMBOSS tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  This module contains a parser for the EMBOSS pairs/simple file format, for 
 12  example from the alignret, water and needle tools. 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.Align import MultipleSeqAlignment 
 20  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 21   
 22  __docformat__ = "restructuredtext en" 
 23   
 24   
25 -class EmbossWriter(SequentialAlignmentWriter):
26 """Emboss alignment writer (WORK IN PROGRESS). 27 28 Writes a simplfied version of the EMBOSS pairs/simple file format. 29 A lot of the information their tools record in their headers is not 30 available and is omitted. 31 """ 32
33 - def write_header(self):
34 handle = self.handle 35 handle.write("########################################\n") 36 handle.write("# Program: Biopython\n") 37 try: 38 handle.write("# Report_file: %s\n" % handle.name) 39 except AttributeError: 40 pass 41 handle.write("########################################\n")
42 47
48 - def write_alignment(self, alignment):
49 """Use this to write (another) single alignment to an open file.""" 50 handle = self.handle 51 handle.write("#=======================================\n") 52 handle.write("#\n") 53 handle.write("# Aligned_sequences: %i\n" % len(alignment)) 54 for i, record in enumerate(alignment): 55 handle.write("# %i: %s\n" % (i + 1, record.id)) 56 handle.write("#\n") 57 handle.write("# Length: %i\n" % alignment.get_alignment_length()) 58 handle.write("#\n") 59 handle.write("#=======================================\n") 60 handle.write("\n") 61 # ... 62 assert False
63 64
65 -class EmbossIterator(AlignmentIterator):
66 """Emboss alignment iterator. 67 68 For reading the (pairwise) alignments from EMBOSS tools in what they 69 call the "pairs" and "simple" formats. 70 """ 71 72 _header = None # for caching lines between __next__ calls 73
74 - def __next__(self):
75 76 handle = self.handle 77 78 if self._header is None: 79 line = handle.readline() 80 else: 81 # Header we saved from when we were parsing 82 # the previous alignment. 83 line = self._header 84 self._header = None 85 86 if not line: 87 raise StopIteration 88 89 while line.rstrip() != "#=======================================": 90 line = handle.readline() 91 if not line: 92 raise StopIteration 93 94 length_of_seqs = None 95 number_of_seqs = None 96 ids = [] 97 seqs = [] 98 99 while line[0] == "#": 100 # Read in the rest of this alignment header, 101 # try and discover the number of records expected 102 # and their length 103 parts = line[1:].split(":", 1) 104 key = parts[0].lower().strip() 105 if key == "aligned_sequences": 106 number_of_seqs = int(parts[1].strip()) 107 assert len(ids) == 0 108 # Should now expect the record identifiers... 109 for i in range(number_of_seqs): 110 line = handle.readline() 111 parts = line[1:].strip().split(":", 1) 112 assert i + 1 == int(parts[0].strip()) 113 ids.append(parts[1].strip()) 114 assert len(ids) == number_of_seqs 115 if key == "length": 116 length_of_seqs = int(parts[1].strip()) 117 118 # And read in another line... 119 line = handle.readline() 120 121 if number_of_seqs is None: 122 raise ValueError("Number of sequences missing!") 123 if length_of_seqs is None: 124 raise ValueError("Length of sequences missing!") 125 126 if self.records_per_alignment is not None \ 127 and self.records_per_alignment != number_of_seqs: 128 raise ValueError("Found %i records in this alignment, told to expect %i" 129 % (number_of_seqs, self.records_per_alignment)) 130 131 seqs = ["" for id in ids] 132 seq_starts = [] 133 index = 0 134 135 # Parse the seqs 136 while line: 137 if len(line) > 21: 138 id_start = line[:21].strip().split(None, 1) 139 seq_end = line[21:].strip().split(None, 1) 140 if len(id_start) == 2 and len(seq_end) == 2: 141 # identifier, seq start position, seq, seq end position 142 # (an aligned seq is broken up into multiple lines) 143 id, start = id_start 144 seq, end = seq_end 145 if start == end: 146 # Special case, either a single letter is present, 147 # or no letters at all. 148 if seq.replace("-", "") == "": 149 start = int(start) 150 end = int(end) 151 else: 152 start = int(start) - 1 153 end = int(end) 154 else: 155 assert seq.replace("-", "") != "", repr(line) 156 start = int(start) - 1 # python counting 157 end = int(end) 158 159 # The identifier is truncated... 160 assert 0 <= index and index < number_of_seqs, \ 161 "Expected index %i in range [0,%i)" \ 162 % (index, number_of_seqs) 163 assert id == ids[index] or id == ids[index][:len(id)] 164 165 if len(seq_starts) == index: 166 # Record the start 167 seq_starts.append(start) 168 169 # Check the start... 170 if start == end: 171 assert seq.replace("-", "") == "", line 172 else: 173 assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \ 174 "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \ 175 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 176 start, line) 177 178 seqs[index] += seq 179 180 # Check the end ... 181 assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \ 182 "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \ 183 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 184 seq_starts[index], end, line) 185 186 index += 1 187 if index >= number_of_seqs: 188 index = 0 189 else: 190 # just a start value, this is just alignment annotation (?) 191 # print "Skipping: " + line.rstrip() 192 pass 193 elif line.strip() == "": 194 # Just a spacer? 195 pass 196 else: 197 print(line) 198 assert False 199 200 line = handle.readline() 201 if line.rstrip() == "#---------------------------------------" \ 202 or line.rstrip() == "#=======================================": 203 # End of alignment 204 self._header = line 205 break 206 207 assert index == 0 208 209 if self.records_per_alignment is not None \ 210 and self.records_per_alignment != len(ids): 211 raise ValueError("Found %i records in this alignment, told to expect %i" 212 % (len(ids), self.records_per_alignment)) 213 214 records = [] 215 for id, seq in zip(ids, seqs): 216 if len(seq) != length_of_seqs: 217 # EMBOSS 2.9.0 is known to use spaces instead of minus signs 218 # for leading gaps, and thus fails to parse. This old version 219 # is still used as of Dec 2008 behind the EBI SOAP webservice: 220 # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl 221 raise ValueError("Error parsing alignment - sequences of " 222 "different length? You could be using an " 223 "old version of EMBOSS.") 224 records.append(SeqRecord(Seq(seq, self.alphabet), 225 id=id, description=id)) 226 return MultipleSeqAlignment(records, self.alphabet)
227