Package Bio :: Package AlignIO :: Module EmbossIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.EmbossIO

  1  # Copyright 2008-2013 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "emboss" alignment output from EMBOSS tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  This module contains a parser for the EMBOSS pairs/simple file format, for 
 12  example from the alignret, water and needle tools. 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.Align import MultipleSeqAlignment 
 20  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 21   
 22  __docformat__ = "restructuredtext en" 
 23   
 24   
25 -class EmbossWriter(SequentialAlignmentWriter):
26 """Emboss alignment writer (WORK IN PROGRESS). 27 28 Writes a simplfied version of the EMBOSS pairs/simple file format. 29 A lot of the information their tools record in their headers is not 30 available and is omitted. 31 """ 32
33 - def write_header(self):
34 handle = self.handle 35 handle.write("########################################\n") 36 handle.write("# Program: Biopython\n") 37 try: 38 handle.write("# Report_file: %s\n" % handle.name) 39 except AttributeError: 40 pass 41 handle.write("########################################\n")
42 47
48 - def write_alignment(self, alignment):
49 """Use this to write (another) single alignment to an open file.""" 50 handle = self.handle 51 handle.write("#=======================================\n") 52 handle.write("#\n") 53 handle.write("# Aligned_sequences: %i\n" % len(alignment)) 54 for i, record in enumerate(alignment): 55 handle.write("# %i: %s\n" % (i + 1, record.id)) 56 handle.write("#\n") 57 handle.write("# Length: %i\n" % alignment.get_alignment_length()) 58 handle.write("#\n") 59 handle.write("#=======================================\n") 60 handle.write("\n") 61 # ... 62 assert False
63 64
65 -class EmbossIterator(AlignmentIterator):
66 """Emboss alignment iterator. 67 68 For reading the (pairwise) alignments from EMBOSS tools in what they 69 call the "pairs" and "simple" formats. 70 """ 71
72 - def __next__(self):
73 74 handle = self.handle 75 76 try: 77 # Header we saved from when we were parsing 78 # the previous alignment. 79 line = self._header 80 del self._header 81 except AttributeError: 82 line = handle.readline() 83 if not line: 84 raise StopIteration 85 86 while line.rstrip() != "#=======================================": 87 line = handle.readline() 88 if not line: 89 raise StopIteration 90 91 length_of_seqs = None 92 number_of_seqs = None 93 ids = [] 94 seqs = [] 95 96 while line[0] == "#": 97 # Read in the rest of this alignment header, 98 # try and discover the number of records expected 99 # and their length 100 parts = line[1:].split(":", 1) 101 key = parts[0].lower().strip() 102 if key == "aligned_sequences": 103 number_of_seqs = int(parts[1].strip()) 104 assert len(ids) == 0 105 # Should now expect the record identifiers... 106 for i in range(number_of_seqs): 107 line = handle.readline() 108 parts = line[1:].strip().split(":", 1) 109 assert i + 1 == int(parts[0].strip()) 110 ids.append(parts[1].strip()) 111 assert len(ids) == number_of_seqs 112 if key == "length": 113 length_of_seqs = int(parts[1].strip()) 114 115 # And read in another line... 116 line = handle.readline() 117 118 if number_of_seqs is None: 119 raise ValueError("Number of sequences missing!") 120 if length_of_seqs is None: 121 raise ValueError("Length of sequences missing!") 122 123 if self.records_per_alignment is not None \ 124 and self.records_per_alignment != number_of_seqs: 125 raise ValueError("Found %i records in this alignment, told to expect %i" 126 % (number_of_seqs, self.records_per_alignment)) 127 128 seqs = ["" for id in ids] 129 seq_starts = [] 130 index = 0 131 132 # Parse the seqs 133 while line: 134 if len(line) > 21: 135 id_start = line[:21].strip().split(None, 1) 136 seq_end = line[21:].strip().split(None, 1) 137 if len(id_start) == 2 and len(seq_end) == 2: 138 # identifier, seq start position, seq, seq end position 139 # (an aligned seq is broken up into multiple lines) 140 id, start = id_start 141 seq, end = seq_end 142 if start == end: 143 # Special case, either a single letter is present, 144 # or no letters at all. 145 if seq.replace("-", "") == "": 146 start = int(start) 147 end = int(end) 148 else: 149 start = int(start) - 1 150 end = int(end) 151 else: 152 assert seq.replace("-", "") != "", repr(line) 153 start = int(start) - 1 # python counting 154 end = int(end) 155 156 # The identifier is truncated... 157 assert 0 <= index and index < number_of_seqs, \ 158 "Expected index %i in range [0,%i)" \ 159 % (index, number_of_seqs) 160 assert id == ids[index] or id == ids[index][:len(id)] 161 162 if len(seq_starts) == index: 163 # Record the start 164 seq_starts.append(start) 165 166 # Check the start... 167 if start == end: 168 assert seq.replace("-", "") == "", line 169 else: 170 assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \ 171 "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \ 172 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 173 start, line) 174 175 seqs[index] += seq 176 177 # Check the end ... 178 assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \ 179 "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \ 180 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 181 seq_starts[index], end, line) 182 183 index += 1 184 if index >= number_of_seqs: 185 index = 0 186 else: 187 # just a start value, this is just alignment annotation (?) 188 # print "Skipping: " + line.rstrip() 189 pass 190 elif line.strip() == "": 191 # Just a spacer? 192 pass 193 else: 194 print(line) 195 assert False 196 197 line = handle.readline() 198 if line.rstrip() == "#---------------------------------------" \ 199 or line.rstrip() == "#=======================================": 200 # End of alignment 201 self._header = line 202 break 203 204 assert index == 0 205 206 if self.records_per_alignment is not None \ 207 and self.records_per_alignment != len(ids): 208 raise ValueError("Found %i records in this alignment, told to expect %i" 209 % (len(ids), self.records_per_alignment)) 210 211 records = [] 212 for id, seq in zip(ids, seqs): 213 if len(seq) != length_of_seqs: 214 # EMBOSS 2.9.0 is known to use spaces instead of minus signs 215 # for leading gaps, and thus fails to parse. This old version 216 # is still used as of Dec 2008 behind the EBI SOAP webservice: 217 # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl 218 raise ValueError("Error parsing alignment - sequences of " 219 "different length? You could be using an " 220 "old version of EMBOSS.") 221 records.append(SeqRecord(Seq(seq, self.alphabet), 222 id=id, description=id)) 223 return MultipleSeqAlignment(records, self.alphabet)
224