Package Bio :: Package SeqIO :: Module _convert
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._convert

  1  # Copyright 2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Optimised sequence conversion code (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.convert(...) function which is the 
  9  public interface for this. 
 10   
 11  The idea here is that while doing this will work:: 
 12   
 13      from Bio import SeqIO 
 14      records = SeqIO.parse(in_handle, in_format) 
 15      count = SeqIO.write(records, out_handle, out_format) 
 16   
 17  it is shorter to write:: 
 18   
 19      from Bio import SeqIO 
 20      count = SeqIO.convert(in_handle, in_format, out_handle, out_format) 
 21   
 22  Also, the convert function can take a number of special case optimisations. This 
 23  means that using Bio.SeqIO.convert() may be faster, as well as more convenient. 
 24  All these file format specific optimisations are handled by this (private) module. 
 25  """ 
 26   
 27  from Bio import SeqIO 
 28  # NOTE - Lots of lazy imports further on... 
 29   
 30  __docformat__ = "restructuredtext en" 
 31   
 32   
33 -def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
34 """Fast GenBank to FASTA (PRIVATE).""" 35 # We don't need to parse the features... 36 from Bio.GenBank.Scanner import GenBankScanner 37 records = GenBankScanner().parse_records(in_handle, do_features=False) 38 # For FASTA output we can ignore the alphabet too 39 return SeqIO.write(records, out_handle, "fasta")
40 41
42 -def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
43 """Fast EMBL to FASTA (PRIVATE).""" 44 # We don't need to parse the features... 45 from Bio.GenBank.Scanner import EmblScanner 46 records = EmblScanner().parse_records(in_handle, do_features=False) 47 # For FASTA output we can ignore the alphabet too 48 return SeqIO.write(records, out_handle, "fasta")
49 50
51 -def _fastq_generic(in_handle, out_handle, mapping):
52 """FASTQ helper function where can't have data loss by truncation (PRIVATE).""" 53 from Bio.SeqIO.QualityIO import FastqGeneralIterator 54 # For real speed, don't even make SeqRecord and Seq objects! 55 count = 0 56 null = chr(0) 57 for title, seq, old_qual in FastqGeneralIterator(in_handle): 58 count += 1 59 # map the qual... 60 qual = old_qual.translate(mapping) 61 if null in qual: 62 raise ValueError("Invalid character in quality string") 63 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 64 return count
65 66
67 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
68 """FASTQ helper function where there could be data loss by truncation (PRIVATE).""" 69 from Bio.SeqIO.QualityIO import FastqGeneralIterator 70 # For real speed, don't even make SeqRecord and Seq objects! 71 count = 0 72 null = chr(0) 73 for title, seq, old_qual in FastqGeneralIterator(in_handle): 74 count += 1 75 # map the qual... 76 qual = old_qual.translate(mapping) 77 if null in qual: 78 raise ValueError("Invalid character in quality string") 79 if truncate_char in qual: 80 qual = qual.replace(truncate_char, chr(126)) 81 import warnings 82 warnings.warn(truncate_msg) 83 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 84 return count
85 86
87 -def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
88 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE). 89 90 Useful for removing line wrapping and the redundant second identifier 91 on the plus lines. Will check also check the quality string is valid. 92 93 Avoids creating SeqRecord and Seq objects in order to speed up this 94 conversion. 95 """ 96 # Map unexpected chars to null 97 mapping = "".join([chr(0) for ascii in range(0, 33)] 98 + [chr(ascii) for ascii in range(33, 127)] 99 + [chr(0) for ascii in range(127, 256)]) 100 assert len(mapping) == 256 101 return _fastq_generic(in_handle, out_handle, mapping)
102 103
104 -def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
105 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE). 106 107 Useful for removing line wrapping and the redundant second identifier 108 on the plus lines. Will check also check the quality string is valid. 109 Avoids creating SeqRecord and Seq objects in order to speed up this 110 conversion. 111 """ 112 # Map unexpected chars to null 113 mapping = "".join([chr(0) for ascii in range(0, 59)] 114 + [chr(ascii) for ascii in range(59, 127)] 115 + [chr(0) for ascii in range(127, 256)]) 116 assert len(mapping) == 256 117 return _fastq_generic(in_handle, out_handle, mapping)
118 119
120 -def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
121 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 122 123 Useful for removing line wrapping and the redundant second identifier 124 on the plus lines. Will check also check the quality string is valid. 125 Avoids creating SeqRecord and Seq objects in order to speed up this 126 conversion. 127 """ 128 # Map unexpected chars to null 129 mapping = "".join([chr(0) for ascii in range(0, 64)] 130 + [chr(ascii) for ascii in range(64, 127)] 131 + [chr(0) for ascii in range(127, 256)]) 132 assert len(mapping) == 256 133 return _fastq_generic(in_handle, out_handle, mapping)
134 135
136 -def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
137 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE). 138 139 Avoids creating SeqRecord and Seq objects in order to speed up this 140 conversion. 141 """ 142 # Map unexpected chars to null 143 mapping = "".join([chr(0) for ascii in range(0, 64)] 144 + [chr(33 + q) for q in range(0, 62 + 1)] 145 + [chr(0) for ascii in range(127, 256)]) 146 assert len(mapping) == 256 147 return _fastq_generic(in_handle, out_handle, mapping)
148 149
150 -def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
151 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 152 153 Avoids creating SeqRecord and Seq objects in order to speed up this 154 conversion. Will issue a warning if the scores had to be truncated at 62 155 (maximum possible in the Illumina 1.3+ FASTQ format) 156 """ 157 # Map unexpected chars to null 158 trunc_char = chr(1) 159 mapping = "".join([chr(0) for ascii in range(0, 33)] 160 + [chr(64 + q) for q in range(0, 62 + 1)] 161 + [trunc_char for ascii in range(96, 127)] 162 + [chr(0) for ascii in range(127, 256)]) 163 assert len(mapping) == 256 164 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 165 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
166 167
168 -def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
169 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE). 170 171 Avoids creating SeqRecord and Seq objects in order to speed up this 172 conversion. 173 """ 174 # Map unexpected chars to null 175 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 176 mapping = "".join([chr(0) for ascii in range(0, 59)] 177 + [chr(33 + int(round(phred_quality_from_solexa(q)))) 178 for q in range(-5, 62 + 1)] 179 + [chr(0) for ascii in range(127, 256)]) 180 assert len(mapping) == 256 181 return _fastq_generic(in_handle, out_handle, mapping)
182 183
184 -def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
185 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE). 186 187 Avoids creating SeqRecord and Seq objects in order to speed up this 188 conversion. Will issue a warning if the scores had to be truncated at 62 189 (maximum possible in the Solexa FASTQ format) 190 """ 191 # Map unexpected chars to null 192 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 193 trunc_char = chr(1) 194 mapping = "".join([chr(0) for ascii in range(0, 33)] 195 + [chr(64 + int(round(solexa_quality_from_phred(q)))) 196 for q in range(0, 62 + 1)] 197 + [trunc_char for ascii in range(96, 127)] 198 + [chr(0) for ascii in range(127, 256)]) 199 assert len(mapping) == 256 200 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 201 "Data loss - max Solexa quality 62 in Solexa FASTQ")
202 203
204 -def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
205 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 206 207 Avoids creating SeqRecord and Seq objects in order to speed up this 208 conversion. 209 """ 210 # Map unexpected chars to null 211 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 212 mapping = "".join([chr(0) for ascii in range(0, 59)] 213 + [chr(64 + int(round(phred_quality_from_solexa(q)))) 214 for q in range(-5, 62 + 1)] 215 + [chr(0) for ascii in range(127, 256)]) 216 assert len(mapping) == 256 217 return _fastq_generic(in_handle, out_handle, mapping)
218 219
220 -def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
221 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE). 222 223 Avoids creating SeqRecord and Seq objects in order to speed up this 224 conversion. 225 """ 226 # Map unexpected chars to null 227 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 228 trunc_char = chr(1) 229 mapping = "".join([chr(0) for ascii in range(0, 64)] 230 + [chr(64 + int(round(solexa_quality_from_phred(q)))) 231 for q in range(0, 62 + 1)] 232 + [chr(0) for ascii in range(127, 256)]) 233 assert len(mapping) == 256 234 return _fastq_generic(in_handle, out_handle, mapping)
235 236
237 -def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
238 """Fast FASTQ to FASTA conversion (PRIVATE). 239 240 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 241 Seq objects in order to speed up this conversion. 242 243 NOTE - This does NOT check the characters used in the FASTQ quality string 244 are valid! 245 """ 246 from Bio.SeqIO.QualityIO import FastqGeneralIterator 247 # For real speed, don't even make SeqRecord and Seq objects! 248 count = 0 249 for title, seq, qual in FastqGeneralIterator(in_handle): 250 count += 1 251 out_handle.write(">%s\n" % title) 252 # Do line wrapping 253 for i in range(0, len(seq), 60): 254 out_handle.write(seq[i:i + 60] + "\n") 255 return count
256 257
258 -def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
259 """Fast FASTQ to simple tabbed conversion (PRIVATE). 260 261 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 262 Seq objects in order to speed up this conversion. 263 264 NOTE - This does NOT check the characters used in the FASTQ quality string 265 are valid! 266 """ 267 from Bio.SeqIO.QualityIO import FastqGeneralIterator 268 # For real speed, don't even make SeqRecord and Seq objects! 269 count = 0 270 for title, seq, qual in FastqGeneralIterator(in_handle): 271 count += 1 272 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq)) 273 return count
274 275
276 -def _fastq_convert_qual(in_handle, out_handle, mapping):
277 """FASTQ helper function for QUAL output (PRIVATE). 278 279 Mapping should be a dictionary mapping expected ASCII characters from the 280 FASTQ quality string to PHRED quality scores (as strings). 281 """ 282 from Bio.SeqIO.QualityIO import FastqGeneralIterator 283 # For real speed, don't even make SeqRecord and Seq objects! 284 count = 0 285 for title, seq, qual in FastqGeneralIterator(in_handle): 286 count += 1 287 out_handle.write(">%s\n" % title) 288 # map the qual... note even with Sanger encoding max 2 digits 289 try: 290 qualities_strs = [mapping[ascii] for ascii in qual] 291 except KeyError: 292 raise ValueError("Invalid character in quality string") 293 data = " ".join(qualities_strs) 294 while len(data) > 60: 295 # Know quality scores are either 1 or 2 digits, so there 296 # must be a space in any three consecutive characters. 297 if data[60] == " ": 298 out_handle.write(data[:60] + "\n") 299 data = data[61:] 300 elif data[59] == " ": 301 out_handle.write(data[:59] + "\n") 302 data = data[60:] 303 else: 304 assert data[58] == " ", "Internal logic failure in wrapping" 305 out_handle.write(data[:58] + "\n") 306 data = data[59:] 307 out_handle.write(data + "\n") 308 return count
309 310
311 -def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
312 """Fast Sanger FASTQ to QUAL conversion (PRIVATE).""" 313 mapping = dict((chr(q + 33), str(q)) for q in range(0, 93 + 1)) 314 return _fastq_convert_qual(in_handle, out_handle, mapping)
315 316
317 -def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
318 """Fast Solexa FASTQ to QUAL conversion (PRIVATE).""" 319 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 320 mapping = dict((chr(q + 64), str(int(round(phred_quality_from_solexa(q))))) 321 for q in range(-5, 62 + 1)) 322 return _fastq_convert_qual(in_handle, out_handle, mapping)
323 324
325 -def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
326 """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE).""" 327 mapping = dict((chr(q + 64), str(q)) for q in range(0, 62 + 1)) 328 return _fastq_convert_qual(in_handle, out_handle, mapping)
329 330 331 # TODO? - Handling aliases explicitly would let us shorten this list: 332 _converter = { 333 ("genbank", "fasta"): _genbank_convert_fasta, 334 ("gb", "fasta"): _genbank_convert_fasta, 335 ("embl", "fasta"): _embl_convert_fasta, 336 ("fastq", "fasta"): _fastq_convert_fasta, 337 ("fastq-sanger", "fasta"): _fastq_convert_fasta, 338 ("fastq-solexa", "fasta"): _fastq_convert_fasta, 339 ("fastq-illumina", "fasta"): _fastq_convert_fasta, 340 ("fastq", "tab"): _fastq_convert_tab, 341 ("fastq-sanger", "tab"): _fastq_convert_tab, 342 ("fastq-solexa", "tab"): _fastq_convert_tab, 343 ("fastq-illumina", "tab"): _fastq_convert_tab, 344 ("fastq", "fastq"): _fastq_sanger_convert_fastq_sanger, 345 ("fastq-sanger", "fastq"): _fastq_sanger_convert_fastq_sanger, 346 ("fastq-solexa", "fastq"): _fastq_solexa_convert_fastq_sanger, 347 ("fastq-illumina", "fastq"): _fastq_illumina_convert_fastq_sanger, 348 ("fastq", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 349 ("fastq-sanger", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 350 ("fastq-solexa", "fastq-sanger"): _fastq_solexa_convert_fastq_sanger, 351 ("fastq-illumina", "fastq-sanger"): _fastq_illumina_convert_fastq_sanger, 352 ("fastq", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 353 ("fastq-sanger", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 354 ("fastq-solexa", "fastq-solexa"): _fastq_solexa_convert_fastq_solexa, 355 ("fastq-illumina", "fastq-solexa"): _fastq_illumina_convert_fastq_solexa, 356 ("fastq", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 357 ("fastq-sanger", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 358 ("fastq-solexa", "fastq-illumina"): _fastq_solexa_convert_fastq_illumina, 359 ("fastq-illumina", "fastq-illumina"): _fastq_illumina_convert_fastq_illumina, 360 ("fastq", "qual"): _fastq_sanger_convert_qual, 361 ("fastq-sanger", "qual"): _fastq_sanger_convert_qual, 362 ("fastq-solexa", "qual"): _fastq_solexa_convert_qual, 363 ("fastq-illumina", "qual"): _fastq_illumina_convert_qual, 364 } 365 366
367 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
368 """SeqIO conversion function (PRIVATE).""" 369 try: 370 f = _converter[(in_format, out_format)] 371 except KeyError: 372 f = None 373 if f: 374 return f(in_handle, out_handle, alphabet) 375 else: 376 records = SeqIO.parse(in_handle, in_format, alphabet) 377 return SeqIO.write(records, out_handle, out_format)
378