Package Bio :: Package SeqIO :: Module _convert
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._convert

  1  # Copyright 2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Optimised sequence conversion code (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.convert(...) function which is the 
  9  public interface for this. 
 10   
 11  The idea here is rather while doing this will work: 
 12   
 13  from Bio import SeqIO 
 14  records = SeqIO.parse(in_handle, in_format) 
 15  count = SeqIO.write(records, out_handle, out_format) 
 16   
 17  it is shorter to write: 
 18   
 19  from Bio import SeqIO 
 20  count = SeqIO.convert(in_handle, in_format, out_handle, out_format) 
 21   
 22  Also, the convert function can take a number of special case optimisations. This 
 23  means that using Bio.SeqIO.convert() may be faster, as well as more convenient. 
 24  All these file format specific optimisations are handled by this (private) module. 
 25  """ 
 26   
 27  from Bio import SeqIO 
 28  #NOTE - Lots of lazy imports further on... 
 29   
 30   
31 -def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
32 """Fast GenBank to FASTA (PRIVATE).""" 33 #We don't need to parse the features... 34 from Bio.GenBank.Scanner import GenBankScanner 35 records = GenBankScanner().parse_records(in_handle, do_features=False) 36 #For FASTA output we can ignore the alphabet too 37 return SeqIO.write(records, out_handle, "fasta")
38 39
40 -def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
41 """Fast EMBL to FASTA (PRIVATE).""" 42 #We don't need to parse the features... 43 from Bio.GenBank.Scanner import EmblScanner 44 records = EmblScanner().parse_records(in_handle, do_features=False) 45 #For FASTA output we can ignore the alphabet too 46 return SeqIO.write(records, out_handle, "fasta")
47 48
49 -def _fastq_generic(in_handle, out_handle, mapping):
50 """FASTQ helper function where can't have data loss by truncation (PRIVATE).""" 51 from Bio.SeqIO.QualityIO import FastqGeneralIterator 52 #For real speed, don't even make SeqRecord and Seq objects! 53 count = 0 54 null = chr(0) 55 for title, seq, old_qual in FastqGeneralIterator(in_handle): 56 count += 1 57 #map the qual... 58 qual = old_qual.translate(mapping) 59 if null in qual: 60 raise ValueError("Invalid character in quality string") 61 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 62 return count
63 64
65 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
66 """FASTQ helper function where there could be data loss by truncation (PRIVATE).""" 67 from Bio.SeqIO.QualityIO import FastqGeneralIterator 68 #For real speed, don't even make SeqRecord and Seq objects! 69 count = 0 70 null = chr(0) 71 for title, seq, old_qual in FastqGeneralIterator(in_handle): 72 count += 1 73 #map the qual... 74 qual = old_qual.translate(mapping) 75 if null in qual: 76 raise ValueError("Invalid character in quality string") 77 if truncate_char in qual: 78 qual = qual.replace(truncate_char, chr(126)) 79 import warnings 80 warnings.warn(truncate_msg) 81 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 82 return count
83 84
85 -def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
86 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE). 87 88 Useful for removing line wrapping and the redundant second identifier 89 on the plus lines. Will check also check the quality string is valid. 90 91 Avoids creating SeqRecord and Seq objects in order to speed up this 92 conversion. 93 """ 94 #Map unexpected chars to null 95 mapping = "".join([chr(0) for ascii in range(0, 33)] 96 + [chr(ascii) for ascii in range(33, 127)] 97 + [chr(0) for ascii in range(127, 256)]) 98 assert len(mapping) == 256 99 return _fastq_generic(in_handle, out_handle, mapping)
100 101
102 -def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
103 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE). 104 105 Useful for removing line wrapping and the redundant second identifier 106 on the plus lines. Will check also check the quality string is valid. 107 Avoids creating SeqRecord and Seq objects in order to speed up this 108 conversion. 109 """ 110 #Map unexpected chars to null 111 mapping = "".join([chr(0) for ascii in range(0, 59)] 112 + [chr(ascii) for ascii in range(59, 127)] 113 + [chr(0) for ascii in range(127, 256)]) 114 assert len(mapping) == 256 115 return _fastq_generic(in_handle, out_handle, mapping)
116 117
118 -def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
119 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 120 121 Useful for removing line wrapping and the redundant second identifier 122 on the plus lines. Will check also check the quality string is valid. 123 Avoids creating SeqRecord and Seq objects in order to speed up this 124 conversion. 125 """ 126 #Map unexpected chars to null 127 mapping = "".join([chr(0) for ascii in range(0, 64)] 128 + [chr(ascii) for ascii in range(64, 127)] 129 + [chr(0) for ascii in range(127, 256)]) 130 assert len(mapping) == 256 131 return _fastq_generic(in_handle, out_handle, mapping)
132 133
134 -def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
135 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE). 136 137 Avoids creating SeqRecord and Seq objects in order to speed up this 138 conversion. 139 """ 140 #Map unexpected chars to null 141 mapping = "".join([chr(0) for ascii in range(0, 64)] 142 + [chr(33 + q) for q in range(0, 62 + 1)] 143 + [chr(0) for ascii in range(127, 256)]) 144 assert len(mapping) == 256 145 return _fastq_generic(in_handle, out_handle, mapping)
146 147
148 -def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
149 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 150 151 Avoids creating SeqRecord and Seq objects in order to speed up this 152 conversion. Will issue a warning if the scores had to be truncated at 62 153 (maximum possible in the Illumina 1.3+ FASTQ format) 154 """ 155 #Map unexpected chars to null 156 trunc_char = chr(1) 157 mapping = "".join([chr(0) for ascii in range(0, 33)] 158 + [chr(64 + q) for q in range(0, 62 + 1)] 159 + [trunc_char for ascii in range(96, 127)] 160 + [chr(0) for ascii in range(127, 256)]) 161 assert len(mapping) == 256 162 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 163 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
164 165
166 -def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
167 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE). 168 169 Avoids creating SeqRecord and Seq objects in order to speed up this 170 conversion. 171 """ 172 #Map unexpected chars to null 173 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 174 mapping = "".join([chr(0) for ascii in range(0, 59)] 175 + [chr(33 + int(round(phred_quality_from_solexa(q)))) 176 for q in range(-5, 62 + 1)] 177 + [chr(0) for ascii in range(127, 256)]) 178 assert len(mapping) == 256 179 return _fastq_generic(in_handle, out_handle, mapping)
180 181
182 -def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
183 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE). 184 185 Avoids creating SeqRecord and Seq objects in order to speed up this 186 conversion. Will issue a warning if the scores had to be truncated at 62 187 (maximum possible in the Solexa FASTQ format) 188 """ 189 #Map unexpected chars to null 190 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 191 trunc_char = chr(1) 192 mapping = "".join([chr(0) for ascii in range(0, 33)] 193 + [chr(64 + int(round(solexa_quality_from_phred(q)))) 194 for q in range(0, 62 + 1)] 195 + [trunc_char for ascii in range(96, 127)] 196 + [chr(0) for ascii in range(127, 256)]) 197 assert len(mapping) == 256 198 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 199 "Data loss - max Solexa quality 62 in Solexa FASTQ")
200 201
202 -def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
203 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 204 205 Avoids creating SeqRecord and Seq objects in order to speed up this 206 conversion. 207 """ 208 #Map unexpected chars to null 209 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 210 mapping = "".join([chr(0) for ascii in range(0, 59)] 211 + [chr(64 + int(round(phred_quality_from_solexa(q)))) 212 for q in range(-5, 62 + 1)] 213 + [chr(0) for ascii in range(127, 256)]) 214 assert len(mapping) == 256 215 return _fastq_generic(in_handle, out_handle, mapping)
216 217
218 -def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
219 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE). 220 221 Avoids creating SeqRecord and Seq objects in order to speed up this 222 conversion. 223 """ 224 #Map unexpected chars to null 225 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 226 trunc_char = chr(1) 227 mapping = "".join([chr(0) for ascii in range(0, 64)] 228 + [chr(64 + int(round(solexa_quality_from_phred(q)))) 229 for q in range(0, 62 + 1)] 230 + [chr(0) for ascii in range(127, 256)]) 231 assert len(mapping) == 256 232 return _fastq_generic(in_handle, out_handle, mapping)
233 234
235 -def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
236 """Fast FASTQ to FASTA conversion (PRIVATE). 237 238 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 239 Seq objects in order to speed up this conversion. 240 241 NOTE - This does NOT check the characters used in the FASTQ quality string 242 are valid! 243 """ 244 from Bio.SeqIO.QualityIO import FastqGeneralIterator 245 #For real speed, don't even make SeqRecord and Seq objects! 246 count = 0 247 for title, seq, qual in FastqGeneralIterator(in_handle): 248 count += 1 249 out_handle.write(">%s\n" % title) 250 #Do line wrapping 251 for i in range(0, len(seq), 60): 252 out_handle.write(seq[i:i + 60] + "\n") 253 return count
254 255
256 -def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
257 """Fast FASTQ to simple tabbed conversion (PRIVATE). 258 259 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 260 Seq objects in order to speed up this conversion. 261 262 NOTE - This does NOT check the characters used in the FASTQ quality string 263 are valid! 264 """ 265 from Bio.SeqIO.QualityIO import FastqGeneralIterator 266 #For real speed, don't even make SeqRecord and Seq objects! 267 count = 0 268 for title, seq, qual in FastqGeneralIterator(in_handle): 269 count += 1 270 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq)) 271 return count
272 273
274 -def _fastq_convert_qual(in_handle, out_handle, mapping):
275 """FASTQ helper function for QUAL output (PRIVATE). 276 277 Mapping should be a dictionary mapping expected ASCII characters from the 278 FASTQ quality string to PHRED quality scores (as strings). 279 """ 280 from Bio.SeqIO.QualityIO import FastqGeneralIterator 281 #For real speed, don't even make SeqRecord and Seq objects! 282 count = 0 283 for title, seq, qual in FastqGeneralIterator(in_handle): 284 count += 1 285 out_handle.write(">%s\n" % title) 286 #map the qual... note even with Sanger encoding max 2 digits 287 try: 288 qualities_strs = [mapping[ascii] for ascii in qual] 289 except KeyError: 290 raise ValueError("Invalid character in quality string") 291 data = " ".join(qualities_strs) 292 while len(data) > 60: 293 #Know quality scores are either 1 or 2 digits, so there 294 #must be a space in any three consecutive characters. 295 if data[60] == " ": 296 out_handle.write(data[:60] + "\n") 297 data = data[61:] 298 elif data[59] == " ": 299 out_handle.write(data[:59] + "\n") 300 data = data[60:] 301 else: 302 assert data[58] == " ", "Internal logic failure in wrapping" 303 out_handle.write(data[:58] + "\n") 304 data = data[59:] 305 out_handle.write(data + "\n") 306 return count
307 308
309 -def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
310 """Fast Sanger FASTQ to QUAL conversion (PRIVATE).""" 311 mapping = dict((chr(q + 33), str(q)) for q in range(0, 93 + 1)) 312 return _fastq_convert_qual(in_handle, out_handle, mapping)
313 314
315 -def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
316 """Fast Solexa FASTQ to QUAL conversion (PRIVATE).""" 317 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 318 mapping = dict((chr(q + 64), str(int(round(phred_quality_from_solexa(q))))) 319 for q in range(-5, 62 + 1)) 320 return _fastq_convert_qual(in_handle, out_handle, mapping)
321 322
323 -def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
324 """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE).""" 325 mapping = dict((chr(q + 64), str(q)) for q in range(0, 62 + 1)) 326 return _fastq_convert_qual(in_handle, out_handle, mapping)
327 328 329 #TODO? - Handling aliases explicitly would let us shorten this list: 330 _converter = { 331 ("genbank", "fasta"): _genbank_convert_fasta, 332 ("gb", "fasta"): _genbank_convert_fasta, 333 ("embl", "fasta"): _embl_convert_fasta, 334 ("fastq", "fasta"): _fastq_convert_fasta, 335 ("fastq-sanger", "fasta"): _fastq_convert_fasta, 336 ("fastq-solexa", "fasta"): _fastq_convert_fasta, 337 ("fastq-illumina", "fasta"): _fastq_convert_fasta, 338 ("fastq", "tab"): _fastq_convert_tab, 339 ("fastq-sanger", "tab"): _fastq_convert_tab, 340 ("fastq-solexa", "tab"): _fastq_convert_tab, 341 ("fastq-illumina", "tab"): _fastq_convert_tab, 342 ("fastq", "fastq"): _fastq_sanger_convert_fastq_sanger, 343 ("fastq-sanger", "fastq"): _fastq_sanger_convert_fastq_sanger, 344 ("fastq-solexa", "fastq"): _fastq_solexa_convert_fastq_sanger, 345 ("fastq-illumina", "fastq"): _fastq_illumina_convert_fastq_sanger, 346 ("fastq", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 347 ("fastq-sanger", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 348 ("fastq-solexa", "fastq-sanger"): _fastq_solexa_convert_fastq_sanger, 349 ("fastq-illumina", "fastq-sanger"): _fastq_illumina_convert_fastq_sanger, 350 ("fastq", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 351 ("fastq-sanger", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 352 ("fastq-solexa", "fastq-solexa"): _fastq_solexa_convert_fastq_solexa, 353 ("fastq-illumina", "fastq-solexa"): _fastq_illumina_convert_fastq_solexa, 354 ("fastq", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 355 ("fastq-sanger", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 356 ("fastq-solexa", "fastq-illumina"): _fastq_solexa_convert_fastq_illumina, 357 ("fastq-illumina", "fastq-illumina"): _fastq_illumina_convert_fastq_illumina, 358 ("fastq", "qual"): _fastq_sanger_convert_qual, 359 ("fastq-sanger", "qual"): _fastq_sanger_convert_qual, 360 ("fastq-solexa", "qual"): _fastq_solexa_convert_qual, 361 ("fastq-illumina", "qual"): _fastq_illumina_convert_qual, 362 } 363 364
365 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
366 """SeqIO conversion function (PRIVATE).""" 367 try: 368 f = _converter[(in_format, out_format)] 369 except KeyError: 370 f = None 371 if f: 372 return f(in_handle, out_handle, alphabet) 373 else: 374 records = SeqIO.parse(in_handle, in_format, alphabet) 375 return SeqIO.write(records, out_handle, out_format)
376