1
2
3
4
5 """Optimised sequence conversion code (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.convert(...) function which is the
9 public interface for this.
10
11 The idea here is rather while doing this will work:
12
13 from Bio import SeqIO
14 records = SeqIO.parse(in_handle, in_format)
15 count = SeqIO.write(records, out_handle, out_format)
16
17 it is shorter to write:
18
19 from Bio import SeqIO
20 count = SeqIO.convert(in_handle, in_format, out_handle, out_format)
21
22 Also, the convert function can take a number of special case optimisations. This
23 means that using Bio.SeqIO.convert() may be faster, as well as more convenient.
24 All these file format specific optimisations are handled by this (private) module.
25 """
26
27 from Bio import SeqIO
28
29
30
38
39
47
48
63
64
65 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
66 """FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
67 from Bio.SeqIO.QualityIO import FastqGeneralIterator
68
69 count = 0
70 null = chr(0)
71 for title, seq, old_qual in FastqGeneralIterator(in_handle):
72 count += 1
73
74 qual = old_qual.translate(mapping)
75 if null in qual:
76 raise ValueError("Invalid character in quality string")
77 if truncate_char in qual:
78 qual = qual.replace(truncate_char, chr(126))
79 import warnings
80 warnings.warn(truncate_msg)
81 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
82 return count
83
84
86 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
87
88 Useful for removing line wrapping and the redundant second identifier
89 on the plus lines. Will check also check the quality string is valid.
90
91 Avoids creating SeqRecord and Seq objects in order to speed up this
92 conversion.
93 """
94
95 mapping = "".join([chr(0) for ascii in range(0, 33)]
96 + [chr(ascii) for ascii in range(33, 127)]
97 + [chr(0) for ascii in range(127, 256)])
98 assert len(mapping) == 256
99 return _fastq_generic(in_handle, out_handle, mapping)
100
101
103 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
104
105 Useful for removing line wrapping and the redundant second identifier
106 on the plus lines. Will check also check the quality string is valid.
107 Avoids creating SeqRecord and Seq objects in order to speed up this
108 conversion.
109 """
110
111 mapping = "".join([chr(0) for ascii in range(0, 59)]
112 + [chr(ascii) for ascii in range(59, 127)]
113 + [chr(0) for ascii in range(127, 256)])
114 assert len(mapping) == 256
115 return _fastq_generic(in_handle, out_handle, mapping)
116
117
119 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
120
121 Useful for removing line wrapping and the redundant second identifier
122 on the plus lines. Will check also check the quality string is valid.
123 Avoids creating SeqRecord and Seq objects in order to speed up this
124 conversion.
125 """
126
127 mapping = "".join([chr(0) for ascii in range(0, 64)]
128 + [chr(ascii) for ascii in range(64, 127)]
129 + [chr(0) for ascii in range(127, 256)])
130 assert len(mapping) == 256
131 return _fastq_generic(in_handle, out_handle, mapping)
132
133
135 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
136
137 Avoids creating SeqRecord and Seq objects in order to speed up this
138 conversion.
139 """
140
141 mapping = "".join([chr(0) for ascii in range(0, 64)]
142 + [chr(33 + q) for q in range(0, 62 + 1)]
143 + [chr(0) for ascii in range(127, 256)])
144 assert len(mapping) == 256
145 return _fastq_generic(in_handle, out_handle, mapping)
146
147
149 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
150
151 Avoids creating SeqRecord and Seq objects in order to speed up this
152 conversion. Will issue a warning if the scores had to be truncated at 62
153 (maximum possible in the Illumina 1.3+ FASTQ format)
154 """
155
156 trunc_char = chr(1)
157 mapping = "".join([chr(0) for ascii in range(0, 33)]
158 + [chr(64 + q) for q in range(0, 62 + 1)]
159 + [trunc_char for ascii in range(96, 127)]
160 + [chr(0) for ascii in range(127, 256)])
161 assert len(mapping) == 256
162 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
163 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
164
165
167 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE).
168
169 Avoids creating SeqRecord and Seq objects in order to speed up this
170 conversion.
171 """
172
173 from Bio.SeqIO.QualityIO import phred_quality_from_solexa
174 mapping = "".join([chr(0) for ascii in range(0, 59)]
175 + [chr(33 + int(round(phred_quality_from_solexa(q))))
176 for q in range(-5, 62 + 1)]
177 + [chr(0) for ascii in range(127, 256)])
178 assert len(mapping) == 256
179 return _fastq_generic(in_handle, out_handle, mapping)
180
181
183 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).
184
185 Avoids creating SeqRecord and Seq objects in order to speed up this
186 conversion. Will issue a warning if the scores had to be truncated at 62
187 (maximum possible in the Solexa FASTQ format)
188 """
189
190 from Bio.SeqIO.QualityIO import solexa_quality_from_phred
191 trunc_char = chr(1)
192 mapping = "".join([chr(0) for ascii in range(0, 33)]
193 + [chr(64 + int(round(solexa_quality_from_phred(q))))
194 for q in range(0, 62 + 1)]
195 + [trunc_char for ascii in range(96, 127)]
196 + [chr(0) for ascii in range(127, 256)])
197 assert len(mapping) == 256
198 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
199 "Data loss - max Solexa quality 62 in Solexa FASTQ")
200
201
203 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
204
205 Avoids creating SeqRecord and Seq objects in order to speed up this
206 conversion.
207 """
208
209 from Bio.SeqIO.QualityIO import phred_quality_from_solexa
210 mapping = "".join([chr(0) for ascii in range(0, 59)]
211 + [chr(64 + int(round(phred_quality_from_solexa(q))))
212 for q in range(-5, 62 + 1)]
213 + [chr(0) for ascii in range(127, 256)])
214 assert len(mapping) == 256
215 return _fastq_generic(in_handle, out_handle, mapping)
216
217
219 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE).
220
221 Avoids creating SeqRecord and Seq objects in order to speed up this
222 conversion.
223 """
224
225 from Bio.SeqIO.QualityIO import solexa_quality_from_phred
226 trunc_char = chr(1)
227 mapping = "".join([chr(0) for ascii in range(0, 64)]
228 + [chr(64 + int(round(solexa_quality_from_phred(q))))
229 for q in range(0, 62 + 1)]
230 + [chr(0) for ascii in range(127, 256)])
231 assert len(mapping) == 256
232 return _fastq_generic(in_handle, out_handle, mapping)
233
234
236 """Fast FASTQ to FASTA conversion (PRIVATE).
237
238 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
239 Seq objects in order to speed up this conversion.
240
241 NOTE - This does NOT check the characters used in the FASTQ quality string
242 are valid!
243 """
244 from Bio.SeqIO.QualityIO import FastqGeneralIterator
245
246 count = 0
247 for title, seq, qual in FastqGeneralIterator(in_handle):
248 count += 1
249 out_handle.write(">%s\n" % title)
250
251 for i in range(0, len(seq), 60):
252 out_handle.write(seq[i:i + 60] + "\n")
253 return count
254
255
257 """Fast FASTQ to simple tabbed conversion (PRIVATE).
258
259 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
260 Seq objects in order to speed up this conversion.
261
262 NOTE - This does NOT check the characters used in the FASTQ quality string
263 are valid!
264 """
265 from Bio.SeqIO.QualityIO import FastqGeneralIterator
266
267 count = 0
268 for title, seq, qual in FastqGeneralIterator(in_handle):
269 count += 1
270 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
271 return count
272
273
275 """FASTQ helper function for QUAL output (PRIVATE).
276
277 Mapping should be a dictionary mapping expected ASCII characters from the
278 FASTQ quality string to PHRED quality scores (as strings).
279 """
280 from Bio.SeqIO.QualityIO import FastqGeneralIterator
281
282 count = 0
283 for title, seq, qual in FastqGeneralIterator(in_handle):
284 count += 1
285 out_handle.write(">%s\n" % title)
286
287 try:
288 qualities_strs = [mapping[ascii] for ascii in qual]
289 except KeyError:
290 raise ValueError("Invalid character in quality string")
291 data = " ".join(qualities_strs)
292 while len(data) > 60:
293
294
295 if data[60] == " ":
296 out_handle.write(data[:60] + "\n")
297 data = data[61:]
298 elif data[59] == " ":
299 out_handle.write(data[:59] + "\n")
300 data = data[60:]
301 else:
302 assert data[58] == " ", "Internal logic failure in wrapping"
303 out_handle.write(data[:58] + "\n")
304 data = data[59:]
305 out_handle.write(data + "\n")
306 return count
307
308
310 """Fast Sanger FASTQ to QUAL conversion (PRIVATE)."""
311 mapping = dict((chr(q + 33), str(q)) for q in range(0, 93 + 1))
312 return _fastq_convert_qual(in_handle, out_handle, mapping)
313
314
321
322
324 """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE)."""
325 mapping = dict((chr(q + 64), str(q)) for q in range(0, 62 + 1))
326 return _fastq_convert_qual(in_handle, out_handle, mapping)
327
328
329
330 _converter = {
331 ("genbank", "fasta"): _genbank_convert_fasta,
332 ("gb", "fasta"): _genbank_convert_fasta,
333 ("embl", "fasta"): _embl_convert_fasta,
334 ("fastq", "fasta"): _fastq_convert_fasta,
335 ("fastq-sanger", "fasta"): _fastq_convert_fasta,
336 ("fastq-solexa", "fasta"): _fastq_convert_fasta,
337 ("fastq-illumina", "fasta"): _fastq_convert_fasta,
338 ("fastq", "tab"): _fastq_convert_tab,
339 ("fastq-sanger", "tab"): _fastq_convert_tab,
340 ("fastq-solexa", "tab"): _fastq_convert_tab,
341 ("fastq-illumina", "tab"): _fastq_convert_tab,
342 ("fastq", "fastq"): _fastq_sanger_convert_fastq_sanger,
343 ("fastq-sanger", "fastq"): _fastq_sanger_convert_fastq_sanger,
344 ("fastq-solexa", "fastq"): _fastq_solexa_convert_fastq_sanger,
345 ("fastq-illumina", "fastq"): _fastq_illumina_convert_fastq_sanger,
346 ("fastq", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger,
347 ("fastq-sanger", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger,
348 ("fastq-solexa", "fastq-sanger"): _fastq_solexa_convert_fastq_sanger,
349 ("fastq-illumina", "fastq-sanger"): _fastq_illumina_convert_fastq_sanger,
350 ("fastq", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa,
351 ("fastq-sanger", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa,
352 ("fastq-solexa", "fastq-solexa"): _fastq_solexa_convert_fastq_solexa,
353 ("fastq-illumina", "fastq-solexa"): _fastq_illumina_convert_fastq_solexa,
354 ("fastq", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina,
355 ("fastq-sanger", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina,
356 ("fastq-solexa", "fastq-illumina"): _fastq_solexa_convert_fastq_illumina,
357 ("fastq-illumina", "fastq-illumina"): _fastq_illumina_convert_fastq_illumina,
358 ("fastq", "qual"): _fastq_sanger_convert_qual,
359 ("fastq-sanger", "qual"): _fastq_sanger_convert_qual,
360 ("fastq-solexa", "qual"): _fastq_solexa_convert_qual,
361 ("fastq-illumina", "qual"): _fastq_illumina_convert_qual,
362 }
363
364
365 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
366 """SeqIO conversion function (PRIVATE)."""
367 try:
368 f = _converter[(in_format, out_format)]
369 except KeyError:
370 f = None
371 if f:
372 return f(in_handle, out_handle, alphabet)
373 else:
374 records = SeqIO.parse(in_handle, in_format, alphabet)
375 return SeqIO.write(records, out_handle, out_format)
376