1
2
3
4
5
6
7 """Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.
8
9 SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
10 Biomedical Research and the Wellcome Trust Sanger Institute. You are expected
11 to use this module via the Bio.SeqIO functions under the format name "sff" (or
12 "sff-trim" as described below).
13
14 For example, to iterate over the records in an SFF file,
15
16 >>> from Bio import SeqIO
17 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
18 ... print record.id, len(record), record.seq[:20]+"..."
19 E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
20 E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
21 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
22 E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
23 E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
24 E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
25 E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
26 E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
27 E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
28 E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...
29
30 Each SeqRecord object will contain all the annotation from the SFF file,
31 including the PHRED quality scores.
32
33 >>> print record.id, len(record)
34 E3MFGYR02F7Z7G 219
35 >>> print record.seq[:10], "..."
36 tcagAATCAT ...
37 >>> print record.letter_annotations["phred_quality"][:10], "..."
38 [22, 21, 23, 28, 26, 15, 12, 21, 28, 21] ...
39
40 Notice that the sequence is given in mixed case, the central upper case region
41 corresponds to the trimmed sequence. This matches the output of the Roche
42 tools (and the 3rd party tool sff_extract) for SFF to FASTA.
43
44 >>> print record.annotations["clip_qual_left"]
45 4
46 >>> print record.annotations["clip_qual_right"]
47 134
48 >>> print record.seq[:4]
49 tcag
50 >>> print record.seq[4:20], "...", record.seq[120:134]
51 AATCATCCACTTTTTA ... CAAAACACAAACAG
52 >>> print record.seq[134:]
53 atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn
54
55 The annotations dictionary also contains any adapter clip positions
56 (usually zero), and information about the flows. e.g.
57
58 >>> print record.annotations["flow_key"]
59 TCAG
60 >>> print record.annotations["flow_values"][:10], "..."
61 (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) ...
62 >>> print len(record.annotations["flow_values"])
63 400
64 >>> print record.annotations["flow_index"][:10], "..."
65 (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) ...
66 >>> print len(record.annotations["flow_index"])
67 219
68
69 Note that to convert from a raw reading in flow_values to the corresponding
70 homopolymer stretch estimate, the value should be rounded to the nearest 100:
71
72 >>> print [int(round(value, -2)) // 100
73 ... for value in record.annotations["flow_values"][:10]], '...'
74 [1, 0, 1, 0, 0, 1, 0, 1, 0, 2] ...
75
76 As a convenience method, you can read the file with SeqIO format name "sff-trim"
77 instead of "sff" to get just the trimmed sequences (without any annotation
78 except for the PHRED quality scores):
79
80 >>> from Bio import SeqIO
81 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
82 ... print record.id, len(record), record.seq[:20]+"..."
83 E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
84 E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
85 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
86 E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
87 E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
88 E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
89 E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
90 E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
91 E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
92 E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...
93
94 Looking at the final record in more detail, note how this differs to the
95 example above:
96
97 >>> print record.id, len(record)
98 E3MFGYR02F7Z7G 130
99 >>> print record.seq[:10], "..."
100 AATCATCCAC ...
101 >>> print record.letter_annotations["phred_quality"][:10], "..."
102 [26, 15, 12, 21, 28, 21, 36, 28, 27, 27] ...
103 >>> print record.annotations
104 {}
105
106 You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
107 reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.
108
109 >>> from Bio import SeqIO
110 >>> from StringIO import StringIO
111 >>> out_handle = StringIO()
112 >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
113 ... out_handle, "fastq")
114 >>> print "Converted %i records" % count
115 Converted 10 records
116
117 The output FASTQ file would start like this:
118
119 >>> print "%s..." % out_handle.getvalue()[:50]
120 @E3MFGYR02JWQ7T
121 tcagGGTCTACATGTTGGTTAACCCGTACTGATT...
122
123 Bio.SeqIO.index() provides memory efficient random access to the reads in an
124 SFF file by name. SFF files can include an index within the file, which can
125 be read in making this very fast. If the index is missing (or in a format not
126 yet supported in Biopython) the file is indexed by scanning all the reads -
127 which is a little slower. For example,
128
129 >>> from Bio import SeqIO
130 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
131 >>> record = reads["E3MFGYR02JHD4H"]
132 >>> print record.id, len(record), record.seq[:20]+"..."
133 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
134
135 Or, using the trimmed reads:
136
137 >>> from Bio import SeqIO
138 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
139 >>> record = reads["E3MFGYR02JHD4H"]
140 >>> print record.id, len(record), record.seq[:20]+"..."
141 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
142
143 You can also use the Bio.SeqIO.write() function with the "sff" format. Note
144 that this requires all the flow information etc, and thus is probably only
145 useful for SeqRecord objects originally from reading another SFF file (and
146 not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").
147
148 As an example, let's pretend this example SFF file represents some DNA which
149 was pre-amplified with a PCR primers AAAGANNNNN. The following script would
150 produce a sub-file containing all those reads whose post-quality clipping
151 region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
152 degenerate bit of this pretend primer):
153
154 >>> from Bio import SeqIO
155 >>> records = (record for record in
156 ... SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff","sff")
157 ... if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
158 >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
159 >>> print "Selected %i records" % count
160 Selected 2 records
161
162 Of course, for an assembly you would probably want to remove these primers.
163 If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
164 if you want SFF output we have to preserve all the flow information - the trick
165 is just to adjust the left clip position!
166
167 >>> from Bio import SeqIO
168 >>> def filter_and_trim(records, primer):
169 ... for record in records:
170 ... if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
171 ... record.annotations["clip_qual_left"] += len(primer)
172 ... yield record
173 >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
174 >>> count = SeqIO.write(filter_and_trim(records,"AAAGA"),
175 ... "temp_filtered.sff", "sff")
176 >>> print "Selected %i records" % count
177 Selected 2 records
178
179 We can check the results, note the lower case clipped region now includes the "AAAGA"
180 sequence:
181
182 >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
183 ... print record.id, len(record), record.seq[:20]+"..."
184 E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
185 E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
186 >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
187 ... print record.id, len(record), record.seq[:20]+"..."
188 E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
189 E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
190 >>> import os
191 >>> os.remove("temp_filtered.sff")
192
193 For a description of the file format, please see the Roche manuals and:
194 http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats
195
196 """
197 from Bio.SeqIO.Interfaces import SequenceWriter
198 from Bio import Alphabet
199 from Bio.Seq import Seq
200 from Bio.SeqRecord import SeqRecord
201 import struct
202 import sys
203
204 from Bio._py3k import _bytes_to_string, _as_bytes
205 _null = _as_bytes("\0")
206 _sff = _as_bytes(".sff")
207 _hsh = _as_bytes(".hsh")
208 _srt = _as_bytes(".srt")
209 _mft = _as_bytes(".mft")
210 _flag = _as_bytes("\xff")
211
213 """Read in an SFF file header (PRIVATE).
214
215 Assumes the handle is at the start of the file, will read forwards
216 though the header and leave the handle pointing at the first record.
217 Returns a tuple of values from the header (header_length, index_offset,
218 index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)
219
220 >>> handle = open("Roche/greek.sff", "rb")
221 >>> values = _sff_file_header(handle)
222 >>> print values[0]
223 840
224 >>> print values[1]
225 65040
226 >>> print values[2]
227 256
228 >>> print values[3]
229 24
230 >>> print values[4]
231 800
232 >>> values[-1]
233 'TCAG'
234
235 """
236 if hasattr(handle,"mode") and "U" in handle.mode.upper():
237 raise ValueError("SFF files must NOT be opened in universal new "
238 "lines mode. Binary mode is recommended (although "
239 "on Unix the default mode is also fine).")
240 elif hasattr(handle,"mode") and "B" not in handle.mode.upper() \
241 and sys.platform == "win32":
242 raise ValueError("SFF files must be opened in binary mode on Windows")
243
244
245
246
247
248
249
250
251
252
253
254
255 fmt = '>4s4BQIIHHHB'
256 assert 31 == struct.calcsize(fmt)
257 data = handle.read(31)
258 if not data:
259 raise ValueError("Empty file.")
260 elif len(data) < 13:
261 raise ValueError("File too small to hold a valid SFF header.")
262 magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
263 number_of_reads, header_length, key_length, number_of_flows_per_read, \
264 flowgram_format = struct.unpack(fmt, data)
265 if magic_number in [_hsh, _srt, _mft]:
266
267 raise ValueError("Handle seems to be at SFF index block, not start")
268 if magic_number != _sff:
269 raise ValueError("SFF file did not start '.sff', but %s" \
270 % repr(magic_number))
271 if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
272 raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i" \
273 % (ver0, ver1, ver2, ver3))
274 if flowgram_format != 1:
275 raise ValueError("Flowgram format code %i not supported" \
276 % flowgram_format)
277 if (index_offset!=0) ^ (index_length!=0):
278 raise ValueError("Index offset %i but index length %i" \
279 % (index_offset, index_length))
280 flow_chars = _bytes_to_string(handle.read(number_of_flows_per_read))
281 key_sequence = _bytes_to_string(handle.read(key_length))
282
283
284
285
286 assert header_length % 8 == 0
287 padding = header_length - number_of_flows_per_read - key_length - 31
288 assert 0 <= padding < 8, padding
289 if handle.read(padding).count(_null) != padding:
290 raise ValueError("Post header %i byte padding region contained data" \
291 % padding)
292 return header_length, index_offset, index_length, \
293 number_of_reads, number_of_flows_per_read, \
294 flow_chars, key_sequence
295
296
298 """Generates an index by scanning though all the reads in an SFF file (PRIVATE).
299
300 This is a slow but generic approach if we can't parse the provided index
301 (if present).
302
303 Will use the handle seek/tell functions.
304 """
305 handle.seek(0)
306 header_length, index_offset, index_length, number_of_reads, \
307 number_of_flows_per_read, flow_chars, key_sequence \
308 = _sff_file_header(handle)
309
310 read_header_fmt = '>2HI4H'
311 read_header_size = struct.calcsize(read_header_fmt)
312
313 read_flow_fmt = ">%iH" % number_of_flows_per_read
314 read_flow_size = struct.calcsize(read_flow_fmt)
315 assert 1 == struct.calcsize(">B")
316 assert 1 == struct.calcsize(">s")
317 assert 1 == struct.calcsize(">c")
318 assert read_header_size % 8 == 0
319 for read in range(number_of_reads):
320 record_offset = handle.tell()
321 if record_offset == index_offset:
322
323 offset = index_offset + index_length
324 if offset % 8:
325 offset += 8 - (offset % 8)
326 assert offset % 8 == 0
327 handle.seek(offset)
328 record_offset = offset
329
330
331 data = handle.read(read_header_size)
332 read_header_length, name_length, seq_len, clip_qual_left, \
333 clip_qual_right, clip_adapter_left, clip_adapter_right \
334 = struct.unpack(read_header_fmt, data)
335 if read_header_length < 10 or read_header_length % 8 != 0:
336 raise ValueError("Malformed read header, says length is %i:\n%s" \
337 % (read_header_length, repr(data)))
338
339 name = _bytes_to_string(handle.read(name_length))
340 padding = read_header_length - read_header_size - name_length
341 if handle.read(padding).count(_null) != padding:
342 raise ValueError("Post name %i byte padding region contained data" \
343 % padding)
344 assert record_offset + read_header_length == handle.tell()
345
346 size = read_flow_size + 3*seq_len
347 handle.seek(size, 1)
348
349 padding = size % 8
350 if padding:
351 padding = 8 - padding
352 if handle.read(padding).count(_null) != padding:
353 raise ValueError("Post quality %i byte padding region contained data" \
354 % padding)
355
356 yield name, record_offset
357 if handle.tell() % 8 != 0:
358 raise ValueError("After scanning reads, did not end on a multiple of 8")
359
361 """Locate any existing Roche style XML meta data and read index (PRIVATE).
362
363 Makes a number of hard coded assumptions based on reverse engineered SFF
364 files from Roche 454 machines.
365
366 Returns a tuple of read count, SFF "index" offset and size, XML offset
367 and size, and the actual read index offset and size.
368
369 Raises a ValueError for unsupported or non-Roche index blocks.
370 """
371 handle.seek(0)
372 header_length, index_offset, index_length, number_of_reads, \
373 number_of_flows_per_read, flow_chars, key_sequence \
374 = _sff_file_header(handle)
375 assert handle.tell() == header_length
376 if not index_offset or not index_offset:
377 raise ValueError("No index present in this SFF file")
378
379 handle.seek(index_offset)
380 fmt = ">4s4B"
381 fmt_size = struct.calcsize(fmt)
382 data = handle.read(fmt_size)
383 if not data:
384 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found nothing" \
385 % (index_length, index_offset))
386 if len(data) < fmt_size:
387 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found %s" \
388 % (index_length, index_offset, repr(data)))
389 magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
390 if magic_number == _mft:
391
392
393
394 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
395
396 raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i" \
397 % (ver0, ver1, ver2, ver3))
398 fmt2 = ">LL"
399 fmt2_size = struct.calcsize(fmt2)
400 xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
401 if index_length != fmt_size + fmt2_size + xml_size + data_size:
402 raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i" \
403 % (index_length, fmt_size, fmt2_size, xml_size, data_size))
404 return number_of_reads, header_length, \
405 index_offset, index_length, \
406 index_offset + fmt_size + fmt2_size, xml_size, \
407 index_offset + fmt_size + fmt2_size + xml_size, data_size
408 elif magic_number == _srt:
409
410
411
412 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
413
414 raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i" \
415 % (ver0, ver1, ver2, ver3))
416 data = handle.read(4)
417 if data != _null*4:
418 raise ValueError("Did not find expected null four bytes in .srt index")
419 return number_of_reads, header_length, \
420 index_offset, index_length, \
421 0, 0, \
422 index_offset + fmt_size + 4, index_length - fmt_size - 4
423 elif magic_number == _hsh:
424 raise ValueError("Hash table style indexes (.hsh) in SFF files are "
425 "not (yet) supported")
426 else:
427 raise ValueError("Unknown magic number %s in SFF index header:\n%s" \
428 % (repr(magic_number), repr(data)))
429
431 """Reads any existing Roche style XML manifest data in the SFF "index" (PRIVATE, DEPRECATED).
432
433 Will use the handle seek/tell functions. Returns a string.
434
435 This has been replaced by ReadRocheXmlManifest. We would normally just
436 delete an old private function without warning, but I believe some people
437 are using this so we'll handle this with a deprecation warning.
438 """
439 import warnings
440 warnings.warn("Private function _sff_read_roche_index_xml is deprecated. "
441 "Use new public function ReadRocheXmlManifest instead",
442 DeprecationWarning)
443 return ReadRocheXmlManifest(handle)
444
445
447 """Reads any Roche style XML manifest data in the SFF "index".
448
449 The SFF file format allows for multiple different index blocks, and Roche
450 took advantage of this to define their own index block wich also embeds
451 an XML manifest string. This is not a publically documented extension to
452 the SFF file format, this was reverse engineered.
453
454 The handle should be to an SFF file opened in binary mode. This function
455 will use the handle seek/tell functions and leave the handle in an
456 arbitrary location.
457
458 Any XML manifest found is returned as a Python string, which you can then
459 parse as appropriate, or reuse when writing out SFF files with the
460 SffWriter class.
461
462 Returns a string, or raises a ValueError if an Roche manifest could not be
463 found.
464 """
465 number_of_reads, header_length, index_offset, index_length, xml_offset, \
466 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
467 if not xml_offset or not xml_size:
468 raise ValueError("No XML manifest found")
469 handle.seek(xml_offset)
470 return _bytes_to_string(handle.read(xml_size))
471
472
473
475 """Reads any existing Roche style read index provided in the SFF file (PRIVATE).
476
477 Will use the handle seek/tell functions.
478
479 This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.
480
481 Roche SFF indices use base 255 not 256, meaning we see bytes in range the
482 range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
483 can be used as a marker character to separate entries (required if the
484 read name lengths vary).
485
486 Note that since only four bytes are used for the read offset, this is
487 limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
488 tool to combine SFF files beyound this limit, they issue a warning and
489 omit the index (and manifest).
490 """
491 number_of_reads, header_length, index_offset, index_length, xml_offset, \
492 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
493
494 handle.seek(read_index_offset)
495 fmt = ">5B"
496 for read in range(number_of_reads):
497
498 data = handle.read(6)
499 while True:
500 more = handle.read(1)
501 if not more:
502 raise ValueError("Premature end of file!")
503 data += more
504 if more == _flag: break
505 assert data[-1:] == _flag, data[-1:]
506 name = _bytes_to_string(data[:-6])
507 off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
508 offset = off0 + 255*off1 + 65025*off2 + 16581375*off3
509 if off4:
510
511
512
513 raise ValueError("Expected a null terminator to the read name.")
514 yield name, offset
515 if handle.tell() != read_index_offset + read_index_size:
516 raise ValueError("Problem with index length? %i vs %i" \
517 % (handle.tell(), read_index_offset + read_index_size))
518
519
520 -def _sff_read_seq_record(handle, number_of_flows_per_read, flow_chars,
521 key_sequence, alphabet, trim=False):
522 """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
523
524
525
526
527
528
529
530
531
532
533 read_header_fmt = '>2HI4H'
534 read_header_size = struct.calcsize(read_header_fmt)
535 read_flow_fmt = ">%iH" % number_of_flows_per_read
536 read_flow_size = struct.calcsize(read_flow_fmt)
537
538 read_header_length, name_length, seq_len, clip_qual_left, \
539 clip_qual_right, clip_adapter_left, clip_adapter_right \
540 = struct.unpack(read_header_fmt, handle.read(read_header_size))
541 if clip_qual_left:
542 clip_qual_left -= 1
543 if clip_adapter_left:
544 clip_adapter_left -= 1
545 if read_header_length < 10 or read_header_length % 8 != 0:
546 raise ValueError("Malformed read header, says length is %i" \
547 % read_header_length)
548
549 name = _bytes_to_string(handle.read(name_length))
550 padding = read_header_length - read_header_size - name_length
551 if handle.read(padding).count(_null) != padding:
552 raise ValueError("Post name %i byte padding region contained data" \
553 % padding)
554
555
556 flow_values = handle.read(read_flow_size)
557 temp_fmt = ">%iB" % seq_len
558 flow_index = handle.read(seq_len)
559 seq = _bytes_to_string(handle.read(seq_len))
560 quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
561
562 padding = (read_flow_size + seq_len*3)%8
563 if padding:
564 padding = 8 - padding
565 if handle.read(padding).count(_null) != padding:
566 raise ValueError("Post quality %i byte padding region contained data" \
567 % padding)
568
569
570
571 clip_left = max(clip_qual_left, clip_adapter_left)
572
573 if clip_qual_right:
574 if clip_adapter_right:
575 clip_right = min(clip_qual_right, clip_adapter_right)
576 else:
577
578 clip_right = clip_qual_right
579 elif clip_adapter_right:
580 clip_right = clip_adapter_right
581 else:
582 clip_right = seq_len
583
584 if trim:
585 seq = seq[clip_left:clip_right].upper()
586 quals = quals[clip_left:clip_right]
587
588 annotations = {}
589 else:
590
591 seq = seq[:clip_left].lower() + \
592 seq[clip_left:clip_right].upper() + \
593 seq[clip_right:].lower()
594 annotations = {"flow_values":struct.unpack(read_flow_fmt, flow_values),
595 "flow_index":struct.unpack(temp_fmt, flow_index),
596 "flow_chars":flow_chars,
597 "flow_key":key_sequence,
598 "clip_qual_left":clip_qual_left,
599 "clip_qual_right":clip_qual_right,
600 "clip_adapter_left":clip_adapter_left,
601 "clip_adapter_right":clip_adapter_right}
602 record = SeqRecord(Seq(seq, alphabet),
603 id=name,
604 name=name,
605 description="",
606 annotations=annotations)
607
608
609 dict.__setitem__(record._per_letter_annotations,
610 "phred_quality", quals)
611
612 return record
613
614
616 """Extract the next read in the file as a raw (bytes) string (PRIVATE)."""
617 read_header_fmt = '>2HI'
618 read_header_size = struct.calcsize(read_header_fmt)
619 read_flow_fmt = ">%iH" % number_of_flows_per_read
620 read_flow_size = struct.calcsize(read_flow_fmt)
621
622 raw = handle.read(read_header_size)
623 read_header_length, name_length, seq_len \
624 = struct.unpack(read_header_fmt, raw)
625 if read_header_length < 10 or read_header_length % 8 != 0:
626 raise ValueError("Malformed read header, says length is %i" \
627 % read_header_length)
628
629 raw += handle.read(8 + name_length)
630
631 padding = read_header_length - read_header_size - 8 - name_length
632 pad = handle.read(padding)
633 if pad.count(_null) != padding:
634 raise ValueError("Post name %i byte padding region contained data" \
635 % padding)
636 raw += pad
637
638 raw += handle.read(read_flow_size + seq_len*3)
639 padding = (read_flow_size + seq_len*3)%8
640
641 if padding:
642 padding = 8 - padding
643 pad = handle.read(padding)
644 if pad.count(_null) != padding:
645 raise ValueError("Post quality %i byte padding region contained data" \
646 % padding)
647 raw += pad
648
649 return raw
650
652 """Wrapper for handles which do not support the tell method (PRIVATE).
653
654 Intended for use with things like network handles where tell (and reverse
655 seek) are not supported. The SFF file needs to track the current offset in
656 order to deal with the index block.
657 """
659 self._handle = handle
660 self._offset = 0
661
662 - def read(self, length):
666
669
670 - def seek(self, offset):
671 if offset < self._offset:
672 raise RunTimeError("Can't seek backwards")
673 self._handle.read(offset - self._offset)
674
676 return self._handle.close()
677
678
679
681 """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
682
683 handle - input file, an SFF file, e.g. from Roche 454 sequencing.
684 This must NOT be opened in universal read lines mode!
685 alphabet - optional alphabet, defaults to generic DNA.
686 trim - should the sequences be trimmed?
687
688 The resulting SeqRecord objects should match those from a paired FASTA
689 and QUAL file converted from the SFF file using the Roche 454 tool
690 ssfinfo. i.e. The sequence will be mixed case, with the trim regions
691 shown in lower case.
692
693 This function is used internally via the Bio.SeqIO functions:
694
695 >>> from Bio import SeqIO
696 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
697 >>> for record in SeqIO.parse(handle, "sff"):
698 ... print record.id, len(record)
699 E3MFGYR02JWQ7T 265
700 E3MFGYR02JA6IL 271
701 E3MFGYR02JHD4H 310
702 E3MFGYR02GFKUC 299
703 E3MFGYR02FTGED 281
704 E3MFGYR02FR9G7 261
705 E3MFGYR02GAZMS 278
706 E3MFGYR02HHZ8O 221
707 E3MFGYR02GPGB1 269
708 E3MFGYR02F7Z7G 219
709 >>> handle.close()
710
711 You can also call it directly:
712
713 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
714 >>> for record in SffIterator(handle):
715 ... print record.id, len(record)
716 E3MFGYR02JWQ7T 265
717 E3MFGYR02JA6IL 271
718 E3MFGYR02JHD4H 310
719 E3MFGYR02GFKUC 299
720 E3MFGYR02FTGED 281
721 E3MFGYR02FR9G7 261
722 E3MFGYR02GAZMS 278
723 E3MFGYR02HHZ8O 221
724 E3MFGYR02GPGB1 269
725 E3MFGYR02F7Z7G 219
726 >>> handle.close()
727
728 Or, with the trim option:
729
730 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
731 >>> for record in SffIterator(handle, trim=True):
732 ... print record.id, len(record)
733 E3MFGYR02JWQ7T 260
734 E3MFGYR02JA6IL 265
735 E3MFGYR02JHD4H 292
736 E3MFGYR02GFKUC 295
737 E3MFGYR02FTGED 277
738 E3MFGYR02FR9G7 256
739 E3MFGYR02GAZMS 271
740 E3MFGYR02HHZ8O 150
741 E3MFGYR02GPGB1 221
742 E3MFGYR02F7Z7G 130
743 >>> handle.close()
744
745 """
746 if isinstance(Alphabet._get_base_alphabet(alphabet),
747 Alphabet.ProteinAlphabet):
748 raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
749 if isinstance(Alphabet._get_base_alphabet(alphabet),
750 Alphabet.RNAAlphabet):
751 raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
752 try:
753 assert 0 == handle.tell()
754 except AttributeError:
755
756 handle = _AddTellHandle(handle)
757 header_length, index_offset, index_length, number_of_reads, \
758 number_of_flows_per_read, flow_chars, key_sequence \
759 = _sff_file_header(handle)
760
761
762
763
764
765
766
767
768
769
770 read_header_fmt = '>2HI4H'
771 read_header_size = struct.calcsize(read_header_fmt)
772 read_flow_fmt = ">%iH" % number_of_flows_per_read
773 read_flow_size = struct.calcsize(read_flow_fmt)
774 assert 1 == struct.calcsize(">B")
775 assert 1 == struct.calcsize(">s")
776 assert 1 == struct.calcsize(">c")
777 assert read_header_size % 8 == 0
778
779
780
781 for read in range(number_of_reads):
782 if index_offset and handle.tell() == index_offset:
783 offset = index_offset + index_length
784 if offset % 8:
785 offset += 8 - (offset % 8)
786 assert offset % 8 == 0
787 handle.seek(offset)
788
789
790 index_offset = 0
791 yield _sff_read_seq_record(handle,
792 number_of_flows_per_read,
793 flow_chars,
794 key_sequence,
795 alphabet,
796 trim)
797
798
799 if index_offset and handle.tell() == index_offset:
800 offset = index_offset + index_length
801 if offset % 8:
802 offset += 8 - (offset % 8)
803 assert offset % 8 == 0
804 handle.seek(offset)
805
806 if handle.read(1):
807 raise ValueError("Additional data at end of SFF file")
808
809
810
812 """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE)."""
813 return SffIterator(handle, alphabet, trim=True)
814
815
817 """SFF file writer."""
818
819 - def __init__(self, handle, index=True, xml=None):
820 """Creates the writer object.
821
822 handle - Output handle, ideally in binary write mode.
823 index - Boolean argument, should we try and write an index?
824 xml - Optional string argument, xml manifest to be recorded in the index
825 block (see function ReadRocheXmlManifest for reading this data).
826 """
827 if hasattr(handle,"mode") and "U" in handle.mode.upper():
828 raise ValueError("SFF files must NOT be opened in universal new "
829 "lines mode. Binary mode is required")
830 elif hasattr(handle,"mode") and "B" not in handle.mode.upper():
831 raise ValueError("SFF files must be opened in binary mode")
832 self.handle = handle
833 self._xml = xml
834 if index:
835 self._index = []
836 else:
837 self._index = None
838
840 """Use this to write an entire file containing the given records."""
841 try:
842 self._number_of_reads = len(records)
843 except TypeError:
844 self._number_of_reads = 0
845 if not hasattr(self.handle, "seek") \
846 or not hasattr(self.handle, "tell"):
847 raise ValueError("A handle with a seek/tell methods is "
848 "required in order to record the total "
849 "record count in the file header (once it "
850 "is known at the end).")
851 if self._index is not None and \
852 not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
853 import warnings
854 warnings.warn("A handle with a seek/tell methods is required in "
855 "order to record an SFF index.")
856 self._index = None
857 self._index_start = 0
858 self._index_length = 0
859 if not hasattr(records, "next"):
860 records = iter(records)
861
862
863 try:
864 record = records.next()
865 except StopIteration:
866 record = None
867 if record is None:
868
869
870
871 raise ValueError("Must have at least one sequence")
872 try:
873 self._key_sequence = _as_bytes(record.annotations["flow_key"])
874 self._flow_chars = _as_bytes(record.annotations["flow_chars"])
875 self._number_of_flows_per_read = len(self._flow_chars)
876 except KeyError:
877 raise ValueError("Missing SFF flow information")
878 self.write_header()
879 self.write_record(record)
880 count = 1
881 for record in records:
882 self.write_record(record)
883 count += 1
884 if self._number_of_reads == 0:
885
886 offset = self.handle.tell()
887 self.handle.seek(0)
888 self._number_of_reads = count
889 self.write_header()
890 self.handle.seek(offset)
891 else:
892 assert count == self._number_of_reads
893 if self._index is not None:
894 self._write_index()
895 return count
896
898 assert len(self._index)==self._number_of_reads
899 handle = self.handle
900 self._index.sort()
901 self._index_start = handle.tell()
902
903 if self._xml is not None:
904 xml = _as_bytes(self._xml)
905 else:
906 from Bio import __version__
907 xml = "<!-- This file was output with Biopython %s -->\n" % __version__
908 xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
909 xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
910 xml = _as_bytes(xml)
911 xml_len = len(xml)
912
913 fmt = ">I4BLL"
914 fmt_size = struct.calcsize(fmt)
915 handle.write(_null*fmt_size + xml)
916 fmt2 = ">6B"
917 assert 6 == struct.calcsize(fmt2)
918 self._index.sort()
919 index_len = 0
920 for name, offset in self._index:
921
922
923
924 off3 = offset
925 off0 = off3 % 255
926 off3 -= off0
927 off1 = off3 % 65025
928 off3 -= off1
929 off2 = off3 % 16581375
930 off3 -= off2
931 assert offset == off0 + off1 + off2 + off3, \
932 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
933 off3, off2, off1, off0 = off3//16581375, off2//65025, \
934 off1//255, off0
935 assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
936 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
937 handle.write(name + struct.pack(fmt2, 0, \
938 off3, off2, off1, off0, 255))
939 index_len += len(name) + 6
940
941 self._index_length = fmt_size + xml_len + index_len
942
943
944
945 if self._index_length % 8:
946 padding = 8 - (self._index_length%8)
947 handle.write(_null*padding)
948 else:
949 padding = 0
950 offset = handle.tell()
951 assert offset == self._index_start + self._index_length + padding, \
952 "%i vs %i + %i + %i" % (offset, self._index_start, \
953 self._index_length, padding)
954
955 handle.seek(self._index_start)
956 handle.write(struct.pack(fmt, 778921588,
957 49,46,48,48,
958 xml_len, index_len) + xml)
959
960 handle.seek(0)
961 self.write_header()
962 handle.seek(offset)
963
965
966 key_length = len(self._key_sequence)
967
968
969
970
971
972
973
974
975
976
977
978
979 fmt = '>I4BQIIHHHB%is%is' % (self._number_of_flows_per_read, key_length)
980
981
982
983
984 if struct.calcsize(fmt) % 8 == 0:
985 padding = 0
986 else:
987 padding = 8 - (struct.calcsize(fmt) % 8)
988 header_length = struct.calcsize(fmt) + padding
989 assert header_length % 8 == 0
990 header = struct.pack(fmt, 779314790,
991 0, 0, 0, 1,
992 self._index_start, self._index_length,
993 self._number_of_reads,
994 header_length, key_length,
995 self._number_of_flows_per_read,
996 1,
997 self._flow_chars, self._key_sequence)
998 self.handle.write(header + _null*padding)
999
1001 """Write a single additional record to the output file.
1002
1003 This assumes the header has been done.
1004 """
1005
1006 name = _as_bytes(record.id)
1007 name_len = len(name)
1008 seq = _as_bytes(str(record.seq).upper())
1009 seq_len = len(seq)
1010
1011 try:
1012 quals = record.letter_annotations["phred_quality"]
1013 except KeyError:
1014 raise ValueError("Missing PHRED qualities information")
1015
1016 try:
1017 flow_values = record.annotations["flow_values"]
1018 flow_index = record.annotations["flow_index"]
1019 if self._key_sequence != _as_bytes(record.annotations["flow_key"]) \
1020 or self._flow_chars != _as_bytes(record.annotations["flow_chars"]):
1021 raise ValueError("Records have inconsistent SFF flow data")
1022 except KeyError:
1023 raise ValueError("Missing SFF flow information")
1024 except AttributeError:
1025 raise ValueError("Header not written yet?")
1026
1027 try:
1028 clip_qual_left = record.annotations["clip_qual_left"]
1029 if clip_qual_left:
1030 clip_qual_left += 1
1031 clip_qual_right = record.annotations["clip_qual_right"]
1032 clip_adapter_left = record.annotations["clip_adapter_left"]
1033 if clip_adapter_left:
1034 clip_adapter_left += 1
1035 clip_adapter_right = record.annotations["clip_adapter_right"]
1036 except KeyError:
1037 raise ValueError("Missing SFF clipping information")
1038
1039
1040 if self._index is not None:
1041 offset = self.handle.tell()
1042
1043
1044
1045 if offset > 4244897280:
1046 import warnings
1047 warnings.warn("Read %s has file offset %i, which is too large "
1048 "to store in the Roche SFF index structure. No "
1049 "index block will be recorded." % (name, offset))
1050
1051 self._index = None
1052 else:
1053 self._index.append((name, self.handle.tell()))
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069 read_header_fmt = '>2HI4H%is' % name_len
1070 if struct.calcsize(read_header_fmt) % 8 == 0:
1071 padding = 0
1072 else:
1073 padding = 8 - (struct.calcsize(read_header_fmt) % 8)
1074 read_header_length = struct.calcsize(read_header_fmt) + padding
1075 assert read_header_length % 8 == 0
1076 data = struct.pack(read_header_fmt,
1077 read_header_length,
1078 name_len, seq_len,
1079 clip_qual_left, clip_qual_right,
1080 clip_adapter_left, clip_adapter_right,
1081 name) + _null*padding
1082 assert len(data) == read_header_length
1083
1084
1085 read_flow_fmt = ">%iH" % self._number_of_flows_per_read
1086 read_flow_size = struct.calcsize(read_flow_fmt)
1087 temp_fmt = ">%iB" % seq_len
1088 data += struct.pack(read_flow_fmt, *flow_values) \
1089 + struct.pack(temp_fmt, *flow_index) \
1090 + seq \
1091 + struct.pack(temp_fmt, *quals)
1092
1093 padding = (read_flow_size + seq_len*3)%8
1094 if padding:
1095 padding = 8 - padding
1096 self.handle.write(data + _null*padding)
1097
1098
1099 if __name__ == "__main__":
1100 print "Running quick self test"
1101 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1102 metadata = ReadRocheXmlManifest(open(filename, "rb"))
1103 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1104 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1105 assert index1 == index2
1106 assert len(index1) == len(list(SffIterator(open(filename, "rb"))))
1107 from StringIO import StringIO
1108 try:
1109
1110 from io import BytesIO
1111 except ImportError:
1112 BytesIO = StringIO
1113 assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"rb").read()))))
1114
1115 if sys.platform != "win32":
1116 assert len(index1) == len(list(SffIterator(open(filename, "r"))))
1117 index2 = sorted(_sff_read_roche_index(open(filename)))
1118 assert index1 == index2
1119 index2 = sorted(_sff_do_slow_index(open(filename)))
1120 assert index1 == index2
1121 assert len(index1) == len(list(SffIterator(open(filename))))
1122 assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"r").read()))))
1123 assert len(index1) == len(list(SffIterator(BytesIO(open(filename).read()))))
1124
1125 sff = list(SffIterator(open(filename, "rb")))
1126
1127 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1128 assert len(sff) == len(sff2)
1129 for old, new in zip(sff, sff2):
1130 assert old.id == new.id
1131 assert str(old.seq) == str(new.seq)
1132
1133 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1134 assert len(sff) == len(sff2)
1135 for old, new in zip(sff, sff2):
1136 assert old.id == new.id
1137 assert str(old.seq) == str(new.seq)
1138
1139 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1140 assert len(sff) == len(sff2)
1141 for old, new in zip(sff, sff2):
1142 assert old.id == new.id
1143 assert str(old.seq) == str(new.seq)
1144
1145 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_at_start.sff", "rb")))
1146 assert len(sff) == len(sff2)
1147 for old, new in zip(sff, sff2):
1148 assert old.id == new.id
1149 assert str(old.seq) == str(new.seq)
1150
1151 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_in_middle.sff", "rb")))
1152 assert len(sff) == len(sff2)
1153 for old, new in zip(sff, sff2):
1154 assert old.id == new.id
1155 assert str(old.seq) == str(new.seq)
1156
1157 sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
1158
1159 print ReadRocheXmlManifest(open(filename, "rb"))
1160
1161 from Bio import SeqIO
1162 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.fasta"
1163 fasta_no_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1164 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.qual"
1165 qual_no_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1166
1167 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.fasta"
1168 fasta_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1169 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.qual"
1170 qual_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1171
1172 for s, sT, f, q, fT, qT in zip(sff, sff_trim, fasta_no_trim,
1173 qual_no_trim, fasta_trim, qual_trim):
1174
1175 print s.id
1176
1177
1178
1179 assert s.id == f.id == q.id
1180 assert str(s.seq) == str(f.seq)
1181 assert s.letter_annotations["phred_quality"] == q.letter_annotations["phred_quality"]
1182
1183 assert s.id == sT.id == fT.id == qT.id
1184 assert str(sT.seq) == str(fT.seq)
1185 assert sT.letter_annotations["phred_quality"] == qT.letter_annotations["phred_quality"]
1186
1187
1188 print "Writing with a list of SeqRecords..."
1189 handle = StringIO()
1190 w = SffWriter(handle, xml=metadata)
1191 w.write_file(sff)
1192 data = handle.getvalue()
1193 print "And again with an iterator..."
1194 handle = StringIO()
1195 w = SffWriter(handle, xml=metadata)
1196 w.write_file(iter(sff))
1197 assert data == handle.getvalue()
1198
1199 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1200 original = open(filename,"rb").read()
1201 assert len(data) == len(original)
1202 assert data == original
1203 del data
1204 handle.close()
1205
1206 print "-"*50
1207 filename = "../../Tests/Roche/greek.sff"
1208 for record in SffIterator(open(filename,"rb")):
1209 print record.id
1210 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1211 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1212 assert index1 == index2
1213 try:
1214 print ReadRocheXmlManifest(open(filename, "rb"))
1215 assert False, "Should fail!"
1216 except ValueError:
1217 pass
1218
1219
1220 handle = open(filename, "rb")
1221 for record in SffIterator(handle):
1222 pass
1223 try:
1224 for record in SffIterator(handle):
1225 print record.id
1226 assert False, "Should have failed"
1227 except ValueError, err:
1228 print "Checking what happens on re-reading a handle:"
1229 print err
1230
1231
1232 """
1233 #Ugly code to make test files...
1234 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1235 padding = len(index)%8
1236 if padding:
1237 padding = 8 - padding
1238 index += chr(0)*padding
1239 assert len(index)%8 == 0
1240
1241 #Ugly bit of code to make a fake index at start
1242 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1243 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w")
1244 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1245 padding = len(index)%8
1246 if padding:
1247 padding = 8 - padding
1248 index += chr(0)*padding
1249 w = SffWriter(out_handle, index=False, xml=None)
1250 #Fake the header...
1251 w._number_of_reads = len(records)
1252 w._index_start = 0
1253 w._index_length = 0
1254 w._key_sequence = records[0].annotations["flow_key"]
1255 w._flow_chars = records[0].annotations["flow_chars"]
1256 w._number_of_flows_per_read = len(w._flow_chars)
1257 w.write_header()
1258 w._index_start = out_handle.tell()
1259 w._index_length = len(index)
1260 out_handle.seek(0)
1261 w.write_header() #this time with index info
1262 w.handle.write(index)
1263 for record in records:
1264 w.write_record(record)
1265 out_handle.close()
1266 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1267 for old, new in zip(records, records2):
1268 assert str(old.seq)==str(new.seq)
1269 i = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1270
1271 #Ugly bit of code to make a fake index in middle
1272 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1273 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w")
1274 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1275 padding = len(index)%8
1276 if padding:
1277 padding = 8 - padding
1278 index += chr(0)*padding
1279 w = SffWriter(out_handle, index=False, xml=None)
1280 #Fake the header...
1281 w._number_of_reads = len(records)
1282 w._index_start = 0
1283 w._index_length = 0
1284 w._key_sequence = records[0].annotations["flow_key"]
1285 w._flow_chars = records[0].annotations["flow_chars"]
1286 w._number_of_flows_per_read = len(w._flow_chars)
1287 w.write_header()
1288 for record in records[:5]:
1289 w.write_record(record)
1290 w._index_start = out_handle.tell()
1291 w._index_length = len(index)
1292 w.handle.write(index)
1293 for record in records[5:]:
1294 w.write_record(record)
1295 out_handle.seek(0)
1296 w.write_header() #this time with index info
1297 out_handle.close()
1298 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1299 for old, new in zip(records, records2):
1300 assert str(old.seq)==str(new.seq)
1301 j = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1302
1303 #Ugly bit of code to make a fake index at end
1304 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1305 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w")
1306 w = SffWriter(out_handle, index=False, xml=None)
1307 #Fake the header...
1308 w._number_of_reads = len(records)
1309 w._index_start = 0
1310 w._index_length = 0
1311 w._key_sequence = records[0].annotations["flow_key"]
1312 w._flow_chars = records[0].annotations["flow_chars"]
1313 w._number_of_flows_per_read = len(w._flow_chars)
1314 w.write_header()
1315 for record in records:
1316 w.write_record(record)
1317 w._index_start = out_handle.tell()
1318 w._index_length = len(index)
1319 out_handle.write(index)
1320 out_handle.seek(0)
1321 w.write_header() #this time with index info
1322 out_handle.close()
1323 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1324 for old, new in zip(records, records2):
1325 assert str(old.seq)==str(new.seq)
1326 try:
1327 print ReadRocheXmlManifest(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))
1328 assert False, "Should fail!"
1329 except ValueError:
1330 pass
1331 k = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1332 """
1333
1334 print "Done"
1335