1
2
3
4
5
6
7 """Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.
8
9 SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
10 Biomedical Research and the Wellcome Trust Sanger Institute. SFF was also used
11 as the native output format from early versions of Ion Torrent's PGM platform
12 as well. You are expected to use this module via the Bio.SeqIO functions under
13 the format name "sff" (or "sff-trim" as described below).
14
15 For example, to iterate over the records in an SFF file,
16
17 >>> from Bio import SeqIO
18 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
19 ... print record.id, len(record), record.seq[:20]+"..."
20 E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
21 E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
22 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
23 E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
24 E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
25 E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
26 E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
27 E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
28 E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
29 E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...
30
31 Each SeqRecord object will contain all the annotation from the SFF file,
32 including the PHRED quality scores.
33
34 >>> print record.id, len(record)
35 E3MFGYR02F7Z7G 219
36 >>> print record.seq[:10], "..."
37 tcagAATCAT ...
38 >>> print record.letter_annotations["phred_quality"][:10], "..."
39 [22, 21, 23, 28, 26, 15, 12, 21, 28, 21] ...
40
41 Notice that the sequence is given in mixed case, the central upper case region
42 corresponds to the trimmed sequence. This matches the output of the Roche
43 tools (and the 3rd party tool sff_extract) for SFF to FASTA.
44
45 >>> print record.annotations["clip_qual_left"]
46 4
47 >>> print record.annotations["clip_qual_right"]
48 134
49 >>> print record.seq[:4]
50 tcag
51 >>> print record.seq[4:20], "...", record.seq[120:134]
52 AATCATCCACTTTTTA ... CAAAACACAAACAG
53 >>> print record.seq[134:]
54 atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn
55
56 The annotations dictionary also contains any adapter clip positions
57 (usually zero), and information about the flows. e.g.
58
59 >>> len(record.annotations)
60 11
61 >>> print record.annotations["flow_key"]
62 TCAG
63 >>> print record.annotations["flow_values"][:10], "..."
64 (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) ...
65 >>> print len(record.annotations["flow_values"])
66 400
67 >>> print record.annotations["flow_index"][:10], "..."
68 (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) ...
69 >>> print len(record.annotations["flow_index"])
70 219
71
72 Note that to convert from a raw reading in flow_values to the corresponding
73 homopolymer stretch estimate, the value should be rounded to the nearest 100:
74
75 >>> print [int(round(value, -2)) // 100
76 ... for value in record.annotations["flow_values"][:10]], '...'
77 [1, 0, 1, 0, 0, 1, 0, 1, 0, 2] ...
78
79 If a read name is exactly 14 alphanumeric characters, the annotations
80 dictionary will also contain meta-data about the read extracted by
81 interpretting the name as a 454 Sequencing System "Universal" Accession
82 Number. Note that if a read name happens to be exactly 14 alphanumeric
83 characters but was not generated automatically, these annotation records
84 will contain nonsense information.
85
86 >>> print record.annotations["region"]
87 2
88 >>> print record.annotations["time"]
89 [2008, 1, 9, 16, 16, 0]
90 >>> print record.annotations["coords"]
91 (2434, 1658)
92
93 As a convenience method, you can read the file with SeqIO format name "sff-trim"
94 instead of "sff" to get just the trimmed sequences (without any annotation
95 except for the PHRED quality scores and anything encoded in the read names):
96
97 >>> from Bio import SeqIO
98 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
99 ... print record.id, len(record), record.seq[:20]+"..."
100 E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
101 E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
102 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
103 E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
104 E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
105 E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
106 E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
107 E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
108 E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
109 E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...
110
111 Looking at the final record in more detail, note how this differs to the
112 example above:
113
114 >>> print record.id, len(record)
115 E3MFGYR02F7Z7G 130
116 >>> print record.seq[:10], "..."
117 AATCATCCAC ...
118 >>> print record.letter_annotations["phred_quality"][:10], "..."
119 [26, 15, 12, 21, 28, 21, 36, 28, 27, 27] ...
120 >>> len(record.annotations)
121 3
122 >>> print record.annotations["region"]
123 2
124 >>> print record.annotations["coords"]
125 (2434, 1658)
126 >>> print record.annotations["time"]
127 [2008, 1, 9, 16, 16, 0]
128
129 You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
130 reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.
131
132 >>> from Bio import SeqIO
133 >>> from StringIO import StringIO
134 >>> out_handle = StringIO()
135 >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
136 ... out_handle, "fastq")
137 >>> print "Converted %i records" % count
138 Converted 10 records
139
140 The output FASTQ file would start like this:
141
142 >>> print "%s..." % out_handle.getvalue()[:50]
143 @E3MFGYR02JWQ7T
144 tcagGGTCTACATGTTGGTTAACCCGTACTGATT...
145
146 Bio.SeqIO.index() provides memory efficient random access to the reads in an
147 SFF file by name. SFF files can include an index within the file, which can
148 be read in making this very fast. If the index is missing (or in a format not
149 yet supported in Biopython) the file is indexed by scanning all the reads -
150 which is a little slower. For example,
151
152 >>> from Bio import SeqIO
153 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
154 >>> record = reads["E3MFGYR02JHD4H"]
155 >>> print record.id, len(record), record.seq[:20]+"..."
156 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
157
158 Or, using the trimmed reads:
159
160 >>> from Bio import SeqIO
161 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
162 >>> record = reads["E3MFGYR02JHD4H"]
163 >>> print record.id, len(record), record.seq[:20]+"..."
164 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
165
166 You can also use the Bio.SeqIO.write() function with the "sff" format. Note
167 that this requires all the flow information etc, and thus is probably only
168 useful for SeqRecord objects originally from reading another SFF file (and
169 not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").
170
171 As an example, let's pretend this example SFF file represents some DNA which
172 was pre-amplified with a PCR primers AAAGANNNNN. The following script would
173 produce a sub-file containing all those reads whose post-quality clipping
174 region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
175 degenerate bit of this pretend primer):
176
177 >>> from Bio import SeqIO
178 >>> records = (record for record in
179 ... SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff","sff")
180 ... if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
181 >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
182 >>> print "Selected %i records" % count
183 Selected 2 records
184
185 Of course, for an assembly you would probably want to remove these primers.
186 If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
187 if you want SFF output we have to preserve all the flow information - the trick
188 is just to adjust the left clip position!
189
190 >>> from Bio import SeqIO
191 >>> def filter_and_trim(records, primer):
192 ... for record in records:
193 ... if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
194 ... record.annotations["clip_qual_left"] += len(primer)
195 ... yield record
196 >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
197 >>> count = SeqIO.write(filter_and_trim(records,"AAAGA"),
198 ... "temp_filtered.sff", "sff")
199 >>> print "Selected %i records" % count
200 Selected 2 records
201
202 We can check the results, note the lower case clipped region now includes the "AAAGA"
203 sequence:
204
205 >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
206 ... print record.id, len(record), record.seq[:20]+"..."
207 E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
208 E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
209 >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
210 ... print record.id, len(record), record.seq[:20]+"..."
211 E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
212 E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
213 >>> import os
214 >>> os.remove("temp_filtered.sff")
215
216 For a description of the file format, please see the Roche manuals and:
217 http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats
218
219 """
220 from Bio.SeqIO.Interfaces import SequenceWriter
221 from Bio import Alphabet
222 from Bio.Seq import Seq
223 from Bio.SeqRecord import SeqRecord
224 import struct
225 import sys
226 import re
227
228 from Bio._py3k import _bytes_to_string, _as_bytes
229 _null = _as_bytes("\0")
230 _sff = _as_bytes(".sff")
231 _hsh = _as_bytes(".hsh")
232 _srt = _as_bytes(".srt")
233 _mft = _as_bytes(".mft")
234 _flag = _as_bytes("\xff")
235
236
238 """Read in an SFF file header (PRIVATE).
239
240 Assumes the handle is at the start of the file, will read forwards
241 though the header and leave the handle pointing at the first record.
242 Returns a tuple of values from the header (header_length, index_offset,
243 index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)
244
245 >>> handle = open("Roche/greek.sff", "rb")
246 >>> values = _sff_file_header(handle)
247 >>> handle.close()
248 >>> print values[0]
249 840
250 >>> print values[1]
251 65040
252 >>> print values[2]
253 256
254 >>> print values[3]
255 24
256 >>> print values[4]
257 800
258 >>> values[-1]
259 'TCAG'
260
261 """
262 if hasattr(handle, "mode") and "U" in handle.mode.upper():
263 raise ValueError("SFF files must NOT be opened in universal new "
264 "lines mode. Binary mode is recommended (although "
265 "on Unix the default mode is also fine).")
266 elif hasattr(handle, "mode") and "B" not in handle.mode.upper() \
267 and sys.platform == "win32":
268 raise ValueError("SFF files must be opened in binary mode on Windows")
269
270
271
272
273
274
275
276
277
278
279
280
281 fmt = '>4s4BQIIHHHB'
282 assert 31 == struct.calcsize(fmt)
283 data = handle.read(31)
284 if not data:
285 raise ValueError("Empty file.")
286 elif len(data) < 13:
287 raise ValueError("File too small to hold a valid SFF header.")
288 magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
289 number_of_reads, header_length, key_length, number_of_flows_per_read, \
290 flowgram_format = struct.unpack(fmt, data)
291 if magic_number in [_hsh, _srt, _mft]:
292
293 raise ValueError("Handle seems to be at SFF index block, not start")
294 if magic_number != _sff:
295 raise ValueError("SFF file did not start '.sff', but %s"
296 % repr(magic_number))
297 if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
298 raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i"
299 % (ver0, ver1, ver2, ver3))
300 if flowgram_format != 1:
301 raise ValueError("Flowgram format code %i not supported"
302 % flowgram_format)
303 if (index_offset != 0) ^ (index_length != 0):
304 raise ValueError("Index offset %i but index length %i"
305 % (index_offset, index_length))
306 flow_chars = _bytes_to_string(handle.read(number_of_flows_per_read))
307 key_sequence = _bytes_to_string(handle.read(key_length))
308
309
310
311
312 assert header_length % 8 == 0
313 padding = header_length - number_of_flows_per_read - key_length - 31
314 assert 0 <= padding < 8, padding
315 if handle.read(padding).count(_null) != padding:
316 raise ValueError("Post header %i byte padding region contained data"
317 % padding)
318 return header_length, index_offset, index_length, \
319 number_of_reads, number_of_flows_per_read, \
320 flow_chars, key_sequence
321
322
324 """Generates an index by scanning though all the reads in an SFF file (PRIVATE).
325
326 This is a slow but generic approach if we can't parse the provided index
327 (if present).
328
329 Will use the handle seek/tell functions.
330 """
331 handle.seek(0)
332 header_length, index_offset, index_length, number_of_reads, \
333 number_of_flows_per_read, flow_chars, key_sequence \
334 = _sff_file_header(handle)
335
336 read_header_fmt = '>2HI4H'
337 read_header_size = struct.calcsize(read_header_fmt)
338
339 read_flow_fmt = ">%iH" % number_of_flows_per_read
340 read_flow_size = struct.calcsize(read_flow_fmt)
341 assert 1 == struct.calcsize(">B")
342 assert 1 == struct.calcsize(">s")
343 assert 1 == struct.calcsize(">c")
344 assert read_header_size % 8 == 0
345 for read in range(number_of_reads):
346 record_offset = handle.tell()
347 if record_offset == index_offset:
348
349 offset = index_offset + index_length
350 if offset % 8:
351 offset += 8 - (offset % 8)
352 assert offset % 8 == 0
353 handle.seek(offset)
354 record_offset = offset
355
356
357 data = handle.read(read_header_size)
358 read_header_length, name_length, seq_len, clip_qual_left, \
359 clip_qual_right, clip_adapter_left, clip_adapter_right \
360 = struct.unpack(read_header_fmt, data)
361 if read_header_length < 10 or read_header_length % 8 != 0:
362 raise ValueError("Malformed read header, says length is %i:\n%s"
363 % (read_header_length, repr(data)))
364
365 name = _bytes_to_string(handle.read(name_length))
366 padding = read_header_length - read_header_size - name_length
367 if handle.read(padding).count(_null) != padding:
368 raise ValueError("Post name %i byte padding region contained data"
369 % padding)
370 assert record_offset + read_header_length == handle.tell()
371
372 size = read_flow_size + 3 * seq_len
373 handle.seek(size, 1)
374
375 padding = size % 8
376 if padding:
377 padding = 8 - padding
378 if handle.read(padding).count(_null) != padding:
379 raise ValueError("Post quality %i byte padding region contained data"
380 % padding)
381
382 yield name, record_offset
383 if handle.tell() % 8 != 0:
384 raise ValueError(
385 "After scanning reads, did not end on a multiple of 8")
386
387
389 """Locate any existing Roche style XML meta data and read index (PRIVATE).
390
391 Makes a number of hard coded assumptions based on reverse engineered SFF
392 files from Roche 454 machines.
393
394 Returns a tuple of read count, SFF "index" offset and size, XML offset
395 and size, and the actual read index offset and size.
396
397 Raises a ValueError for unsupported or non-Roche index blocks.
398 """
399 handle.seek(0)
400 header_length, index_offset, index_length, number_of_reads, \
401 number_of_flows_per_read, flow_chars, key_sequence \
402 = _sff_file_header(handle)
403 assert handle.tell() == header_length
404 if not index_offset or not index_offset:
405 raise ValueError("No index present in this SFF file")
406
407 handle.seek(index_offset)
408 fmt = ">4s4B"
409 fmt_size = struct.calcsize(fmt)
410 data = handle.read(fmt_size)
411 if not data:
412 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found nothing"
413 % (index_length, index_offset))
414 if len(data) < fmt_size:
415 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found %s"
416 % (index_length, index_offset, repr(data)))
417 magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
418 if magic_number == _mft:
419
420
421
422 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
423
424 raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i"
425 % (ver0, ver1, ver2, ver3))
426 fmt2 = ">LL"
427 fmt2_size = struct.calcsize(fmt2)
428 xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
429 if index_length != fmt_size + fmt2_size + xml_size + data_size:
430 raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i"
431 % (index_length, fmt_size, fmt2_size, xml_size, data_size))
432 return number_of_reads, header_length, \
433 index_offset, index_length, \
434 index_offset + fmt_size + fmt2_size, xml_size, \
435 index_offset + fmt_size + fmt2_size + xml_size, data_size
436 elif magic_number == _srt:
437
438
439
440 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
441
442 raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i"
443 % (ver0, ver1, ver2, ver3))
444 data = handle.read(4)
445 if data != _null * 4:
446 raise ValueError(
447 "Did not find expected null four bytes in .srt index")
448 return number_of_reads, header_length, \
449 index_offset, index_length, \
450 0, 0, \
451 index_offset + fmt_size + 4, index_length - fmt_size - 4
452 elif magic_number == _hsh:
453 raise ValueError("Hash table style indexes (.hsh) in SFF files are "
454 "not (yet) supported")
455 else:
456 raise ValueError("Unknown magic number %s in SFF index header:\n%s"
457 % (repr(magic_number), repr(data)))
458
459
461 """Reads any Roche style XML manifest data in the SFF "index".
462
463 The SFF file format allows for multiple different index blocks, and Roche
464 took advantage of this to define their own index block which also embeds
465 an XML manifest string. This is not a publically documented extension to
466 the SFF file format, this was reverse engineered.
467
468 The handle should be to an SFF file opened in binary mode. This function
469 will use the handle seek/tell functions and leave the handle in an
470 arbitrary location.
471
472 Any XML manifest found is returned as a Python string, which you can then
473 parse as appropriate, or reuse when writing out SFF files with the
474 SffWriter class.
475
476 Returns a string, or raises a ValueError if an Roche manifest could not be
477 found.
478 """
479 number_of_reads, header_length, index_offset, index_length, xml_offset, \
480 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(
481 handle)
482 if not xml_offset or not xml_size:
483 raise ValueError("No XML manifest found")
484 handle.seek(xml_offset)
485 return _bytes_to_string(handle.read(xml_size))
486
487
488
490 """Reads any existing Roche style read index provided in the SFF file (PRIVATE).
491
492 Will use the handle seek/tell functions.
493
494 This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.
495
496 Roche SFF indices use base 255 not 256, meaning we see bytes in range the
497 range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
498 can be used as a marker character to separate entries (required if the
499 read name lengths vary).
500
501 Note that since only four bytes are used for the read offset, this is
502 limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
503 tool to combine SFF files beyound this limit, they issue a warning and
504 omit the index (and manifest).
505 """
506 number_of_reads, header_length, index_offset, index_length, xml_offset, \
507 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(
508 handle)
509
510 handle.seek(read_index_offset)
511 fmt = ">5B"
512 for read in range(number_of_reads):
513
514 data = handle.read(6)
515 while True:
516 more = handle.read(1)
517 if not more:
518 raise ValueError("Premature end of file!")
519 data += more
520 if more == _flag:
521 break
522 assert data[-1:] == _flag, data[-1:]
523 name = _bytes_to_string(data[:-6])
524 off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
525 offset = off0 + 255 * off1 + 65025 * off2 + 16581375 * off3
526 if off4:
527
528
529
530 raise ValueError("Expected a null terminator to the read name.")
531 yield name, offset
532 if handle.tell() != read_index_offset + read_index_size:
533 raise ValueError("Problem with index length? %i vs %i"
534 % (handle.tell(), read_index_offset + read_index_size))
535
536 _valid_UAN_read_name = re.compile(r'^[a-zA-Z0-9]{14}$')
537
538
539 -def _sff_read_seq_record(handle, number_of_flows_per_read, flow_chars,
540 key_sequence, alphabet, trim=False):
541 """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
542
543
544
545
546
547
548
549
550
551
552 read_header_fmt = '>2HI4H'
553 read_header_size = struct.calcsize(read_header_fmt)
554 read_flow_fmt = ">%iH" % number_of_flows_per_read
555 read_flow_size = struct.calcsize(read_flow_fmt)
556
557 read_header_length, name_length, seq_len, clip_qual_left, \
558 clip_qual_right, clip_adapter_left, clip_adapter_right \
559 = struct.unpack(read_header_fmt, handle.read(read_header_size))
560 if clip_qual_left:
561 clip_qual_left -= 1
562 if clip_adapter_left:
563 clip_adapter_left -= 1
564 if read_header_length < 10 or read_header_length % 8 != 0:
565 raise ValueError("Malformed read header, says length is %i"
566 % read_header_length)
567
568 name = _bytes_to_string(handle.read(name_length))
569 padding = read_header_length - read_header_size - name_length
570 if handle.read(padding).count(_null) != padding:
571 raise ValueError("Post name %i byte padding region contained data"
572 % padding)
573
574
575 flow_values = handle.read(read_flow_size)
576 temp_fmt = ">%iB" % seq_len
577 flow_index = handle.read(seq_len)
578 seq = _bytes_to_string(handle.read(seq_len))
579 quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
580
581 padding = (read_flow_size + seq_len * 3) % 8
582 if padding:
583 padding = 8 - padding
584 if handle.read(padding).count(_null) != padding:
585 raise ValueError("Post quality %i byte padding region contained data"
586 % padding)
587
588
589
590 clip_left = max(clip_qual_left, clip_adapter_left)
591
592 if clip_qual_right:
593 if clip_adapter_right:
594 clip_right = min(clip_qual_right, clip_adapter_right)
595 else:
596
597 clip_right = clip_qual_right
598 elif clip_adapter_right:
599 clip_right = clip_adapter_right
600 else:
601 clip_right = seq_len
602
603 if trim:
604 seq = seq[clip_left:clip_right].upper()
605 quals = quals[clip_left:clip_right]
606
607 annotations = {}
608 else:
609
610 seq = seq[:clip_left].lower() + \
611 seq[clip_left:clip_right].upper() + \
612 seq[clip_right:].lower()
613 annotations = {"flow_values": struct.unpack(read_flow_fmt, flow_values),
614 "flow_index": struct.unpack(temp_fmt, flow_index),
615 "flow_chars": flow_chars,
616 "flow_key": key_sequence,
617 "clip_qual_left": clip_qual_left,
618 "clip_qual_right": clip_qual_right,
619 "clip_adapter_left": clip_adapter_left,
620 "clip_adapter_right": clip_adapter_right}
621 if re.match(_valid_UAN_read_name, name):
622 annotations["time"] = _get_read_time(name)
623 annotations["region"] = _get_read_region(name)
624 annotations["coords"] = _get_read_xy(name)
625 record = SeqRecord(Seq(seq, alphabet),
626 id=name,
627 name=name,
628 description="",
629 annotations=annotations)
630
631
632 dict.__setitem__(record._per_letter_annotations,
633 "phred_quality", quals)
634
635 return record
636
637 _powers_of_36 = [36 ** i for i in range(6)]
638
639
641 """Interpret a string as a base-36 number as per 454 manual."""
642 total = 0
643 for c, power in zip(string[::-1], _powers_of_36):
644
645
646
647 if 48 <= ord(c) <= 57:
648 val = ord(c) - 22
649 elif 65 <= ord(c) <= 90:
650 val = ord(c) - 65
651 elif 97 <= ord(c) <= 122:
652 val = ord(c) - 97
653 else:
654
655 val = 0
656 total += val * power
657 return total
658
659
661 """Extract coordinates from last 5 characters of read name."""
662 number = _string_as_base_36(read_name[9:])
663 return divmod(number, 4096)
664
665 _time_denominators = [13 * 32 * 24 * 60 * 60,
666 32 * 24 * 60 * 60,
667 24 * 60 * 60,
668 60 * 60,
669 60]
670
671
673 """Extract time from first 6 characters of read name."""
674 time_list = []
675 remainder = _string_as_base_36(read_name[:6])
676 for denominator in _time_denominators:
677 this_term, remainder = divmod(remainder, denominator)
678 time_list.append(this_term)
679 time_list.append(remainder)
680 time_list[0] += 2000
681 return time_list
682
683
685 """Extract region from read name."""
686 return int(read_name[8])
687
688
690 """Extract the next read in the file as a raw (bytes) string (PRIVATE)."""
691 read_header_fmt = '>2HI'
692 read_header_size = struct.calcsize(read_header_fmt)
693 read_flow_fmt = ">%iH" % number_of_flows_per_read
694 read_flow_size = struct.calcsize(read_flow_fmt)
695
696 raw = handle.read(read_header_size)
697 read_header_length, name_length, seq_len \
698 = struct.unpack(read_header_fmt, raw)
699 if read_header_length < 10 or read_header_length % 8 != 0:
700 raise ValueError("Malformed read header, says length is %i"
701 % read_header_length)
702
703 raw += handle.read(8 + name_length)
704
705 padding = read_header_length - read_header_size - 8 - name_length
706 pad = handle.read(padding)
707 if pad.count(_null) != padding:
708 raise ValueError("Post name %i byte padding region contained data"
709 % padding)
710 raw += pad
711
712 raw += handle.read(read_flow_size + seq_len * 3)
713 padding = (read_flow_size + seq_len * 3) % 8
714
715 if padding:
716 padding = 8 - padding
717 pad = handle.read(padding)
718 if pad.count(_null) != padding:
719 raise ValueError("Post quality %i byte padding region contained data"
720 % padding)
721 raw += pad
722
723 return raw
724
725
727 """Wrapper for handles which do not support the tell method (PRIVATE).
728
729 Intended for use with things like network handles where tell (and reverse
730 seek) are not supported. The SFF file needs to track the current offset in
731 order to deal with the index block.
732 """
734 self._handle = handle
735 self._offset = 0
736
737 - def read(self, length):
741
744
745 - def seek(self, offset):
746 if offset < self._offset:
747 raise RunTimeError("Can't seek backwards")
748 self._handle.read(offset - self._offset)
749
751 return self._handle.close()
752
753
754
756 """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
757
758 handle - input file, an SFF file, e.g. from Roche 454 sequencing.
759 This must NOT be opened in universal read lines mode!
760 alphabet - optional alphabet, defaults to generic DNA.
761 trim - should the sequences be trimmed?
762
763 The resulting SeqRecord objects should match those from a paired FASTA
764 and QUAL file converted from the SFF file using the Roche 454 tool
765 ssfinfo. i.e. The sequence will be mixed case, with the trim regions
766 shown in lower case.
767
768 This function is used internally via the Bio.SeqIO functions:
769
770 >>> from Bio import SeqIO
771 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
772 >>> for record in SeqIO.parse(handle, "sff"):
773 ... print record.id, len(record)
774 E3MFGYR02JWQ7T 265
775 E3MFGYR02JA6IL 271
776 E3MFGYR02JHD4H 310
777 E3MFGYR02GFKUC 299
778 E3MFGYR02FTGED 281
779 E3MFGYR02FR9G7 261
780 E3MFGYR02GAZMS 278
781 E3MFGYR02HHZ8O 221
782 E3MFGYR02GPGB1 269
783 E3MFGYR02F7Z7G 219
784 >>> handle.close()
785
786 You can also call it directly:
787
788 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
789 >>> for record in SffIterator(handle):
790 ... print record.id, len(record)
791 E3MFGYR02JWQ7T 265
792 E3MFGYR02JA6IL 271
793 E3MFGYR02JHD4H 310
794 E3MFGYR02GFKUC 299
795 E3MFGYR02FTGED 281
796 E3MFGYR02FR9G7 261
797 E3MFGYR02GAZMS 278
798 E3MFGYR02HHZ8O 221
799 E3MFGYR02GPGB1 269
800 E3MFGYR02F7Z7G 219
801 >>> handle.close()
802
803 Or, with the trim option:
804
805 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
806 >>> for record in SffIterator(handle, trim=True):
807 ... print record.id, len(record)
808 E3MFGYR02JWQ7T 260
809 E3MFGYR02JA6IL 265
810 E3MFGYR02JHD4H 292
811 E3MFGYR02GFKUC 295
812 E3MFGYR02FTGED 277
813 E3MFGYR02FR9G7 256
814 E3MFGYR02GAZMS 271
815 E3MFGYR02HHZ8O 150
816 E3MFGYR02GPGB1 221
817 E3MFGYR02F7Z7G 130
818 >>> handle.close()
819
820 """
821 if isinstance(Alphabet._get_base_alphabet(alphabet),
822 Alphabet.ProteinAlphabet):
823 raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
824 if isinstance(Alphabet._get_base_alphabet(alphabet),
825 Alphabet.RNAAlphabet):
826 raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
827 try:
828 assert 0 == handle.tell()
829 except AttributeError:
830
831 handle = _AddTellHandle(handle)
832 header_length, index_offset, index_length, number_of_reads, \
833 number_of_flows_per_read, flow_chars, key_sequence \
834 = _sff_file_header(handle)
835
836
837
838
839
840
841
842
843
844
845 read_header_fmt = '>2HI4H'
846 read_header_size = struct.calcsize(read_header_fmt)
847 read_flow_fmt = ">%iH" % number_of_flows_per_read
848 read_flow_size = struct.calcsize(read_flow_fmt)
849 assert 1 == struct.calcsize(">B")
850 assert 1 == struct.calcsize(">s")
851 assert 1 == struct.calcsize(">c")
852 assert read_header_size % 8 == 0
853
854
855
856 for read in range(number_of_reads):
857 if index_offset and handle.tell() == index_offset:
858 offset = index_offset + index_length
859 if offset % 8:
860 offset += 8 - (offset % 8)
861 assert offset % 8 == 0
862 handle.seek(offset)
863
864
865 index_offset = 0
866 yield _sff_read_seq_record(handle,
867 number_of_flows_per_read,
868 flow_chars,
869 key_sequence,
870 alphabet,
871 trim)
872
873
874 if index_offset and handle.tell() == index_offset:
875 offset = index_offset + index_length
876 if offset % 8:
877 offset += 8 - (offset % 8)
878 assert offset % 8 == 0
879 handle.seek(offset)
880
881 if handle.read(1):
882 raise ValueError("Additional data at end of SFF file")
883
884
885
887 """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE)."""
888 return SffIterator(handle, alphabet, trim=True)
889
890
892 """SFF file writer."""
893
894 - def __init__(self, handle, index=True, xml=None):
895 """Creates the writer object.
896
897 handle - Output handle, ideally in binary write mode.
898 index - Boolean argument, should we try and write an index?
899 xml - Optional string argument, xml manifest to be recorded in the index
900 block (see function ReadRocheXmlManifest for reading this data).
901 """
902 if hasattr(handle, "mode") and "U" in handle.mode.upper():
903 raise ValueError("SFF files must NOT be opened in universal new "
904 "lines mode. Binary mode is required")
905 elif hasattr(handle, "mode") and "B" not in handle.mode.upper():
906 raise ValueError("SFF files must be opened in binary mode")
907 self.handle = handle
908 self._xml = xml
909 if index:
910 self._index = []
911 else:
912 self._index = None
913
915 """Use this to write an entire file containing the given records."""
916 try:
917 self._number_of_reads = len(records)
918 except TypeError:
919 self._number_of_reads = 0
920 if not hasattr(self.handle, "seek") \
921 or not hasattr(self.handle, "tell"):
922 raise ValueError("A handle with a seek/tell methods is "
923 "required in order to record the total "
924 "record count in the file header (once it "
925 "is known at the end).")
926 if self._index is not None and \
927 not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
928 import warnings
929 warnings.warn("A handle with a seek/tell methods is required in "
930 "order to record an SFF index.")
931 self._index = None
932 self._index_start = 0
933 self._index_length = 0
934 if not hasattr(records, "next"):
935 records = iter(records)
936
937
938 try:
939 record = records.next()
940 except StopIteration:
941 record = None
942 if record is None:
943
944
945
946 raise ValueError("Must have at least one sequence")
947 try:
948 self._key_sequence = _as_bytes(record.annotations["flow_key"])
949 self._flow_chars = _as_bytes(record.annotations["flow_chars"])
950 self._number_of_flows_per_read = len(self._flow_chars)
951 except KeyError:
952 raise ValueError("Missing SFF flow information")
953 self.write_header()
954 self.write_record(record)
955 count = 1
956 for record in records:
957 self.write_record(record)
958 count += 1
959 if self._number_of_reads == 0:
960
961 offset = self.handle.tell()
962 self.handle.seek(0)
963 self._number_of_reads = count
964 self.write_header()
965 self.handle.seek(offset)
966 else:
967 assert count == self._number_of_reads
968 if self._index is not None:
969 self._write_index()
970 return count
971
973 assert len(self._index) == self._number_of_reads
974 handle = self.handle
975 self._index.sort()
976 self._index_start = handle.tell()
977
978 if self._xml is not None:
979 xml = _as_bytes(self._xml)
980 else:
981 from Bio import __version__
982 xml = "<!-- This file was output with Biopython %s -->\n" % __version__
983 xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
984 xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
985 xml = _as_bytes(xml)
986 xml_len = len(xml)
987
988 fmt = ">I4BLL"
989 fmt_size = struct.calcsize(fmt)
990 handle.write(_null * fmt_size + xml)
991 fmt2 = ">6B"
992 assert 6 == struct.calcsize(fmt2)
993 self._index.sort()
994 index_len = 0
995 for name, offset in self._index:
996
997
998
999 off3 = offset
1000 off0 = off3 % 255
1001 off3 -= off0
1002 off1 = off3 % 65025
1003 off3 -= off1
1004 off2 = off3 % 16581375
1005 off3 -= off2
1006 assert offset == off0 + off1 + off2 + off3, \
1007 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
1008 off3, off2, off1, off0 = off3 // 16581375, off2 // 65025, \
1009 off1 // 255, off0
1010 assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
1011 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
1012 handle.write(name + struct.pack(fmt2, 0,
1013 off3, off2, off1, off0, 255))
1014 index_len += len(name) + 6
1015
1016 self._index_length = fmt_size + xml_len + index_len
1017
1018
1019
1020 if self._index_length % 8:
1021 padding = 8 - (self._index_length % 8)
1022 handle.write(_null * padding)
1023 else:
1024 padding = 0
1025 offset = handle.tell()
1026 assert offset == self._index_start + self._index_length + padding, \
1027 "%i vs %i + %i + %i" % (offset, self._index_start,
1028 self._index_length, padding)
1029
1030 handle.seek(self._index_start)
1031 handle.write(struct.pack(fmt, 778921588,
1032 49, 46, 48, 48,
1033 xml_len, index_len) + xml)
1034
1035 handle.seek(0)
1036 self.write_header()
1037 handle.seek(offset)
1038
1040
1041 key_length = len(self._key_sequence)
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054 fmt = '>I4BQIIHHHB%is%is' % (
1055 self._number_of_flows_per_read, key_length)
1056
1057
1058
1059
1060 if struct.calcsize(fmt) % 8 == 0:
1061 padding = 0
1062 else:
1063 padding = 8 - (struct.calcsize(fmt) % 8)
1064 header_length = struct.calcsize(fmt) + padding
1065 assert header_length % 8 == 0
1066 header = struct.pack(fmt, 779314790,
1067 0, 0, 0, 1,
1068 self._index_start, self._index_length,
1069 self._number_of_reads,
1070 header_length, key_length,
1071 self._number_of_flows_per_read,
1072 1,
1073 self._flow_chars, self._key_sequence)
1074 self.handle.write(header + _null * padding)
1075
1077 """Write a single additional record to the output file.
1078
1079 This assumes the header has been done.
1080 """
1081
1082 name = _as_bytes(record.id)
1083 name_len = len(name)
1084 seq = _as_bytes(str(record.seq).upper())
1085 seq_len = len(seq)
1086
1087 try:
1088 quals = record.letter_annotations["phred_quality"]
1089 except KeyError:
1090 raise ValueError("Missing PHRED qualities information")
1091
1092 try:
1093 flow_values = record.annotations["flow_values"]
1094 flow_index = record.annotations["flow_index"]
1095 if self._key_sequence != _as_bytes(record.annotations["flow_key"]) \
1096 or self._flow_chars != _as_bytes(record.annotations["flow_chars"]):
1097 raise ValueError("Records have inconsistent SFF flow data")
1098 except KeyError:
1099 raise ValueError("Missing SFF flow information")
1100 except AttributeError:
1101 raise ValueError("Header not written yet?")
1102
1103 try:
1104 clip_qual_left = record.annotations["clip_qual_left"]
1105 if clip_qual_left:
1106 clip_qual_left += 1
1107 clip_qual_right = record.annotations["clip_qual_right"]
1108 clip_adapter_left = record.annotations["clip_adapter_left"]
1109 if clip_adapter_left:
1110 clip_adapter_left += 1
1111 clip_adapter_right = record.annotations["clip_adapter_right"]
1112 except KeyError:
1113 raise ValueError("Missing SFF clipping information")
1114
1115
1116 if self._index is not None:
1117 offset = self.handle.tell()
1118
1119
1120
1121
1122 if offset > 4228250624:
1123 import warnings
1124 warnings.warn("Read %s has file offset %i, which is too large "
1125 "to store in the Roche SFF index structure. No "
1126 "index block will be recorded." % (name, offset))
1127
1128 self._index = None
1129 else:
1130 self._index.append((name, self.handle.tell()))
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146 read_header_fmt = '>2HI4H%is' % name_len
1147 if struct.calcsize(read_header_fmt) % 8 == 0:
1148 padding = 0
1149 else:
1150 padding = 8 - (struct.calcsize(read_header_fmt) % 8)
1151 read_header_length = struct.calcsize(read_header_fmt) + padding
1152 assert read_header_length % 8 == 0
1153 data = struct.pack(read_header_fmt,
1154 read_header_length,
1155 name_len, seq_len,
1156 clip_qual_left, clip_qual_right,
1157 clip_adapter_left, clip_adapter_right,
1158 name) + _null * padding
1159 assert len(data) == read_header_length
1160
1161
1162 read_flow_fmt = ">%iH" % self._number_of_flows_per_read
1163 read_flow_size = struct.calcsize(read_flow_fmt)
1164 temp_fmt = ">%iB" % seq_len
1165 data += struct.pack(read_flow_fmt, *flow_values) \
1166 + struct.pack(temp_fmt, *flow_index) \
1167 + seq \
1168 + struct.pack(temp_fmt, *quals)
1169
1170 padding = (read_flow_size + seq_len * 3) % 8
1171 if padding:
1172 padding = 8 - padding
1173 self.handle.write(data + _null * padding)
1174
1175
1176 if __name__ == "__main__":
1177 print "Running quick self test"
1178 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1179 metadata = ReadRocheXmlManifest(open(filename, "rb"))
1180 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1181 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1182 assert index1 == index2
1183 assert len(index1) == len(list(SffIterator(open(filename, "rb"))))
1184 from StringIO import StringIO
1185 try:
1186
1187 from io import BytesIO
1188 except ImportError:
1189 BytesIO = StringIO
1190 assert len(index1) == len(
1191 list(SffIterator(BytesIO(open(filename, "rb").read()))))
1192
1193 if sys.platform != "win32":
1194 assert len(index1) == len(list(SffIterator(open(filename, "r"))))
1195 index2 = sorted(_sff_read_roche_index(open(filename)))
1196 assert index1 == index2
1197 index2 = sorted(_sff_do_slow_index(open(filename)))
1198 assert index1 == index2
1199 assert len(index1) == len(list(SffIterator(open(filename))))
1200 assert len(index1) == len(
1201 list(SffIterator(BytesIO(open(filename, "r").read()))))
1202 assert len(
1203 index1) == len(list(SffIterator(BytesIO(open(filename).read()))))
1204
1205 sff = list(SffIterator(open(filename, "rb")))
1206
1207 sff2 = list(SffIterator(
1208 open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1209 assert len(sff) == len(sff2)
1210 for old, new in zip(sff, sff2):
1211 assert old.id == new.id
1212 assert str(old.seq) == str(new.seq)
1213
1214 sff2 = list(SffIterator(
1215 open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1216 assert len(sff) == len(sff2)
1217 for old, new in zip(sff, sff2):
1218 assert old.id == new.id
1219 assert str(old.seq) == str(new.seq)
1220
1221 sff2 = list(SffIterator(
1222 open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1223 assert len(sff) == len(sff2)
1224 for old, new in zip(sff, sff2):
1225 assert old.id == new.id
1226 assert str(old.seq) == str(new.seq)
1227
1228 sff2 = list(SffIterator(
1229 open("../../Tests/Roche/E3MFGYR02_index_at_start.sff", "rb")))
1230 assert len(sff) == len(sff2)
1231 for old, new in zip(sff, sff2):
1232 assert old.id == new.id
1233 assert str(old.seq) == str(new.seq)
1234
1235 sff2 = list(SffIterator(
1236 open("../../Tests/Roche/E3MFGYR02_index_in_middle.sff", "rb")))
1237 assert len(sff) == len(sff2)
1238 for old, new in zip(sff, sff2):
1239 assert old.id == new.id
1240 assert str(old.seq) == str(new.seq)
1241
1242 sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
1243
1244 print ReadRocheXmlManifest(open(filename, "rb"))
1245
1246 from Bio import SeqIO
1247 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.fasta"
1248 fasta_no_trim = list(SeqIO.parse(open(filename, "rU"), "fasta"))
1249 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.qual"
1250 qual_no_trim = list(SeqIO.parse(open(filename, "rU"), "qual"))
1251
1252 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.fasta"
1253 fasta_trim = list(SeqIO.parse(open(filename, "rU"), "fasta"))
1254 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.qual"
1255 qual_trim = list(SeqIO.parse(open(filename, "rU"), "qual"))
1256
1257 for s, sT, f, q, fT, qT in zip(sff, sff_trim, fasta_no_trim,
1258 qual_no_trim, fasta_trim, qual_trim):
1259
1260 print s.id
1261
1262
1263
1264 assert s.id == f.id == q.id
1265 assert str(s.seq) == str(f.seq)
1266 assert s.letter_annotations[
1267 "phred_quality"] == q.letter_annotations["phred_quality"]
1268
1269 assert s.id == sT.id == fT.id == qT.id
1270 assert str(sT.seq) == str(fT.seq)
1271 assert sT.letter_annotations[
1272 "phred_quality"] == qT.letter_annotations["phred_quality"]
1273
1274 print "Writing with a list of SeqRecords..."
1275 handle = StringIO()
1276 w = SffWriter(handle, xml=metadata)
1277 w.write_file(sff)
1278 data = handle.getvalue()
1279 print "And again with an iterator..."
1280 handle = StringIO()
1281 w = SffWriter(handle, xml=metadata)
1282 w.write_file(iter(sff))
1283 assert data == handle.getvalue()
1284
1285 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1286 original = open(filename, "rb").read()
1287 assert len(data) == len(original)
1288 assert data == original
1289 del data
1290 handle.close()
1291
1292 print "-" * 50
1293 filename = "../../Tests/Roche/greek.sff"
1294 for record in SffIterator(open(filename, "rb")):
1295 print record.id
1296 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1297 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1298 assert index1 == index2
1299 try:
1300 print ReadRocheXmlManifest(open(filename, "rb"))
1301 assert False, "Should fail!"
1302 except ValueError:
1303 pass
1304
1305 handle = open(filename, "rb")
1306 for record in SffIterator(handle):
1307 pass
1308 try:
1309 for record in SffIterator(handle):
1310 print record.id
1311 assert False, "Should have failed"
1312 except ValueError, err:
1313 print "Checking what happens on re-reading a handle:"
1314 print err
1315
1316 """
1317 #Ugly code to make test files...
1318 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1319 padding = len(index)%8
1320 if padding:
1321 padding = 8 - padding
1322 index += chr(0)*padding
1323 assert len(index)%8 == 0
1324
1325 #Ugly bit of code to make a fake index at start
1326 records = list(SffIterator(
1327 open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1328 out_handle = open(
1329 "../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w")
1330 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1331 padding = len(index)%8
1332 if padding:
1333 padding = 8 - padding
1334 index += chr(0)*padding
1335 w = SffWriter(out_handle, index=False, xml=None)
1336 #Fake the header...
1337 w._number_of_reads = len(records)
1338 w._index_start = 0
1339 w._index_length = 0
1340 w._key_sequence = records[0].annotations["flow_key"]
1341 w._flow_chars = records[0].annotations["flow_chars"]
1342 w._number_of_flows_per_read = len(w._flow_chars)
1343 w.write_header()
1344 w._index_start = out_handle.tell()
1345 w._index_length = len(index)
1346 out_handle.seek(0)
1347 w.write_header() #this time with index info
1348 w.handle.write(index)
1349 for record in records:
1350 w.write_record(record)
1351 out_handle.close()
1352 records2 = list(SffIterator(
1353 open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1354 for old, new in zip(records, records2):
1355 assert str(old.seq)==str(new.seq)
1356 i = list(_sff_do_slow_index(
1357 open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1358
1359 #Ugly bit of code to make a fake index in middle
1360 records = list(SffIterator(
1361 open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1362 out_handle = open(
1363 "../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w")
1364 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1365 padding = len(index)%8
1366 if padding:
1367 padding = 8 - padding
1368 index += chr(0)*padding
1369 w = SffWriter(out_handle, index=False, xml=None)
1370 #Fake the header...
1371 w._number_of_reads = len(records)
1372 w._index_start = 0
1373 w._index_length = 0
1374 w._key_sequence = records[0].annotations["flow_key"]
1375 w._flow_chars = records[0].annotations["flow_chars"]
1376 w._number_of_flows_per_read = len(w._flow_chars)
1377 w.write_header()
1378 for record in records[:5]:
1379 w.write_record(record)
1380 w._index_start = out_handle.tell()
1381 w._index_length = len(index)
1382 w.handle.write(index)
1383 for record in records[5:]:
1384 w.write_record(record)
1385 out_handle.seek(0)
1386 w.write_header() #this time with index info
1387 out_handle.close()
1388 records2 = list(SffIterator(
1389 open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1390 for old, new in zip(records, records2):
1391 assert str(old.seq)==str(new.seq)
1392 j = list(_sff_do_slow_index(
1393 open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1394
1395 #Ugly bit of code to make a fake index at end
1396 records = list(SffIterator(
1397 open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1398 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w")
1399 w = SffWriter(out_handle, index=False, xml=None)
1400 #Fake the header...
1401 w._number_of_reads = len(records)
1402 w._index_start = 0
1403 w._index_length = 0
1404 w._key_sequence = records[0].annotations["flow_key"]
1405 w._flow_chars = records[0].annotations["flow_chars"]
1406 w._number_of_flows_per_read = len(w._flow_chars)
1407 w.write_header()
1408 for record in records:
1409 w.write_record(record)
1410 w._index_start = out_handle.tell()
1411 w._index_length = len(index)
1412 out_handle.write(index)
1413 out_handle.seek(0)
1414 w.write_header() #this time with index info
1415 out_handle.close()
1416 records2 = list(SffIterator(
1417 open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1418 for old, new in zip(records, records2):
1419 assert str(old.seq)==str(new.seq)
1420 try:
1421 print ReadRocheXmlManifest(
1422 open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))
1423 assert False, "Should fail!"
1424 except ValueError:
1425 pass
1426 k = list(_sff_do_slow_index(
1427 open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1428 """
1429
1430 print "Done"
1431