1
2
3
4
5 """Dictionary like indexing of sequence files (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.index(...) function which is the
9 public interface for this functionality.
10
11 The basic idea is that we scan over a sequence file, looking for new record
12 markers. We then try and extract the string that Bio.SeqIO.parse/read would
13 use as the record id, ideally without actually parsing the full record. We
14 then use a subclassed Python dictionary to record the file offset for the
15 record start against the record id.
16
17 Note that this means full parsing is on demand, so any invalid or problem
18 record may not trigger an exception until it is accessed. This is by design.
19
20 This means our dictionary like objects have in memory ALL the keys (all the
21 record identifiers), which shouldn't be a problem even with second generation
22 sequencing. If this is an issue later on, storing the keys and offsets in a
23 temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
24 """
25
26 import os
27 try:
28 from collections import UserDict as _dict_base
29 except ImportError:
30 from UserDict import DictMixin as _dict_base
31 import re
32 import itertools
33 from StringIO import StringIO
34
35 try:
36 from sqlite3 import dbapi2 as _sqlite
37 from sqlite3 import IntegrityError as _IntegrityError
38 from sqlite3 import OperationalError as _OperationalError
39 except ImportError:
40
41
42 _sqlite = None
43 pass
44
45 from Bio._py3k import _bytes_to_string, _as_bytes, _as_string
46
47 from Bio import SeqIO
48 from Bio import Alphabet
49
51 """Read only dictionary interface to a sequential sequence file.
52
53 Keeps the keys and associated file offsets in memory, reads the file to
54 access entries as SeqRecord objects using Bio.SeqIO for parsing them.
55 This approach is memory limited, but will work even with millions of
56 sequences.
57
58 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys
59 (record identifiers by default) are not allowed. If this happens,
60 a ValueError exception is raised.
61
62 By default the SeqRecord's id string is used as the dictionary
63 key. This can be changed by suppling an optional key_function,
64 a callback function which will be given the record id and must
65 return the desired key. For example, this allows you to parse
66 NCBI style FASTA identifiers, and extract the GI number to use
67 as the dictionary key.
68
69 Note that this dictionary is essentially read only. You cannot
70 add or change values, pop values, nor clear the dictionary.
71 """
72 - def __init__(self, filename, format, alphabet, key_function):
73
74 try:
75 proxy_class = _FormatToRandomAccess[format]
76 except KeyError:
77 raise ValueError("Unsupported format '%s'" % format)
78 random_access_proxy = proxy_class(filename, format, alphabet)
79 self._proxy = random_access_proxy
80 self._key_function = key_function
81 if key_function:
82 offset_iter = ((key_function(k),o,l) for (k,o,l) in random_access_proxy)
83 else:
84 offset_iter = random_access_proxy
85 offsets = {}
86 for key, offset, length in offset_iter:
87
88
89
90 if key in offsets:
91 self._proxy._handle.close()
92 raise ValueError("Duplicate key '%s'" % key)
93 else:
94 offsets[key] = offset
95 self._offsets = offsets
96
98 return "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" \
99 % (self._proxy._handle.name, self._proxy._format,
100 self._proxy._alphabet, self._key_function)
101
103 if self:
104 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0])
105 else:
106 return "{}"
107
109 return key in self._offsets
110
112 """How many records are there?"""
113 return len(self._offsets)
114
115 if hasattr(dict, "iteritems"):
116
118 """Would be a list of the SeqRecord objects, but not implemented.
119
120 In general you can be indexing very very large files, with millions
121 of sequences. Loading all these into memory at once as SeqRecord
122 objects would (probably) use up all the RAM. Therefore we simply
123 don't support this dictionary method.
124 """
125 raise NotImplementedError("Due to memory concerns, when indexing a "
126 "sequence file you cannot access all the "
127 "records at once.")
128
130 """Would be a list of the (key, SeqRecord) tuples, but not implemented.
131
132 In general you can be indexing very very large files, with millions
133 of sequences. Loading all these into memory at once as SeqRecord
134 objects would (probably) use up all the RAM. Therefore we simply
135 don't support this dictionary method.
136 """
137 raise NotImplementedError("Due to memory concerns, when indexing a "
138 "sequence file you cannot access all the "
139 "records at once.")
140
142 """Return a list of all the keys (SeqRecord identifiers)."""
143
144 return self._offsets.keys()
145
147 """Iterate over the SeqRecord) items."""
148 for key in self.__iter__():
149 yield self.__getitem__(key)
150
152 """Iterate over the (key, SeqRecord) items."""
153 for key in self.__iter__():
154 yield key, self.__getitem__(key)
155
157 """Iterate over the keys."""
158 return self.__iter__()
159
160 else:
161
163 """Iterate over the (key, SeqRecord) items."""
164 for key in self.__iter__():
165 yield key, self.__getitem__(key)
166
168 """Iterate over the SeqRecord items."""
169 for key in self.__iter__():
170 yield self.__getitem__(key)
171
173 """Iterate over the keys."""
174 return self.__iter__()
175
177 """Iterate over the keys."""
178 return iter(self._offsets)
179
181 """x.__getitem__(y) <==> x[y]"""
182
183 record = self._proxy.get(self._offsets[key])
184 if self._key_function:
185 key2 = self._key_function(record.id)
186 else:
187 key2 = record.id
188 if key != key2:
189 raise ValueError("Key did not match (%s vs %s)" % (key, key2))
190 return record
191
192 - def get(self, k, d=None):
193 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
194 try:
195 return self.__getitem__(k)
196 except KeyError:
197 return d
198
200 """Similar to the get method, but returns the record as a raw string.
201
202 If the key is not found, a KeyError exception is raised.
203
204 Note that on Python 3 a bytes string is returned, not a typical
205 unicode string.
206
207 NOTE - This functionality is not supported for every file format.
208 """
209
210 return self._proxy.get_raw(self._offsets[key])
211
213 """Would allow setting or replacing records, but not implemented."""
214 raise NotImplementedError("An indexed a sequence file is read only.")
215
216 - def update(self, *args, **kwargs):
217 """Would allow adding more values, but not implemented."""
218 raise NotImplementedError("An indexed a sequence file is read only.")
219
220
221 - def pop(self, key, default=None):
222 """Would remove specified record, but not implemented."""
223 raise NotImplementedError("An indexed a sequence file is read only.")
224
226 """Would remove and return a SeqRecord, but not implemented."""
227 raise NotImplementedError("An indexed a sequence file is read only.")
228
229
231 """Would clear dictionary, but not implemented."""
232 raise NotImplementedError("An indexed a sequence file is read only.")
233
235 """A dictionary method which we don't implement."""
236 raise NotImplementedError("An indexed a sequence file doesn't "
237 "support this.")
238
240 """A dictionary method which we don't implement."""
241 raise NotImplementedError("An indexed a sequence file doesn't "
242 "support this.")
243
244
246 """Read only dictionary interface to many sequential sequence files.
247
248 Keeps the keys, file-numbers and offsets in an SQLite database. To access
249 a record by key, reads from the offset in the approapriate file using
250 Bio.SeqIO for parsing.
251
252 There are OS limits on the number of files that can be open at once,
253 so a pool are kept. If a record is required from a closed file, then
254 one of the open handles is closed first.
255 """
256 - def __init__(self, index_filename, filenames, format, alphabet,
257 key_function, max_open=10):
258 random_access_proxies = {}
259
260
261
262
263 if not _sqlite:
264
265 from Bio import MissingPythonDependencyError
266 raise MissingPythonDependencyError("Requires sqlite3, which is "
267 "included Python 2.5+")
268 if filenames is not None:
269 filenames = list(filenames)
270 if os.path.isfile(index_filename):
271
272 con = _sqlite.connect(index_filename)
273 self._con = con
274
275 try:
276 count, = con.execute("SELECT value FROM meta_data WHERE key=?;",
277 ("count",)).fetchone()
278 self._length = int(count)
279 if self._length == -1:
280 con.close()
281 raise ValueError("Unfinished/partial database")
282 count, = con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()
283 if self._length <> int(count):
284 con.close()
285 raise ValueError("Corrupt database? %i entries not %i" \
286 % (int(count), self._length))
287 self._format, = con.execute("SELECT value FROM meta_data WHERE key=?;",
288 ("format",)).fetchone()
289 if format and format != self._format:
290 con.close()
291 raise ValueError("Index file says format %s, not %s" \
292 % (self._format, format))
293 self._filenames = [row[0] for row in \
294 con.execute("SELECT name FROM file_data "
295 "ORDER BY file_number;").fetchall()]
296 if filenames and len(filenames) != len(self._filenames):
297 con.close()
298 raise ValueError("Index file says %i files, not %i" \
299 % (len(self.filenames) != len(filenames)))
300 if filenames and filenames != self._filenames:
301 con.close()
302 raise ValueError("Index file has different filenames")
303 except _OperationalError, err:
304 con.close()
305 raise ValueError("Not a Biopython index database? %s" % err)
306
307 try:
308 proxy_class = _FormatToRandomAccess[self._format]
309 except KeyError:
310 con.close()
311 raise ValueError("Unsupported format '%s'" % self._format)
312 else:
313 self._filenames = filenames
314 self._format = format
315 if not format or not filenames:
316 raise ValueError("Filenames to index and format required")
317 try:
318 proxy_class = _FormatToRandomAccess[format]
319 except KeyError:
320 raise ValueError("Unsupported format '%s'" % format)
321
322 con = _sqlite.connect(index_filename)
323 self._con = con
324
325
326 con.execute("PRAGMA synchronous='OFF'")
327 con.execute("PRAGMA locking_mode=EXCLUSIVE")
328
329
330
331 con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
332 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
333 ("count", -1))
334 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
335 ("format", format))
336
337
338 con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
339 con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
340 count = 0
341 for i, filename in enumerate(filenames):
342 con.execute("INSERT INTO file_data (file_number, name) VALUES (?,?);",
343 (i, filename))
344 random_access_proxy = proxy_class(filename, format, alphabet)
345 if key_function:
346 offset_iter = ((key_function(k),i,o,l) for (k,o,l) in random_access_proxy)
347 else:
348 offset_iter = ((k,i,o,l) for (k,o,l) in random_access_proxy)
349 while True:
350 batch = list(itertools.islice(offset_iter, 100))
351 if not batch: break
352
353
354 con.executemany("INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
355 batch)
356 con.commit()
357 count += len(batch)
358 if len(random_access_proxies) < max_open:
359 random_access_proxies[i] = random_access_proxy
360 else:
361 random_access_proxy._handle.close()
362 self._length = count
363
364 try:
365 con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
366 "key_index ON offset_data(key);")
367 except _IntegrityError, err:
368 self._proxies = random_access_proxies
369 self.close()
370 con.close()
371 raise ValueError("Duplicate key? %s" % err)
372 con.execute("PRAGMA locking_mode=NORMAL")
373 con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
374 (count, "count"))
375 con.commit()
376
377 self._proxies = random_access_proxies
378 self._max_open = max_open
379 self._index_filename = index_filename
380 self._alphabet = alphabet
381 self._key_function = key_function
382
384 return "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \
385 % (self._index_filename, self._filenames, self._format,
386 self._alphabet, self._key_function)
387
389 return bool(self._con.execute("SELECT key FROM offset_data WHERE key=?;",
390 (key,)).fetchone())
391
393 """How many records are there?"""
394 return self._length
395
396
398 """Iterate over the keys."""
399 for row in self._con.execute("SELECT key FROM offset_data;"):
400 yield str(row[0])
401
402 if hasattr(dict, "iteritems"):
403
404
406 """Return a list of all the keys (SeqRecord identifiers)."""
407 return [str(row[0]) for row in \
408 self._con.execute("SELECT key FROM offset_data;").fetchall()]
409
411 """x.__getitem__(y) <==> x[y]"""
412
413 row = self._con.execute("SELECT file_number, offset FROM offset_data WHERE key=?;",
414 (key,)).fetchone()
415 if not row: raise KeyError
416 file_number, offset = row
417 proxies = self._proxies
418 if file_number in proxies:
419 record = proxies[file_number].get(offset)
420 else:
421 if len(proxies) >= self._max_open:
422
423 proxies.popitem()[1]._handle.close()
424
425 proxy = _FormatToRandomAccess[self._format]( \
426 self._filenames[file_number],
427 self._format, self._alphabet)
428 record = proxy.get(offset)
429 proxies[file_number] = proxy
430 if self._key_function:
431 key2 = self._key_function(record.id)
432 else:
433 key2 = record.id
434 if key != key2:
435 raise ValueError("Key did not match (%s vs %s)" % (key, key2))
436 return record
437
438 - def get(self, k, d=None):
439 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
440 try:
441 return self.__getitem__(k)
442 except KeyError:
443 return d
444
446 """Similar to the get method, but returns the record as a raw string.
447
448 If the key is not found, a KeyError exception is raised.
449
450 Note that on Python 3 a bytes string is returned, not a typical
451 unicode string.
452
453 NOTE - This functionality is not supported for every file format.
454 """
455
456 row = self._con.execute("SELECT file_number, offset, length FROM offset_data WHERE key=?;",
457 (key,)).fetchone()
458 if not row: raise KeyError
459 file_number, offset, length = row
460 proxies = self._proxies
461 if file_number in proxies:
462 if length:
463
464 h = proxies[file_number]._handle
465 h.seek(offset)
466 return h.read(length)
467 else:
468 return proxies[file_number].get_raw(offset)
469 else:
470
471 if len(proxies) >= self._max_open:
472
473 proxies.popitem()[1]._handle.close()
474
475 proxy = _FormatToRandomAccess[self._format]( \
476 self._filenames[file_number],
477 self._format, self._alphabet)
478 proxies[file_number] = proxy
479 if length:
480
481 h = proxy._handle
482 h.seek(offset)
483 return h.read(length)
484 else:
485 return proxy.get_raw(offset)
486
488 """Close any open file handles."""
489 proxies = self._proxies
490 while proxies:
491 proxies.popitem()[1]._handle.close()
492
493
494
495
497 - def __init__(self, filename, format, alphabet):
498 self._handle = open(filename, "rb")
499 self._alphabet = alphabet
500 self._format = format
501
502
503 i = SeqIO._FormatToIterator[format]
504
505
506 if alphabet is None:
507 def _parse(handle):
508 """Dynamically generated parser function (PRIVATE)."""
509 return i(handle).next()
510 else:
511
512 def _parse(handle):
513 """Dynamically generated parser function (PRIVATE)."""
514 try:
515 return i(handle, alphabet=alphabet).next()
516 except TypeError:
517 return SeqIO._force_alphabet(i(handle),
518 alphabet).next()
519 self._parse = _parse
520
522 """Returns (id,offset) tuples."""
523 raise NotImplementedError("Subclass should implement this")
524
525 - def get(self, offset):
529
531 """Returns bytes string (if implemented for this file format)."""
532
533 raise NotImplementedError("Not available for this file format.")
534
535
536
537
538
539
540
541
542
543
544
545
546
548 """Random access to a Standard Flowgram Format (SFF) file."""
549 - def __init__(self, filename, format, alphabet):
554
556 """Load any index block in the file, or build it the slow way (PRIVATE)."""
557 if self._alphabet is None:
558 self._alphabet = Alphabet.generic_dna
559 handle = self._handle
560 handle.seek(0)
561
562 header_length, index_offset, index_length, number_of_reads, \
563 self._flows_per_read, self._flow_chars, self._key_sequence \
564 = SeqIO.SffIO._sff_file_header(handle)
565 if index_offset and index_length:
566
567 count = 0
568 try :
569 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle) :
570 yield name, offset, 0
571 count += 1
572 assert count == number_of_reads, \
573 "Indexed %i records, expected %i" \
574 % (count, number_of_reads)
575 return
576 except ValueError, err :
577 import warnings
578 warnings.warn("Could not parse the SFF index: %s" % err)
579 assert count==0, "Partially populated index"
580 handle.seek(0)
581
582
583
584 count = 0
585 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle) :
586 yield name, offset, 0
587 count += 1
588 assert count == number_of_reads, \
589 "Indexed %i records, expected %i" % (count, number_of_reads)
590
591 - def get(self, offset) :
599
604
605
607 - def get(self, offset) :
616
617
618
619
620
621
623 - def __init__(self, filename, format, alphabet):
624 SeqFileRandomAccess.__init__(self, filename, format, alphabet)
625 marker = {"ace" : "CO ",
626 "embl" : "ID ",
627 "fasta" : ">",
628 "genbank" : "LOCUS ",
629 "gb": "LOCUS ",
630 "imgt" : "ID ",
631 "phd" : "BEGIN_SEQUENCE",
632 "pir" : ">..;",
633 "qual": ">",
634 "qual": ">",
635 "swiss" : "ID ",
636 "uniprot-xml" : "<entry ",
637 }[format]
638 self._marker = marker
639 self._marker_re = re.compile(_as_bytes("^%s" % marker))
640
642 """Returns (id,offset) tuples."""
643 marker_offset = len(self._marker)
644 marker_re = self._marker_re
645 handle = self._handle
646 handle.seek(0)
647
648 while True:
649 start_offset = handle.tell()
650 line = handle.readline()
651 if marker_re.match(line) or not line:
652 break
653
654 while marker_re.match(line):
655
656
657 id = line[marker_offset:].strip().split(None, 1)[0]
658 while True:
659 line = handle.readline()
660 if marker_re.match(line) or not line:
661 end_offset = handle.tell() - len(line)
662 yield _bytes_to_string(id), start_offset, end_offset - start_offset
663 start_offset = end_offset
664 break
665 assert not line, repr(line)
666
668 """Similar to the get method, but returns the record as a raw string."""
669
670 handle = self._handle
671 marker_re = self._marker_re
672 handle.seek(offset)
673 lines = [handle.readline()]
674 while True:
675 line = handle.readline()
676 if marker_re.match(line) or not line:
677
678 break
679 lines.append(line)
680 return _as_bytes("").join(lines)
681
682
683
684
685
686
688 """Indexed dictionary like access to a GenBank file."""
690 handle = self._handle
691 handle.seek(0)
692 marker_re = self._marker_re
693 dot_char = _as_bytes(".")
694 accession_marker = _as_bytes("ACCESSION ")
695 version_marker = _as_bytes("VERSION ")
696
697 while True:
698 start_offset = handle.tell()
699 line = handle.readline()
700 if marker_re.match(line) or not line:
701 break
702
703 while marker_re.match(line):
704
705
706 key = None
707 while True:
708 line = handle.readline()
709 if marker_re.match(line) or not line:
710 if not key:
711 raise ValueError("Did not find ACCESSION/VERSION lines")
712 end_offset = handle.tell() - len(line)
713 yield _bytes_to_string(key), start_offset, end_offset - start_offset
714 start_offset = end_offset
715 break
716 elif line.startswith(accession_marker):
717 key = line.rstrip().split()[1]
718 elif line.startswith(version_marker):
719 version_id = line.rstrip().split()[1]
720 if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit():
721
722 key = version_id
723 assert not line, repr(line)
724
725
727 """Indexed dictionary like access to an EMBL file."""
729 handle = self._handle
730 handle.seek(0)
731 marker_re = self._marker_re
732 semi_char = _as_bytes(";")
733 dot_char = _as_bytes(".")
734 sv_marker = _as_bytes("SV ")
735
736 while True:
737 start_offset = handle.tell()
738 line = handle.readline()
739 if marker_re.match(line) or not line:
740 break
741
742 while marker_re.match(line):
743
744
745 if line[2:].count(semi_char) == 6:
746
747 parts = line[3:].rstrip().split(semi_char)
748 if parts[1].strip().startswith(sv_marker):
749
750 key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
751 else:
752 key = parts[0].strip()
753 elif line[2:].count(semi_char) == 3:
754
755 key = line[3:].strip().split(None,1)[0]
756 else:
757 raise ValueError('Did not recognise the ID line layout:\n' + line)
758 while True:
759 line = handle.readline()
760 if marker_re.match(line) or not line:
761 end_offset = handle.tell() - len(line)
762 yield _bytes_to_string(key), start_offset, end_offset - start_offset
763 start_offset = end_offset
764 break
765 elif line.startswith(sv_marker):
766 key = line.rstrip().split()[1]
767 assert not line, repr(line)
768
769
771 """Random access to a SwissProt file."""
798
799
801 """Random access to a UniProt XML file."""
803 handle = self._handle
804 handle.seek(0)
805 marker_re = self._marker_re
806 start_acc_marker = _as_bytes("<accession>")
807 end_acc_marker = _as_bytes("</accession>")
808 end_entry_marker = _as_bytes("</entry>")
809
810 while True:
811 start_offset = handle.tell()
812 line = handle.readline()
813 if marker_re.match(line) or not line:
814 break
815
816 while marker_re.match(line):
817
818
819
820 key = None
821 done = False
822 while True:
823 line = handle.readline()
824 if key is None and start_acc_marker in line:
825 assert end_acc_marker in line, line
826 key = line[line.find(start_acc_marker)+11:].split(_as_bytes("<"))[0]
827 elif end_entry_marker in line:
828 end_offset = handle.tell() - len(line) \
829 + line.find(end_entry_marker) + 8
830 break
831 elif marker_re.match(line) or not line:
832
833 raise ValueError("Didn't find end of record")
834 if not key:
835 raise ValueError("Did not find <accession> line in bytes %i to %i" \
836 % (start_offset, end_offset))
837 yield _bytes_to_string(key), start_offset, end_offset - start_offset
838
839 while not marker_re.match(line) and line:
840 start_offset = handle.tell()
841 line = handle.readline()
842 assert not line, repr(line)
843
845 """Similar to the get method, but returns the record as a raw string."""
846 handle = self._handle
847 marker_re = self._marker_re
848 end_entry_marker = _as_bytes("</entry>")
849 handle.seek(offset)
850 data = handle.readline()
851 while True:
852 line = handle.readline()
853 i = line.find(end_entry_marker)
854 if i != -1:
855 data += line[:i+8]
856 break
857 if marker_re.match(line) or not line:
858
859 raise ValueError("Didn't find end of record")
860 data += line
861 return data
862
863 - def get(self, offset) :
864
865
866
867 data = """<?xml version='1.0' encoding='UTF-8'?>
868 <uniprot xmlns="http://uniprot.org/uniprot"
869 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
870 xsi:schemaLocation="http://uniprot.org/uniprot
871 http://www.uniprot.org/support/docs/uniprot.xsd">
872 %s
873 </uniprot>
874 """ % _bytes_to_string(self.get_raw(offset))
875
876 return SeqIO.UniprotIO.UniprotIterator(data).next()
877
878
880 """Random access to a IntelliGenetics file."""
881 - def __init__(self, filename, format, alphabet):
884
886 handle = self._handle
887 handle.seek(0)
888 marker_re = self._marker_re
889 semi_char = _as_bytes(";")
890 while True:
891 offset = handle.tell()
892 line = handle.readline()
893 if marker_re.match(line):
894
895 while True:
896 line = handle.readline()
897 if line[0:1] != semi_char and line.strip():
898 key = line.split()[0]
899 yield _bytes_to_string(key), offset, 0
900 break
901 if not line:
902 raise ValueError("Premature end of file?")
903 elif not line:
904
905 break
906
907
922
924 """Random access to a simple tabbed file."""
926 handle = self._handle
927 handle.seek(0)
928 start_offset = handle.tell()
929 tab_char = _as_bytes("\t")
930 while True:
931 line = handle.readline()
932 if not line : break
933 try:
934 key = line.split(tab_char)[0]
935 except ValueError, err:
936 if not line.strip():
937
938 start_offset = handle.tell()
939 continue
940 else:
941 raise err
942 else:
943 end_offset = handle.tell()
944 yield _bytes_to_string(key), start_offset, end_offset - start_offset
945 start_offset = end_offset
946
952
953
954
955
956
957
959 """Random access to a FASTQ file (any supported variant).
960
961 With FASTQ the records all start with a "@" line, but so can quality lines.
962 Note this will cope with line-wrapped FASTQ files.
963 """
965 handle = self._handle
966 handle.seek(0)
967 id = None
968 start_offset = handle.tell()
969 line = handle.readline()
970 if not line:
971
972 return
973 at_char = _as_bytes("@")
974 plus_char = _as_bytes("+")
975 if line[0:1] != at_char:
976 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
977 while line:
978
979
980 id = line[1:].rstrip().split(None, 1)[0]
981
982 seq_len = 0
983 while line:
984 line = handle.readline()
985 if line.startswith(plus_char) : break
986 seq_len += len(line.strip())
987 if not line:
988 raise ValueError("Premature end of file in seq section")
989
990
991 qual_len = 0
992 while line:
993 if seq_len == qual_len:
994
995 line = handle.readline()
996 if line and line[0:1] != at_char:
997 ValueError("Problem with line %s" % repr(line))
998 break
999 else:
1000 line = handle.readline()
1001 qual_len += len(line.strip())
1002 if seq_len != qual_len:
1003 raise ValueError("Problem with quality section")
1004 end_offset = handle.tell() - len(line)
1005 yield _bytes_to_string(id), start_offset, end_offset - start_offset
1006 start_offset = end_offset
1007
1008
1010 """Similar to the get method, but returns the record as a raw string."""
1011
1012 handle = self._handle
1013 handle.seek(offset)
1014 line = handle.readline()
1015 data = line
1016 at_char = _as_bytes("@")
1017 plus_char = _as_bytes("+")
1018 if line[0:1] != at_char:
1019 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
1020 identifier = line[1:].rstrip().split(None, 1)[0]
1021
1022 seq_len = 0
1023 while line:
1024 line = handle.readline()
1025 data += line
1026 if line.startswith(plus_char) : break
1027 seq_len += len(line.strip())
1028 if not line:
1029 raise ValueError("Premature end of file in seq section")
1030 assert line[0:1] == plus_char
1031
1032 qual_len = 0
1033 while line:
1034 if seq_len == qual_len:
1035
1036 pos = handle.tell()
1037 line = handle.readline()
1038 if line and line[0:1] != at_char:
1039 ValueError("Problem with line %s" % repr(line))
1040 break
1041 else:
1042 line = handle.readline()
1043 data += line
1044 qual_len += len(line.strip())
1045 if seq_len != qual_len:
1046 raise ValueError("Problem with quality section")
1047 return data
1048
1049
1050
1051
1052 _FormatToRandomAccess = {"ace" : SequentialSeqFileRandomAccess,
1053 "embl" : EmblRandomAccess,
1054 "fasta" : SequentialSeqFileRandomAccess,
1055 "fastq" : FastqRandomAccess,
1056 "fastq-sanger" : FastqRandomAccess,
1057 "fastq-solexa" : FastqRandomAccess,
1058 "fastq-illumina" : FastqRandomAccess,
1059 "genbank" : GenBankRandomAccess,
1060 "gb" : GenBankRandomAccess,
1061 "ig" : IntelliGeneticsRandomAccess,
1062 "imgt" : EmblRandomAccess,
1063 "phd" : SequentialSeqFileRandomAccess,
1064 "pir" : SequentialSeqFileRandomAccess,
1065 "sff" : SffRandomAccess,
1066 "sff-trim" : SffTrimedRandomAccess,
1067 "swiss" : SwissRandomAccess,
1068 "tab" : TabRandomAccess,
1069 "qual" : SequentialSeqFileRandomAccess,
1070 "uniprot-xml" : UniprotRandomAccess,
1071 }
1072