Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._index

   1  # Copyright 2009-2011 by Peter Cock.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  """Dictionary like indexing of sequence files (PRIVATE). 
   6   
   7  You are not expected to access this module, or any of its code, directly. This 
   8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
   9  public interface for this functionality. 
  10   
  11  The basic idea is that we scan over a sequence file, looking for new record 
  12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
  13  use as the record id, ideally without actually parsing the full record. We 
  14  then use a subclassed Python dictionary to record the file offset for the 
  15  record start against the record id. 
  16   
  17  Note that this means full parsing is on demand, so any invalid or problem 
  18  record may not trigger an exception until it is accessed. This is by design. 
  19   
  20  This means our dictionary like objects have in memory ALL the keys (all the 
  21  record identifiers), which shouldn't be a problem even with second generation 
  22  sequencing. If this is an issue later on, storing the keys and offsets in a 
  23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
  24  """ 
  25   
  26  import os 
  27  try: 
  28      from collections import UserDict as _dict_base 
  29  except ImportError: 
  30      from UserDict import DictMixin as _dict_base 
  31  import re 
  32  import itertools 
  33  from StringIO import StringIO 
  34   
  35  try: 
  36      from sqlite3 import dbapi2 as _sqlite 
  37      from sqlite3 import IntegrityError as _IntegrityError 
  38      from sqlite3 import OperationalError as _OperationalError 
  39  except ImportError: 
  40      #Not expected to be present on Python 2.4, ignore it 
  41      #and at least offer Bio.SeqIO.index() functionality 
  42      _sqlite = None 
  43      pass 
  44   
  45  from Bio._py3k import _bytes_to_string, _as_bytes, _as_string 
  46   
  47  from Bio import SeqIO 
  48  from Bio import Alphabet 
  49   
50 -class _IndexedSeqFileDict(_dict_base):
51 """Read only dictionary interface to a sequential sequence file. 52 53 Keeps the keys and associated file offsets in memory, reads the file to 54 access entries as SeqRecord objects using Bio.SeqIO for parsing them. 55 This approach is memory limited, but will work even with millions of 56 sequences. 57 58 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys 59 (record identifiers by default) are not allowed. If this happens, 60 a ValueError exception is raised. 61 62 By default the SeqRecord's id string is used as the dictionary 63 key. This can be changed by suppling an optional key_function, 64 a callback function which will be given the record id and must 65 return the desired key. For example, this allows you to parse 66 NCBI style FASTA identifiers, and extract the GI number to use 67 as the dictionary key. 68 69 Note that this dictionary is essentially read only. You cannot 70 add or change values, pop values, nor clear the dictionary. 71 """
72 - def __init__(self, filename, format, alphabet, key_function):
73 #Use key_function=None for default value 74 try: 75 proxy_class = _FormatToRandomAccess[format] 76 except KeyError: 77 raise ValueError("Unsupported format '%s'" % format) 78 random_access_proxy = proxy_class(filename, format, alphabet) 79 self._proxy = random_access_proxy 80 self._key_function = key_function 81 if key_function: 82 offset_iter = ((key_function(k),o,l) for (k,o,l) in random_access_proxy) 83 else: 84 offset_iter = random_access_proxy 85 offsets = {} 86 for key, offset, length in offset_iter: 87 #Note - we don't store the length because I want to minimise the 88 #memory requirements. With the SQLite backend the length is kept 89 #and is used to speed up the get_raw method (by about 3 times). 90 if key in offsets: 91 self._proxy._handle.close() 92 raise ValueError("Duplicate key '%s'" % key) 93 else: 94 offsets[key] = offset 95 self._offsets = offsets
96
97 - def __repr__(self):
98 return "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" \ 99 % (self._proxy._handle.name, self._proxy._format, 100 self._proxy._alphabet, self._key_function)
101
102 - def __str__(self):
103 if self: 104 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0]) 105 else: 106 return "{}"
107
108 - def __contains__(self, key) :
109 return key in self._offsets
110
111 - def __len__(self):
112 """How many records are there?""" 113 return len(self._offsets)
114 115 if hasattr(dict, "iteritems"): 116 #Python 2, use iteritems but not items etc
117 - def values(self):
118 """Would be a list of the SeqRecord objects, but not implemented. 119 120 In general you can be indexing very very large files, with millions 121 of sequences. Loading all these into memory at once as SeqRecord 122 objects would (probably) use up all the RAM. Therefore we simply 123 don't support this dictionary method. 124 """ 125 raise NotImplementedError("Due to memory concerns, when indexing a " 126 "sequence file you cannot access all the " 127 "records at once.")
128
129 - def items(self):
130 """Would be a list of the (key, SeqRecord) tuples, but not implemented. 131 132 In general you can be indexing very very large files, with millions 133 of sequences. Loading all these into memory at once as SeqRecord 134 objects would (probably) use up all the RAM. Therefore we simply 135 don't support this dictionary method. 136 """ 137 raise NotImplementedError("Due to memory concerns, when indexing a " 138 "sequence file you cannot access all the " 139 "records at once.")
140
141 - def keys(self) :
142 """Return a list of all the keys (SeqRecord identifiers).""" 143 #TODO - Stick a warning in here for large lists? Or just refuse? 144 return self._offsets.keys()
145
146 - def itervalues(self):
147 """Iterate over the SeqRecord) items.""" 148 for key in self.__iter__(): 149 yield self.__getitem__(key)
150
151 - def iteritems(self):
152 """Iterate over the (key, SeqRecord) items.""" 153 for key in self.__iter__(): 154 yield key, self.__getitem__(key)
155
156 - def iterkeys(self):
157 """Iterate over the keys.""" 158 return self.__iter__()
159 160 else: 161 #Python 3 - define items and values as iterators
162 - def items(self):
163 """Iterate over the (key, SeqRecord) items.""" 164 for key in self.__iter__(): 165 yield key, self.__getitem__(key)
166
167 - def values(self):
168 """Iterate over the SeqRecord items.""" 169 for key in self.__iter__(): 170 yield self.__getitem__(key)
171
172 - def keys(self):
173 """Iterate over the keys.""" 174 return self.__iter__()
175
176 - def __iter__(self):
177 """Iterate over the keys.""" 178 return iter(self._offsets)
179
180 - def __getitem__(self, key):
181 """x.__getitem__(y) <==> x[y]""" 182 #Pass the offset to the proxy 183 record = self._proxy.get(self._offsets[key]) 184 if self._key_function: 185 key2 = self._key_function(record.id) 186 else: 187 key2 = record.id 188 if key != key2: 189 raise ValueError("Key did not match (%s vs %s)" % (key, key2)) 190 return record
191
192 - def get(self, k, d=None):
193 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" 194 try: 195 return self.__getitem__(k) 196 except KeyError: 197 return d
198
199 - def get_raw(self, key):
200 """Similar to the get method, but returns the record as a raw string. 201 202 If the key is not found, a KeyError exception is raised. 203 204 Note that on Python 3 a bytes string is returned, not a typical 205 unicode string. 206 207 NOTE - This functionality is not supported for every file format. 208 """ 209 #Pass the offset to the proxy 210 return self._proxy.get_raw(self._offsets[key])
211
212 - def __setitem__(self, key, value):
213 """Would allow setting or replacing records, but not implemented.""" 214 raise NotImplementedError("An indexed a sequence file is read only.")
215
216 - def update(self, *args, **kwargs):
217 """Would allow adding more values, but not implemented.""" 218 raise NotImplementedError("An indexed a sequence file is read only.")
219 220
221 - def pop(self, key, default=None):
222 """Would remove specified record, but not implemented.""" 223 raise NotImplementedError("An indexed a sequence file is read only.")
224
225 - def popitem(self):
226 """Would remove and return a SeqRecord, but not implemented.""" 227 raise NotImplementedError("An indexed a sequence file is read only.")
228 229
230 - def clear(self):
231 """Would clear dictionary, but not implemented.""" 232 raise NotImplementedError("An indexed a sequence file is read only.")
233
234 - def fromkeys(self, keys, value=None):
235 """A dictionary method which we don't implement.""" 236 raise NotImplementedError("An indexed a sequence file doesn't " 237 "support this.")
238
239 - def copy(self):
240 """A dictionary method which we don't implement.""" 241 raise NotImplementedError("An indexed a sequence file doesn't " 242 "support this.")
243 244
245 -class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
246 """Read only dictionary interface to many sequential sequence files. 247 248 Keeps the keys, file-numbers and offsets in an SQLite database. To access 249 a record by key, reads from the offset in the approapriate file using 250 Bio.SeqIO for parsing. 251 252 There are OS limits on the number of files that can be open at once, 253 so a pool are kept. If a record is required from a closed file, then 254 one of the open handles is closed first. 255 """
256 - def __init__(self, index_filename, filenames, format, alphabet, 257 key_function, max_open=10):
258 random_access_proxies = {} 259 #TODO? - Don't keep filename list in memory (just in DB)? 260 #Should save a chunk of memory if dealing with 1000s of files. 261 #Furthermore could compare a generator to the DB on reloading 262 #(no need to turn it into a list) 263 if not _sqlite: 264 #Hack for Python 2.4 (of if Python is compiled without it) 265 from Bio import MissingPythonDependencyError 266 raise MissingPythonDependencyError("Requires sqlite3, which is " 267 "included Python 2.5+") 268 if filenames is not None: 269 filenames = list(filenames) #In case it was a generator 270 if os.path.isfile(index_filename): 271 #Reuse the index. 272 con = _sqlite.connect(index_filename) 273 self._con = con 274 #Check the count... 275 try: 276 count, = con.execute("SELECT value FROM meta_data WHERE key=?;", 277 ("count",)).fetchone() 278 self._length = int(count) 279 if self._length == -1: 280 con.close() 281 raise ValueError("Unfinished/partial database") 282 count, = con.execute("SELECT COUNT(key) FROM offset_data;").fetchone() 283 if self._length <> int(count): 284 con.close() 285 raise ValueError("Corrupt database? %i entries not %i" \ 286 % (int(count), self._length)) 287 self._format, = con.execute("SELECT value FROM meta_data WHERE key=?;", 288 ("format",)).fetchone() 289 if format and format != self._format: 290 con.close() 291 raise ValueError("Index file says format %s, not %s" \ 292 % (self._format, format)) 293 self._filenames = [row[0] for row in \ 294 con.execute("SELECT name FROM file_data " 295 "ORDER BY file_number;").fetchall()] 296 if filenames and len(filenames) != len(self._filenames): 297 con.close() 298 raise ValueError("Index file says %i files, not %i" \ 299 % (len(self.filenames) != len(filenames))) 300 if filenames and filenames != self._filenames: 301 con.close() 302 raise ValueError("Index file has different filenames") 303 except _OperationalError, err: 304 con.close() 305 raise ValueError("Not a Biopython index database? %s" % err) 306 #Now we have the format (from the DB if not given to us), 307 try: 308 proxy_class = _FormatToRandomAccess[self._format] 309 except KeyError: 310 con.close() 311 raise ValueError("Unsupported format '%s'" % self._format) 312 else: 313 self._filenames = filenames 314 self._format = format 315 if not format or not filenames: 316 raise ValueError("Filenames to index and format required") 317 try: 318 proxy_class = _FormatToRandomAccess[format] 319 except KeyError: 320 raise ValueError("Unsupported format '%s'" % format) 321 #Create the index 322 con = _sqlite.connect(index_filename) 323 self._con = con 324 #print "Creating index" 325 # Sqlite PRAGMA settings for speed 326 con.execute("PRAGMA synchronous='OFF'") 327 con.execute("PRAGMA locking_mode=EXCLUSIVE") 328 #Don't index the key column until the end (faster) 329 #con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, " 330 # "offset INTEGER);") 331 con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") 332 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", 333 ("count", -1)) 334 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);", 335 ("format", format)) 336 #TODO - Record the alphabet? 337 #TODO - Record the file size and modified date? 338 con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);") 339 con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);") 340 count = 0 341 for i, filename in enumerate(filenames): 342 con.execute("INSERT INTO file_data (file_number, name) VALUES (?,?);", 343 (i, filename)) 344 random_access_proxy = proxy_class(filename, format, alphabet) 345 if key_function: 346 offset_iter = ((key_function(k),i,o,l) for (k,o,l) in random_access_proxy) 347 else: 348 offset_iter = ((k,i,o,l) for (k,o,l) in random_access_proxy) 349 while True: 350 batch = list(itertools.islice(offset_iter, 100)) 351 if not batch: break 352 #print "Inserting batch of %i offsets, %s ... %s" \ 353 # % (len(batch), batch[0][0], batch[-1][0]) 354 con.executemany("INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);", 355 batch) 356 con.commit() 357 count += len(batch) 358 if len(random_access_proxies) < max_open: 359 random_access_proxies[i] = random_access_proxy 360 else: 361 random_access_proxy._handle.close() 362 self._length = count 363 #print "About to index %i entries" % count 364 try: 365 con.execute("CREATE UNIQUE INDEX IF NOT EXISTS " 366 "key_index ON offset_data(key);") 367 except _IntegrityError, err: 368 self._proxies = random_access_proxies 369 self.close() 370 con.close() 371 raise ValueError("Duplicate key? %s" % err) 372 con.execute("PRAGMA locking_mode=NORMAL") 373 con.execute("UPDATE meta_data SET value = ? WHERE key = ?;", 374 (count, "count")) 375 con.commit() 376 #print "Index created" 377 self._proxies = random_access_proxies 378 self._max_open = max_open 379 self._index_filename = index_filename 380 self._alphabet = alphabet 381 self._key_function = key_function
382
383 - def __repr__(self):
384 return "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \ 385 % (self._index_filename, self._filenames, self._format, 386 self._alphabet, self._key_function)
387
388 - def __contains__(self, key):
389 return bool(self._con.execute("SELECT key FROM offset_data WHERE key=?;", 390 (key,)).fetchone())
391
392 - def __len__(self):
393 """How many records are there?""" 394 return self._length
395 #return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0] 396
397 - def __iter__(self):
398 """Iterate over the keys.""" 399 for row in self._con.execute("SELECT key FROM offset_data;"): 400 yield str(row[0])
401 402 if hasattr(dict, "iteritems"): 403 #Python 2, use iteritems but not items etc 404 #Just need to override this...
405 - def keys(self) :
406 """Return a list of all the keys (SeqRecord identifiers).""" 407 return [str(row[0]) for row in \ 408 self._con.execute("SELECT key FROM offset_data;").fetchall()]
409
410 - def __getitem__(self, key):
411 """x.__getitem__(y) <==> x[y]""" 412 #Pass the offset to the proxy 413 row = self._con.execute("SELECT file_number, offset FROM offset_data WHERE key=?;", 414 (key,)).fetchone() 415 if not row: raise KeyError 416 file_number, offset = row 417 proxies = self._proxies 418 if file_number in proxies: 419 record = proxies[file_number].get(offset) 420 else: 421 if len(proxies) >= self._max_open: 422 #Close an old handle... 423 proxies.popitem()[1]._handle.close() 424 #Open a new handle... 425 proxy = _FormatToRandomAccess[self._format]( \ 426 self._filenames[file_number], 427 self._format, self._alphabet) 428 record = proxy.get(offset) 429 proxies[file_number] = proxy 430 if self._key_function: 431 key2 = self._key_function(record.id) 432 else: 433 key2 = record.id 434 if key != key2: 435 raise ValueError("Key did not match (%s vs %s)" % (key, key2)) 436 return record
437
438 - def get(self, k, d=None):
439 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" 440 try: 441 return self.__getitem__(k) 442 except KeyError: 443 return d
444
445 - def get_raw(self, key):
446 """Similar to the get method, but returns the record as a raw string. 447 448 If the key is not found, a KeyError exception is raised. 449 450 Note that on Python 3 a bytes string is returned, not a typical 451 unicode string. 452 453 NOTE - This functionality is not supported for every file format. 454 """ 455 #Pass the offset to the proxy 456 row = self._con.execute("SELECT file_number, offset, length FROM offset_data WHERE key=?;", 457 (key,)).fetchone() 458 if not row: raise KeyError 459 file_number, offset, length = row 460 proxies = self._proxies 461 if file_number in proxies: 462 if length: 463 #Shortcut if we have the length 464 h = proxies[file_number]._handle 465 h.seek(offset) 466 return h.read(length) 467 else: 468 return proxies[file_number].get_raw(offset) 469 else: 470 #This code is duplicated from __getitem__ to avoid a function call 471 if len(proxies) >= self._max_open: 472 #Close an old handle... 473 proxies.popitem()[1]._handle.close() 474 #Open a new handle... 475 proxy = _FormatToRandomAccess[self._format]( \ 476 self._filenames[file_number], 477 self._format, self._alphabet) 478 proxies[file_number] = proxy 479 if length: 480 #Shortcut if we have the length 481 h = proxy._handle 482 h.seek(offset) 483 return h.read(length) 484 else: 485 return proxy.get_raw(offset)
486
487 - def close(self):
488 """Close any open file handles.""" 489 proxies = self._proxies 490 while proxies: 491 proxies.popitem()[1]._handle.close()
492 493 494 ############################################################################## 495
496 -class SeqFileRandomAccess(object):
497 - def __init__(self, filename, format, alphabet):
498 self._handle = open(filename, "rb") 499 self._alphabet = alphabet 500 self._format = format 501 #Load the parser class/function once an avoid the dict lookup in each 502 #__getitem__ call: 503 i = SeqIO._FormatToIterator[format] 504 #The following alphabet code is a bit nasty... duplicates logic in 505 #Bio.SeqIO.parse() 506 if alphabet is None: 507 def _parse(handle): 508 """Dynamically generated parser function (PRIVATE).""" 509 return i(handle).next()
510 else: 511 #TODO - Detect alphabet support ONCE at __init__ 512 def _parse(handle): 513 """Dynamically generated parser function (PRIVATE).""" 514 try: 515 return i(handle, alphabet=alphabet).next() 516 except TypeError: 517 return SeqIO._force_alphabet(i(handle), 518 alphabet).next()
519 self._parse = _parse 520
521 - def __iter__(self):
522 """Returns (id,offset) tuples.""" 523 raise NotImplementedError("Subclass should implement this")
524
525 - def get(self, offset):
526 """Returns SeqRecord.""" 527 #Should be overriden for binary file formats etc: 528 return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
529
530 - def get_raw(self, offset):
531 """Returns bytes string (if implemented for this file format).""" 532 #Should be done by each sub-class (if possible) 533 raise NotImplementedError("Not available for this file format.")
534 535 536 537 538 #################### 539 # Special indexers # 540 #################### 541 542 # Anything where the records cannot be read simply by parsing from 543 # the record start. For example, anything requiring information from 544 # a file header - e.g. SFF files where we would need to know the 545 # number of flows. 546
547 -class SffRandomAccess(SeqFileRandomAccess):
548 """Random access to a Standard Flowgram Format (SFF) file."""
549 - def __init__(self, filename, format, alphabet):
550 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 551 header_length, index_offset, index_length, number_of_reads, \ 552 self._flows_per_read, self._flow_chars, self._key_sequence \ 553 = SeqIO.SffIO._sff_file_header(self._handle)
554
555 - def __iter__(self):
556 """Load any index block in the file, or build it the slow way (PRIVATE).""" 557 if self._alphabet is None: 558 self._alphabet = Alphabet.generic_dna 559 handle = self._handle 560 handle.seek(0) 561 #Alread did this in __init__ but need handle in right place 562 header_length, index_offset, index_length, number_of_reads, \ 563 self._flows_per_read, self._flow_chars, self._key_sequence \ 564 = SeqIO.SffIO._sff_file_header(handle) 565 if index_offset and index_length: 566 #There is an index provided, try this the fast way: 567 count = 0 568 try : 569 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle) : 570 yield name, offset, 0 571 count += 1 572 assert count == number_of_reads, \ 573 "Indexed %i records, expected %i" \ 574 % (count, number_of_reads) 575 return 576 except ValueError, err : 577 import warnings 578 warnings.warn("Could not parse the SFF index: %s" % err) 579 assert count==0, "Partially populated index" 580 handle.seek(0) 581 #We used to give a warning in this case, but Ion Torrent's 582 #SFF files don't have an index so that would be annoying. 583 #Fall back on the slow way! 584 count = 0 585 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle) : 586 yield name, offset, 0 587 count += 1 588 assert count == number_of_reads, \ 589 "Indexed %i records, expected %i" % (count, number_of_reads)
590
591 - def get(self, offset) :
592 handle = self._handle 593 handle.seek(offset) 594 return SeqIO.SffIO._sff_read_seq_record(handle, 595 self._flows_per_read, 596 self._flow_chars, 597 self._key_sequence, 598 self._alphabet)
599
600 - def get_raw(self, offset):
601 handle = self._handle 602 handle.seek(offset) 603 return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
604 605
606 -class SffTrimedRandomAccess(SffRandomAccess) :
607 - def get(self, offset) :
608 handle = self._handle 609 handle.seek(offset) 610 return SeqIO.SffIO._sff_read_seq_record(handle, 611 self._flows_per_read, 612 self._flow_chars, 613 self._key_sequence, 614 self._alphabet, 615 trim=True)
616 617 618 ################### 619 # Simple indexers # 620 ################### 621
622 -class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
623 - def __init__(self, filename, format, alphabet):
624 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 625 marker = {"ace" : "CO ", 626 "embl" : "ID ", 627 "fasta" : ">", 628 "genbank" : "LOCUS ", 629 "gb": "LOCUS ", 630 "imgt" : "ID ", 631 "phd" : "BEGIN_SEQUENCE", 632 "pir" : ">..;", 633 "qual": ">", 634 "qual": ">", 635 "swiss" : "ID ", 636 "uniprot-xml" : "<entry ", 637 }[format] 638 self._marker = marker 639 self._marker_re = re.compile(_as_bytes("^%s" % marker))
640
641 - def __iter__(self):
642 """Returns (id,offset) tuples.""" 643 marker_offset = len(self._marker) 644 marker_re = self._marker_re 645 handle = self._handle 646 handle.seek(0) 647 #Skip and header before first record 648 while True: 649 start_offset = handle.tell() 650 line = handle.readline() 651 if marker_re.match(line) or not line: 652 break 653 #Should now be at the start of a record, or end of the file 654 while marker_re.match(line): 655 #Here we can assume the record.id is the first word after the 656 #marker. This is generally fine... but not for GenBank, EMBL, Swiss 657 id = line[marker_offset:].strip().split(None, 1)[0] 658 while True: 659 line = handle.readline() 660 if marker_re.match(line) or not line: 661 end_offset = handle.tell() - len(line) 662 yield _bytes_to_string(id), start_offset, end_offset - start_offset 663 start_offset = end_offset 664 break 665 assert not line, repr(line)
666
667 - def get_raw(self, offset):
668 """Similar to the get method, but returns the record as a raw string.""" 669 #For non-trivial file formats this must be over-ridden in the subclass 670 handle = self._handle 671 marker_re = self._marker_re 672 handle.seek(offset) 673 lines = [handle.readline()] 674 while True: 675 line = handle.readline() 676 if marker_re.match(line) or not line: 677 #End of file, or start of next record => end of this record 678 break 679 lines.append(line) 680 return _as_bytes("").join(lines)
681 682 683 ####################################### 684 # Fiddly indexers: GenBank, EMBL, ... # 685 ####################################### 686
687 -class GenBankRandomAccess(SequentialSeqFileRandomAccess):
688 """Indexed dictionary like access to a GenBank file."""
689 - def __iter__(self):
690 handle = self._handle 691 handle.seek(0) 692 marker_re = self._marker_re 693 dot_char = _as_bytes(".") 694 accession_marker = _as_bytes("ACCESSION ") 695 version_marker = _as_bytes("VERSION ") 696 #Skip and header before first record 697 while True: 698 start_offset = handle.tell() 699 line = handle.readline() 700 if marker_re.match(line) or not line: 701 break 702 #Should now be at the start of a record, or end of the file 703 while marker_re.match(line): 704 #We cannot assume the record.id is the first word after LOCUS, 705 #normally the first entry on the VERSION or ACCESSION line is used. 706 key = None 707 while True: 708 line = handle.readline() 709 if marker_re.match(line) or not line: 710 if not key: 711 raise ValueError("Did not find ACCESSION/VERSION lines") 712 end_offset = handle.tell() - len(line) 713 yield _bytes_to_string(key), start_offset, end_offset - start_offset 714 start_offset = end_offset 715 break 716 elif line.startswith(accession_marker): 717 key = line.rstrip().split()[1] 718 elif line.startswith(version_marker): 719 version_id = line.rstrip().split()[1] 720 if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit(): 721 #This should mimic the GenBank parser... 722 key = version_id 723 assert not line, repr(line)
724 725
726 -class EmblRandomAccess(SequentialSeqFileRandomAccess):
727 """Indexed dictionary like access to an EMBL file."""
728 - def __iter__(self):
729 handle = self._handle 730 handle.seek(0) 731 marker_re = self._marker_re 732 semi_char = _as_bytes(";") 733 dot_char = _as_bytes(".") 734 sv_marker = _as_bytes("SV ") 735 #Skip any header before first record 736 while True: 737 start_offset = handle.tell() 738 line = handle.readline() 739 if marker_re.match(line) or not line: 740 break 741 #Should now be at the start of a record, or end of the file 742 while marker_re.match(line): 743 #We cannot assume the record.id is the first word after ID, 744 #normally the SV line is used. 745 if line[2:].count(semi_char) == 6: 746 #Looks like the semi colon separated style introduced in 2006 747 parts = line[3:].rstrip().split(semi_char) 748 if parts[1].strip().startswith(sv_marker): 749 #The SV bit gives the version 750 key = parts[0].strip() + dot_char + parts[1].strip().split()[1] 751 else: 752 key = parts[0].strip() 753 elif line[2:].count(semi_char) == 3: 754 #Looks like the pre 2006 style, take first word only 755 key = line[3:].strip().split(None,1)[0] 756 else: 757 raise ValueError('Did not recognise the ID line layout:\n' + line) 758 while True: 759 line = handle.readline() 760 if marker_re.match(line) or not line: 761 end_offset = handle.tell() - len(line) 762 yield _bytes_to_string(key), start_offset, end_offset - start_offset 763 start_offset = end_offset 764 break 765 elif line.startswith(sv_marker): 766 key = line.rstrip().split()[1] 767 assert not line, repr(line)
768 769
770 -class SwissRandomAccess(SequentialSeqFileRandomAccess):
771 """Random access to a SwissProt file."""
772 - def __iter__(self):
773 handle = self._handle 774 handle.seek(0) 775 marker_re = self._marker_re 776 semi_char = _as_bytes(";") 777 #Skip any header before first record 778 while True: 779 start_offset = handle.tell() 780 line = handle.readline() 781 if marker_re.match(line) or not line: 782 break 783 #Should now be at the start of a record, or end of the file 784 while marker_re.match(line): 785 #We cannot assume the record.id is the first word after ID, 786 #normally the following AC line is used. 787 line = handle.readline() 788 assert line.startswith(_as_bytes("AC ")) 789 key = line[3:].strip().split(semi_char)[0].strip() 790 while True: 791 line = handle.readline() 792 if marker_re.match(line) or not line: 793 end_offset = handle.tell() - len(line) 794 yield _bytes_to_string(key), start_offset, end_offset - start_offset 795 start_offset = end_offset 796 break 797 assert not line, repr(line)
798 799
800 -class UniprotRandomAccess(SequentialSeqFileRandomAccess):
801 """Random access to a UniProt XML file."""
802 - def __iter__(self):
803 handle = self._handle 804 handle.seek(0) 805 marker_re = self._marker_re 806 start_acc_marker = _as_bytes("<accession>") 807 end_acc_marker = _as_bytes("</accession>") 808 end_entry_marker = _as_bytes("</entry>") 809 #Skip any header before first record 810 while True: 811 start_offset = handle.tell() 812 line = handle.readline() 813 if marker_re.match(line) or not line: 814 break 815 #Should now be at the start of a record, or end of the file 816 while marker_re.match(line): 817 #We expect the next line to be <accession>xxx</accession> 818 #(possibly with leading spaces) 819 #but allow it to be later on within the <entry> 820 key = None 821 done = False 822 while True: 823 line = handle.readline() 824 if key is None and start_acc_marker in line: 825 assert end_acc_marker in line, line 826 key = line[line.find(start_acc_marker)+11:].split(_as_bytes("<"))[0] 827 elif end_entry_marker in line: 828 end_offset = handle.tell() - len(line) \ 829 + line.find(end_entry_marker) + 8 830 break 831 elif marker_re.match(line) or not line: 832 #Start of next record or end of file 833 raise ValueError("Didn't find end of record") 834 if not key: 835 raise ValueError("Did not find <accession> line in bytes %i to %i" \ 836 % (start_offset, end_offset)) 837 yield _bytes_to_string(key), start_offset, end_offset - start_offset 838 #Find start of next record 839 while not marker_re.match(line) and line: 840 start_offset = handle.tell() 841 line = handle.readline() 842 assert not line, repr(line)
843
844 - def get_raw(self, offset):
845 """Similar to the get method, but returns the record as a raw string.""" 846 handle = self._handle 847 marker_re = self._marker_re 848 end_entry_marker = _as_bytes("</entry>") 849 handle.seek(offset) 850 data = handle.readline() 851 while True: 852 line = handle.readline() 853 i = line.find(end_entry_marker) 854 if i != -1: 855 data += line[:i+8] 856 break 857 if marker_re.match(line) or not line: 858 #End of file, or start of next record 859 raise ValueError("Didn't find end of record") 860 data += line 861 return data
862
863 - def get(self, offset) :
864 #TODO - Can we handle this directly in the parser? 865 #This is a hack - use get_raw for <entry>...</entry> and wrap it with 866 #the apparently required XML header and footer. 867 data = """<?xml version='1.0' encoding='UTF-8'?> 868 <uniprot xmlns="http://uniprot.org/uniprot" 869 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 870 xsi:schemaLocation="http://uniprot.org/uniprot 871 http://www.uniprot.org/support/docs/uniprot.xsd"> 872 %s 873 </uniprot> 874 """ % _bytes_to_string(self.get_raw(offset)) 875 #TODO - For consistency, this function should not accept a string: 876 return SeqIO.UniprotIO.UniprotIterator(data).next()
877 878
879 -class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
880 """Random access to a IntelliGenetics file."""
881 - def __init__(self, filename, format, alphabet):
882 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 883 self._marker_re = re.compile(_as_bytes("^;"))
884
885 - def __iter__(self):
886 handle = self._handle 887 handle.seek(0) 888 marker_re = self._marker_re 889 semi_char = _as_bytes(";") 890 while True: 891 offset = handle.tell() 892 line = handle.readline() 893 if marker_re.match(line): 894 #Now look for the first line which doesn't start ";" 895 while True: 896 line = handle.readline() 897 if line[0:1] != semi_char and line.strip(): 898 key = line.split()[0] 899 yield _bytes_to_string(key), offset, 0 900 break 901 if not line: 902 raise ValueError("Premature end of file?") 903 elif not line: 904 #End of file 905 break
906 907
908 - def get_raw(self, offset):
909 handle = self._handle 910 handle.seek(offset) 911 marker_re = self._marker_re 912 lines = [] 913 line = handle.readline() 914 semi_char = _as_bytes(";") 915 while line.startswith(semi_char): 916 lines.append(line) 917 line = handle.readline() 918 while line and not line.startswith(semi_char): 919 lines.append(line) 920 line = handle.readline() 921 return _as_bytes("").join(lines)
922
923 -class TabRandomAccess(SeqFileRandomAccess):
924 """Random access to a simple tabbed file."""
925 - def __iter__(self):
926 handle = self._handle 927 handle.seek(0) 928 start_offset = handle.tell() 929 tab_char = _as_bytes("\t") 930 while True: 931 line = handle.readline() 932 if not line : break #End of file 933 try: 934 key = line.split(tab_char)[0] 935 except ValueError, err: 936 if not line.strip(): 937 #Ignore blank lines 938 start_offset = handle.tell() 939 continue 940 else: 941 raise err 942 else: 943 end_offset = handle.tell() 944 yield _bytes_to_string(key), start_offset, end_offset - start_offset 945 start_offset = end_offset
946
947 - def get_raw(self, offset):
948 """Like the get method, but returns the record as a raw string.""" 949 handle = self._handle 950 handle.seek(offset) 951 return handle.readline()
952 953 954 ########################## 955 # Now the FASTQ indexers # 956 ########################## 957
958 -class FastqRandomAccess(SeqFileRandomAccess):
959 """Random access to a FASTQ file (any supported variant). 960 961 With FASTQ the records all start with a "@" line, but so can quality lines. 962 Note this will cope with line-wrapped FASTQ files. 963 """
964 - def __iter__(self):
965 handle = self._handle 966 handle.seek(0) 967 id = None 968 start_offset = handle.tell() 969 line = handle.readline() 970 if not line: 971 #Empty file! 972 return 973 at_char = _as_bytes("@") 974 plus_char = _as_bytes("+") 975 if line[0:1] != at_char: 976 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line)) 977 while line: 978 #assert line[0]=="@" 979 #This record seems OK (so far) 980 id = line[1:].rstrip().split(None, 1)[0] 981 #Find the seq line(s) 982 seq_len = 0 983 while line: 984 line = handle.readline() 985 if line.startswith(plus_char) : break 986 seq_len += len(line.strip()) 987 if not line: 988 raise ValueError("Premature end of file in seq section") 989 #assert line[0]=="+" 990 #Find the qual line(s) 991 qual_len = 0 992 while line: 993 if seq_len == qual_len: 994 #Should be end of record... 995 line = handle.readline() 996 if line and line[0:1] != at_char: 997 ValueError("Problem with line %s" % repr(line)) 998 break 999 else: 1000 line = handle.readline() 1001 qual_len += len(line.strip()) 1002 if seq_len != qual_len: 1003 raise ValueError("Problem with quality section") 1004 end_offset = handle.tell() - len(line) 1005 yield _bytes_to_string(id), start_offset, end_offset - start_offset 1006 start_offset = end_offset
1007 #print "EOF" 1008
1009 - def get_raw(self, offset):
1010 """Similar to the get method, but returns the record as a raw string.""" 1011 #TODO - Refactor this and the __init__ method to reduce code duplication? 1012 handle = self._handle 1013 handle.seek(offset) 1014 line = handle.readline() 1015 data = line 1016 at_char = _as_bytes("@") 1017 plus_char = _as_bytes("+") 1018 if line[0:1] != at_char: 1019 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line)) 1020 identifier = line[1:].rstrip().split(None, 1)[0] 1021 #Find the seq line(s) 1022 seq_len = 0 1023 while line: 1024 line = handle.readline() 1025 data += line 1026 if line.startswith(plus_char) : break 1027 seq_len += len(line.strip()) 1028 if not line: 1029 raise ValueError("Premature end of file in seq section") 1030 assert line[0:1] == plus_char 1031 #Find the qual line(s) 1032 qual_len = 0 1033 while line: 1034 if seq_len == qual_len: 1035 #Should be end of record... 1036 pos = handle.tell() 1037 line = handle.readline() 1038 if line and line[0:1] != at_char: 1039 ValueError("Problem with line %s" % repr(line)) 1040 break 1041 else: 1042 line = handle.readline() 1043 data += line 1044 qual_len += len(line.strip()) 1045 if seq_len != qual_len: 1046 raise ValueError("Problem with quality section") 1047 return data
1048 1049 1050 ############################################################################### 1051 1052 _FormatToRandomAccess = {"ace" : SequentialSeqFileRandomAccess, 1053 "embl" : EmblRandomAccess, 1054 "fasta" : SequentialSeqFileRandomAccess, 1055 "fastq" : FastqRandomAccess, #Class handles all three variants 1056 "fastq-sanger" : FastqRandomAccess, #alias of the above 1057 "fastq-solexa" : FastqRandomAccess, 1058 "fastq-illumina" : FastqRandomAccess, 1059 "genbank" : GenBankRandomAccess, 1060 "gb" : GenBankRandomAccess, #alias of the above 1061 "ig" : IntelliGeneticsRandomAccess, 1062 "imgt" : EmblRandomAccess, 1063 "phd" : SequentialSeqFileRandomAccess, 1064 "pir" : SequentialSeqFileRandomAccess, 1065 "sff" : SffRandomAccess, 1066 "sff-trim" : SffTrimedRandomAccess, 1067 "swiss" : SwissRandomAccess, 1068 "tab" : TabRandomAccess, 1069 "qual" : SequentialSeqFileRandomAccess, 1070 "uniprot-xml" : UniprotRandomAccess, 1071 } 1072