Package Bio :: Module File
[hide private]
[frames | no frames]

Source Code for Module Bio.File

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2009-2012 by Peter Cock. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Code for more fancy file handles. 
  8   
  9   
 10  Classes: 
 11   
 12  UndoHandle     File object decorator with support for undo-like operations. 
 13   
 14  Additional private classes used in Bio.SeqIO and Bio.SearchIO for indexing 
 15  files are also defined under Bio.File but these are not intended for direct 
 16  use. 
 17  """ 
 18  # For with statement in Python 2.5 
 19  from __future__ import with_statement 
 20  import codecs 
 21  import os 
 22  import contextlib 
 23  import StringIO 
 24  import itertools 
 25   
 26  try: 
 27      from collections import UserDict as _dict_base 
 28  except ImportError: 
 29      from UserDict import DictMixin as _dict_base 
 30   
 31  try: 
 32      from sqlite3 import dbapi2 as _sqlite 
 33      from sqlite3 import IntegrityError as _IntegrityError 
 34      from sqlite3 import OperationalError as _OperationalError 
 35  except ImportError: 
 36      #Not present on Jython, but should be included in Python 2.5 
 37      #or later (unless compiled from source without its dependencies) 
 38      #Still want to offer in-memory indexing. 
 39      _sqlite = None 
 40      pass 
@contextlib.contextmanager
def as_handle(handleish, mode='r', **kwargs):
    r"""Context manager for arguments that may be file names or file handles.

    Intended for arguments that can be passed to the SeqIO and AlignIO
    read, write, and parse methods: either file objects or strings.

    When given a string, yields a file handle open to handleish with the
    provided mode, which will be closed when the manager exits.

    All other inputs are yielded unchanged, and are *not* closed.

    - handleish  - Either a string (file name) or a file handle
    - mode       - Mode to open handleish (used only if handleish is a string)
    - kwargs     - Further arguments to pass to open(...). If an 'encoding'
                   keyword is present the file is opened with codecs.open(...)
                   instead of the built-in open(...).

    Example:

    >>> with as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    False
    >>> fp.close()
    """
    # basestring only exists on Python 2 (where it covers str and unicode);
    # fall back to str so the check also works on an unconverted Python 3 run.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(handleish, string_types):
        if 'encoding' in kwargs:
            with codecs.open(handleish, mode, **kwargs) as fp:
                yield fp
        else:
            with open(handleish, mode, **kwargs) as fp:
                yield fp
    else:
        # Already a handle (or handle-like object): the caller owns it,
        # so hand it back untouched and leave closing to them.
        yield handleish
81
def _open_for_random_access(filename):
    """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions.

    Returns a bgzf.BgzfReader wrapping the file if the BGZF reader accepts
    it, otherwise the plain binary handle rewound to the start of the file.

    A ValueError from the BGZF reader whose message does not mention "BGZF"
    is not treated as "not a BGZF file": the handle is closed and the
    error is re-raised.
    """
    import sys
    # Import before opening the file so a failed import cannot leak a handle.
    from Bio import bgzf
    handle = open(filename, "rb")
    try:
        return bgzf.BgzfReader(mode="rb", fileobj=handle)
    except ValueError:
        # sys.exc_info() instead of "except ValueError, e" / "as e" keeps
        # this block valid syntax on both Python 2.5 and Python 3.
        err = sys.exc_info()[1]
        if "BGZF" not in str(err):
            # Explicit check rather than an assert, which would vanish
            # under "python -O" and let unrelated errors pass silently.
            handle.close()
            raise
        #Not a BGZF file after all, rewind to start:
        handle.seek(0)
        return handle
97
class UndoHandle(object):
    """A Python handle that adds functionality for saving lines.

    Saves lines in a LIFO fashion.

    Added methods:
    saveline    Save a line to be returned next time.
    peekline    Peek at the next line without consuming it.

    Any attribute not defined here is looked up on the wrapped handle.
    """
    def __init__(self, handle):
        """Wrap handle; no lines are saved initially."""
        self._handle = handle
        # Pushed-back lines, next line to be returned first.
        self._saved = []

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line, raising StopIteration at end of file."""
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    # Python 2 spells the iterator protocol method "next".
    next = __next__

    def readlines(self, *args, **keywds):
        """Return all saved lines followed by the handle's remaining lines."""
        lines = self._saved + self._handle.readlines(*args, **keywds)
        self._saved = []
        return lines

    def readline(self, *args, **keywds):
        """Return the next saved line if any, else read one from the handle.

        The arguments are only passed on when reading from the handle; a
        saved line is returned whole.
        """
        if self._saved:
            line = self._saved.pop(0)
        else:
            line = self._handle.readline(*args, **keywds)
        return line

    def read(self, size=-1):
        """Read up to size characters, consuming saved lines first.

        A negative size or None means "everything": all saved lines
        followed by whatever the handle's own read(size) returns.
        """
        if size is None or size < 0:
            # Any "read it all" request must drain the saved lines first,
            # otherwise they would be returned after later file data.
            saved = "".join(self._saved)
            self._saved[:] = []
        else:
            saved = ''
            while size > 0 and self._saved:
                if len(self._saved[0]) <= size:
                    size = size - len(self._saved[0])
                    saved = saved + self._saved.pop(0)
                else:
                    # Only part of the first saved line is wanted; keep
                    # the remainder saved for the next call.
                    saved = saved + self._saved[0][:size]
                    self._saved[0] = self._saved[0][size:]
                    size = 0
        return saved + self._handle.read(size)

    def saveline(self, line):
        """Push line back so it is the next one returned (empty is ignored)."""
        if line:
            self._saved = [line] + self._saved

    def peekline(self):
        """Return the next line without consuming it."""
        if self._saved:
            line = self._saved[0]
        else:
            line = self._handle.readline()
            self.saveline(line)
        return line

    def tell(self):
        """Return the handle's offset minus the length of the saved lines."""
        return self._handle.tell() - sum(map(len, self._saved))

    def seek(self, *args):
        """Discard any saved lines and seek the underlying handle."""
        self._saved = []
        self._handle.seek(*args)

    def __getattr__(self, attr):
        # Only called for names not found on UndoHandle itself.
        return getattr(self._handle, attr)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self._handle.close()
179
#The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
#for indexing

class _IndexedSeqFileProxy(object):
    """Base class for file format specific random access (PRIVATE).

    This is subclassed in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    A subclass for a given file format is expected to provide '__iter__'
    and 'get', and may optionally provide 'get_raw'. Every method here
    simply raises NotImplementedError.
    """

    def __iter__(self):
        """Returns (identifier, offset, length in bytes) tuples.

        A length of zero is allowed where it is not implemented or not
        possible for a particular file format.
        """
        message = "Subclass should implement this"
        raise NotImplementedError(message)

    def get(self, offset):
        """Returns parsed object for the entry at this offset."""
        #Most file formats with self contained records can be handled by
        #parsing StringIO(_bytes_to_string(self.get_raw(offset)))
        message = "Subclass should implement this"
        raise NotImplementedError(message)

    def get_raw(self, offset):
        """Returns bytes string (if implemented for this file format)."""
        #Should be done by each sub-class (if possible)
        message = "Not available for this file format."
        raise NotImplementedError(message)
212
class _IndexedSeqFileDict(_dict_base):
    """Read only dictionary interface to a sequential record file.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys and associated file offsets in memory, reads the file
    to access entries as objects parsing them on demand. This approach
    is memory limited, but will work even with millions of records.

    Note duplicate keys are not allowed. If this happens, a ValueError
    exception is raised.

    As used in Bio.SeqIO, by default the SeqRecord's id string is used
    as the dictionary key. In Bio.SearchIO, the query's id string is
    used. This can be changed by supplying an optional key_function,
    a callback function which will be given the record id and must
    return the desired key. For example, this allows you to parse
    NCBI style FASTA identifiers, and extract the GI number to use
    as the dictionary key.

    Note that this dictionary is essentially read only. You cannot
    add or change values, pop values, nor clear the dictionary.
    """
    def __init__(self, random_access_proxy, key_function,
                 repr, obj_repr):
        """Build the in-memory key to offset mapping from the proxy.

        - random_access_proxy - iterable of (key, offset, length) tuples
                                which also provides get(offset) and
                                get_raw(offset)
        - key_function        - optional callback mapping a record id to
                                the dictionary key (None for no mapping)
        - repr                - string returned by __repr__
        - obj_repr            - record type name shown by __str__

        Raises ValueError on a duplicate key, after closing the proxy's
        handle.
        """
        #Use key_function=None for default value
        self._proxy = random_access_proxy
        self._key_function = key_function
        self._repr = repr
        self._obj_repr = obj_repr
        if key_function:
            offset_iter = (
                (key_function(k), o, l) for (k, o, l) in random_access_proxy)
        else:
            offset_iter = random_access_proxy
        offsets = {}
        for key, offset, length in offset_iter:
            #Note - we don't store the length because I want to minimise the
            #memory requirements. With the SQLite backend the length is kept
            #and is used to speed up the get_raw method (by about 3 times).
            #The length should be provided by all the current backends except
            #SFF where there is an existing Roche index we can reuse (very fast
            #but lacks the record lengths)
            #assert length or format in ["sff", "sff-trim"], \
            #       "%s at offset %i given length %r (%s format %s)" \
            #       % (key, offset, length, filename, format)
            if key in offsets:
                self._proxy._handle.close()
                raise ValueError("Duplicate key '%s'" % key)
            else:
                offsets[key] = offset
        self._offsets = offsets

    def __repr__(self):
        return self._repr

    def __str__(self):
        """Return a short summary showing one key, or "{}" if empty."""
        #TODO - How best to handle the __str__ for SeqIO and SearchIO?
        # Take the first key by iterating rather than indexing keys():
        # keys() is an iterator in the Python 3 branch below (so cannot be
        # subscripted), and a full key list is not needed to show one key.
        for key in self:
            return "{%r : %s(...), ...}" % (key, self._obj_repr)
        return "{}"

    def __contains__(self, key):
        return key in self._offsets

    def __len__(self):
        """How many records are there?"""
        return len(self._offsets)

    if hasattr(dict, "iteritems"):
        #Python 2, use iteritems but not items etc
        def values(self):
            """Would be a list of the SeqRecord objects, but not implemented.

            In general you can be indexing very very large files, with millions
            of sequences. Loading all these into memory at once as SeqRecord
            objects would (probably) use up all the RAM. Therefore we simply
            don't support this dictionary method.
            """
            raise NotImplementedError("Due to memory concerns, when indexing a "
                                      "sequence file you cannot access all the "
                                      "records at once.")

        def items(self):
            """Would be a list of the (key, SeqRecord) tuples, but not implemented.

            In general you can be indexing very very large files, with millions
            of sequences. Loading all these into memory at once as SeqRecord
            objects would (probably) use up all the RAM. Therefore we simply
            don't support this dictionary method.
            """
            raise NotImplementedError("Due to memory concerns, when indexing a "
                                      "sequence file you cannot access all the "
                                      "records at once.")

        def keys(self):
            """Return a list of all the keys (SeqRecord identifiers)."""
            #TODO - Stick a warning in here for large lists? Or just refuse?
            return self._offsets.keys()

        def itervalues(self):
            """Iterate over the SeqRecord items."""
            for key in self.__iter__():
                yield self.__getitem__(key)

        def iteritems(self):
            """Iterate over the (key, SeqRecord) items."""
            for key in self.__iter__():
                yield key, self.__getitem__(key)

        def iterkeys(self):
            """Iterate over the keys."""
            return self.__iter__()

    else:
        #Python 3 - define items and values as iterators
        def items(self):
            """Iterate over the (key, SeqRecord) items."""
            for key in self.__iter__():
                yield key, self.__getitem__(key)

        def values(self):
            """Iterate over the SeqRecord items."""
            for key in self.__iter__():
                yield self.__getitem__(key)

        def keys(self):
            """Iterate over the keys."""
            return self.__iter__()

    def __iter__(self):
        """Iterate over the keys."""
        return iter(self._offsets)

    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y]

        Raises KeyError for an unknown key, and ValueError if the record
        parsed at the stored offset does not map back to the same key.
        """
        #Pass the offset to the proxy
        record = self._proxy.get(self._offsets[key])
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError("Key did not match (%s vs %s)" % (key, key2))
        return record

    def get(self, k, d=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        try:
            return self.__getitem__(k)
        except KeyError:
            return d

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string.

        If the key is not found, a KeyError exception is raised.

        Note that on Python 3 a bytes string is returned, not a typical
        unicode string.

        NOTE - This functionality is not supported for every file format.
        """
        #Pass the offset to the proxy
        return self._proxy.get_raw(self._offsets[key])

    def __setitem__(self, key, value):
        """Would allow setting or replacing records, but not implemented."""
        raise NotImplementedError("An indexed a sequence file is read only.")

    def update(self, *args, **kwargs):
        """Would allow adding more values, but not implemented."""
        raise NotImplementedError("An indexed a sequence file is read only.")

    def pop(self, key, default=None):
        """Would remove specified record, but not implemented."""
        raise NotImplementedError("An indexed a sequence file is read only.")

    def popitem(self):
        """Would remove and return a SeqRecord, but not implemented."""
        raise NotImplementedError("An indexed a sequence file is read only.")

    def clear(self):
        """Would clear dictionary, but not implemented."""
        raise NotImplementedError("An indexed a sequence file is read only.")

    def fromkeys(self, keys, value=None):
        """A dictionary method which we don't implement."""
        raise NotImplementedError("An indexed a sequence file doesn't "
                                  "support this.")

    def copy(self):
        """A dictionary method which we don't implement."""
        raise NotImplementedError("An indexed a sequence file doesn't "
                                  "support this.")
411
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool is kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """
    # NOTE(review): the inherited __str__ reads self._obj_repr, which this
    # class's __init__ never sets - confirm whether str() is expected to
    # work on instances of this class.

    def __init__(self, index_filename, filenames,
                 proxy_factory, format,
                 key_function, repr, max_open=10):
        """Load an existing SQLite index, or build a new one.

        - index_filename - path of the SQLite index; reused if the file
                           exists, otherwise created
        - filenames      - files to index (any iterable, or None when
                           reloading an existing index)
        - proxy_factory  - called as proxy_factory(format) to check the
                           format is supported, and as
                           proxy_factory(format, filename) to get a random
                           access proxy for one file
        - format         - file format name (may be false when reloading,
                           in which case the value stored in the index is
                           used)
        - key_function   - optional callback mapping a record id to the key
        - repr           - string returned by __repr__
        - max_open       - maximum number of file proxies kept open

        Raises ValueError if an existing index is unfinished, corrupt, not
        a Biopython index, or disagrees with the given format/filenames;
        if the format is unsupported; if format or filenames are missing
        when building; or if building hits a duplicate key. Raises
        MissingPythonDependencyError if sqlite3 is unavailable.
        """
        # Only needed for sys.exc_info() below, which (unlike
        # "except X, err" or "except X as err") is valid syntax on both
        # Python 2.5 and Python 3.
        import sys
        self._proxy_factory = proxy_factory
        self._repr = repr
        random_access_proxies = {}
        #TODO? - Don't keep filename list in memory (just in DB)?
        #Should save a chunk of memory if dealing with 1000s of files.
        #Furthermore could compare a generator to the DB on reloading
        #(no need to turn it into a list)
        if not _sqlite:
            # Hack for Jython (or if Python is compiled without it)
            from Bio import MissingPythonDependencyError
            raise MissingPythonDependencyError("Requires sqlite3, which is "
                                               "included Python 2.5+")
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator
        if os.path.isfile(index_filename):
            #Reuse the index.
            con = _sqlite.connect(index_filename)
            self._con = con
            #Check the count...
            try:
                count, = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("count",)).fetchone()
                self._length = int(count)
                # -1 is the placeholder written when index creation starts;
                # it is only replaced once the index has been completed.
                if self._length == -1:
                    con.close()
                    raise ValueError("Unfinished/partial database")
                count, = con.execute(
                    "SELECT COUNT(key) FROM offset_data;").fetchone()
                if self._length != int(count):
                    con.close()
                    raise ValueError("Corrupt database? %i entries not %i"
                                     % (int(count), self._length))
                self._format, = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("format",)).fetchone()
                if format and format != self._format:
                    con.close()
                    raise ValueError("Index file says format %s, not %s"
                                     % (self._format, format))
                self._filenames = [row[0] for row in
                                   con.execute("SELECT name FROM file_data "
                                               "ORDER BY file_number;").fetchall()]
                if filenames and len(filenames) != len(self._filenames):
                    con.close()
                    raise ValueError("Index file says %i files, not %i"
                                     % (len(self._filenames), len(filenames)))
                if filenames and filenames != self._filenames:
                    con.close()
                    raise ValueError("Index file has different filenames")
            except _OperationalError:
                err = sys.exc_info()[1]
                con.close()
                raise ValueError("Not a Biopython index database? %s" % err)
            #Now we have the format (from the DB if not given to us),
            if not proxy_factory(self._format):
                con.close()
                raise ValueError("Unsupported format '%s'" % self._format)
        else:
            self._filenames = filenames
            self._format = format
            if not format or not filenames:
                raise ValueError("Filenames to index and format required")
            if not proxy_factory(format):
                raise ValueError("Unsupported format '%s'" % format)
            #Create the index
            con = _sqlite.connect(index_filename)
            self._con = con
            # Sqlite PRAGMA settings for speed
            con.execute("PRAGMA synchronous=OFF")
            con.execute("PRAGMA locking_mode=EXCLUSIVE")
            #Don't index the key column until the end (faster)
            #con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
            #            "offset INTEGER);")
            con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
            # Placeholder count, replaced by the real value at the very end.
            con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                        ("count", -1))
            con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                        ("format", format))
            #TODO - Record the alphabet?
            #TODO - Record the file size and modified date?
            con.execute(
                "CREATE TABLE file_data (file_number INTEGER, name TEXT);")
            con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
            count = 0
            for i, filename in enumerate(filenames):
                con.execute(
                    "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                    (i, filename))
                random_access_proxy = proxy_factory(format, filename)
                if key_function:
                    offset_iter = ((key_function(
                        k), i, o, l) for (k, o, l) in random_access_proxy)
                else:
                    offset_iter = (
                        (k, i, o, l) for (k, o, l) in random_access_proxy)
                # Insert in batches of 100 rows, committing after each batch.
                while True:
                    batch = list(itertools.islice(offset_iter, 100))
                    if not batch:
                        break
                    con.executemany(
                        "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                        batch)
                    con.commit()
                    count += len(batch)
                # Keep at most max_open proxies open for later lookups.
                if len(random_access_proxies) < max_open:
                    random_access_proxies[i] = random_access_proxy
                else:
                    random_access_proxy._handle.close()
            self._length = count
            try:
                con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
                            "key_index ON offset_data(key);")
            except _IntegrityError:
                err = sys.exc_info()[1]
                # close() works on self._proxies, so set it before calling.
                self._proxies = random_access_proxies
                self.close()
                con.close()
                raise ValueError("Duplicate key? %s" % err)
            con.execute("PRAGMA locking_mode=NORMAL")
            con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
                        (count, "count"))
            con.commit()
        self._proxies = random_access_proxies
        self._max_open = max_open
        self._index_filename = index_filename
        self._key_function = key_function

    def __repr__(self):
        return self._repr

    def __contains__(self, key):
        return bool(
            self._con.execute("SELECT key FROM offset_data WHERE key=?;",
                              (key,)).fetchone())

    def __len__(self):
        """How many records are there?"""
        # Uses the count established in __init__ rather than running
        # "SELECT COUNT(key) FROM offset_data;" on every call.
        return self._length

    def __iter__(self):
        """Iterate over the keys."""
        for row in self._con.execute("SELECT key FROM offset_data;"):
            yield str(row[0])

    if hasattr(dict, "iteritems"):
        #Python 2, use iteritems but not items etc
        #Just need to override this...
        def keys(self):
            """Return a list of all the keys (SeqRecord identifiers)."""
            return [str(row[0]) for row in
                    self._con.execute("SELECT key FROM offset_data;").fetchall()]

    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y]

        Raises KeyError (carrying the key) if the key is not in the index,
        and ValueError if the record parsed at the stored offset does not
        map back to the same key.
        """
        #Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError(key)
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                #Close an old handle...
                proxies.popitem()[1]._handle.close()
            #Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError("Key did not match (%s vs %s)" % (key, key2))
        return record

    def get(self, k, d=None):
        """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
        try:
            return self.__getitem__(k)
        except KeyError:
            return d

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string.

        If the key is not found, a KeyError exception is raised.

        Note that on Python 3 a bytes string is returned, not a typical
        unicode string.

        NOTE - This functionality is not supported for every file format.
        """
        #Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError(key)
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                #Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxies[file_number].get_raw(offset)
        else:
            #This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                #Close an old handle...
                proxies.popitem()[1]._handle.close()
            #Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                #Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()
669