Package Bio :: Module File
[hide private]
[frames] | no frames

Source Code for Module Bio.File

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2009-2013 by Peter Cock. All rights reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Code for more fancy file handles. 
  9   
 10   
 11  Classes: 
 12   
 13  UndoHandle     File object decorator with support for undo-like operations. 
 14   
 15  Additional private classes used in Bio.SeqIO and Bio.SearchIO for indexing 
 16  files are also defined under Bio.File but these are not intended for direct 
 17  use. 
 18  """ 
 19  from __future__ import print_function 
 20   
 21  import codecs 
 22  import os 
 23  import sys 
 24  import contextlib 
 25  import itertools 
 26   
 27  from Bio._py3k import basestring 
 28   
 29  try: 
 30      from collections import UserDict as _dict_base 
 31  except ImportError: 
 32      from UserDict import DictMixin as _dict_base 
 33   
 34  try: 
 35      from sqlite3 import dbapi2 as _sqlite 
 36      from sqlite3 import IntegrityError as _IntegrityError 
 37      from sqlite3 import OperationalError as _OperationalError 
 38  except ImportError: 
 39      #Not present on Jython, but should be included in Python 2.5 
 40      #or later (unless compiled from source without its dependencies) 
 41      #Still want to offer in-memory indexing. 
 42      _sqlite = None 
 43      pass 
@contextlib.contextmanager
def as_handle(handleish, mode='r', **kwargs):
    """Context manager turning a filename or handle into an open handle.

    Accepts the arguments passed to SeqIO and AlignIO read, write and
    parse methods: either file objects or strings.

    When given a string, yields a file handle opened on that path with
    the provided mode; the handle is closed when the manager exits.
    Anything else is yielded unchanged and is *not* closed.

    - handleish - Either a string or file handle
    - mode - Mode to open handleish (used only if handleish is a string)
    - kwargs - Further arguments to pass to open(...)

    Example:

    >>> with as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    False
    >>> fp.close()
    """
    if not isinstance(handleish, basestring):
        # Already a handle-like object; pass it through and leave
        # closing to the caller.
        yield handleish
        return
    # codecs.open understands the 'encoding' keyword on Python 2,
    # the builtin open does not.
    opener = codecs.open if 'encoding' in kwargs else open
    with opener(handleish, mode, **kwargs) as fp:
        yield fp
84
def _open_for_random_access(filename):
    """Open a file in binary mode, detecting BGZF compression (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions. Returns a BgzfReader when the file is BGZF
    compressed, otherwise a plain binary handle rewound to the start.
    """
    raw = open(filename, "rb")
    from . import bgzf
    try:
        return bgzf.BgzfReader(mode="rb", fileobj=raw)
    except ValueError as err:
        # BgzfReader signals "not a BGZF file" via ValueError; anything
        # else would be unexpected.
        assert "BGZF" in str(err)
        # Not a BGZF file after all - rewind for normal binary reading.
        raw.seek(0)
        return raw
100
class UndoHandle(object):
    """A Python handle that adds functionality for saving lines.

    Pushed-back lines are stored in a LIFO fashion and returned again
    before any further reading from the underlying handle.

    Added methods:
    saveline    Save a line to be returned next time.
    peekline    Peek at the next line without consuming it.

    """
    def __init__(self, handle):
        self._handle = handle
        self._saved = []  # pending pushed-back lines; index 0 is next out

    def __iter__(self):
        return self

    def __next__(self):
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    if sys.version_info[0] < 3:
        def next(self):
            """Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def readlines(self, *args, **keywds):
        # Saved lines come first, then whatever remains in the handle.
        all_lines = self._saved + self._handle.readlines(*args, **keywds)
        self._saved = []
        return all_lines

    def readline(self, *args, **keywds):
        if not self._saved:
            return self._handle.readline(*args, **keywds)
        return self._saved.pop(0)

    def read(self, size=-1):
        if size == -1:
            # Read everything: flush all saved lines, then the handle.
            prefix = "".join(self._saved)
            self._saved[:] = []
        else:
            # Consume saved lines (possibly partially) before the handle.
            prefix = ''
            while size > 0 and self._saved:
                head = self._saved[0]
                if len(head) <= size:
                    size -= len(head)
                    prefix += self._saved.pop(0)
                else:
                    prefix += head[:size]
                    self._saved[0] = head[size:]
                    size = 0
        return prefix + self._handle.read(size)

    def saveline(self, line):
        # Empty strings (EOF markers) are deliberately not stored.
        if line:
            self._saved.insert(0, line)

    def peekline(self):
        if self._saved:
            return self._saved[0]
        line = self._handle.readline()
        self.saveline(line)
        return line

    def tell(self):
        # Logical position: underlying position minus what we still owe.
        return self._handle.tell() - sum(len(line) for line in self._saved)

    def seek(self, *args):
        # Seeking invalidates any pushed-back lines.
        self._saved = []
        self._handle.seek(*args)

    def __getattr__(self, attr):
        # Delegate anything we don't define to the wrapped handle.
        return getattr(self._handle, attr)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self._handle.close()
186 187 #The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO 188 #for indexing 189 190 -class _IndexedSeqFileProxy(object):
191 """Base class for file format specific random access (PRIVATE). 192 193 This is subclasses in both Bio.SeqIO for indexing as SeqRecord 194 objects, and in Bio.SearchIO for indexing QueryResult objects. 195 196 Subclasses for each file format should define '__iter__', 'get' 197 and optionally 'get_raw' methods. 198 """ 199
200 - def __iter__(self):
201 """Returns (identifier, offset, length in bytes) tuples. 202 203 The length can be zero where it is not implemented or not 204 possible for a particular file format. 205 """ 206 raise NotImplementedError("Subclass should implement this")
207
208 - def get(self, offset):
209 """Returns parsed object for this entry.""" 210 #Most file formats with self contained records can be handled by 211 #parsing StringIO(_bytes_to_string(self.get_raw(offset))) 212 raise NotImplementedError("Subclass should implement this")
213
214 - def get_raw(self, offset):
215 """Returns bytes string (if implemented for this file format).""" 216 #Should be done by each sub-class (if possible) 217 raise NotImplementedError("Not available for this file format.")
218
219 220 -class _IndexedSeqFileDict(_dict_base):
221 """Read only dictionary interface to a sequential record file. 222 223 This code is used in both Bio.SeqIO for indexing as SeqRecord 224 objects, and in Bio.SearchIO for indexing QueryResult objects. 225 226 Keeps the keys and associated file offsets in memory, reads the file 227 to access entries as objects parsing them on demand. This approach 228 is memory limited, but will work even with millions of records. 229 230 Note duplicate keys are not allowed. If this happens, a ValueError 231 exception is raised. 232 233 As used in Bio.SeqIO, by default the SeqRecord's id string is used 234 as the dictionary key. In Bio.SearchIO, the query's id string is 235 used. This can be changed by suppling an optional key_function, 236 a callback function which will be given the record id and must 237 return the desired key. For example, this allows you to parse 238 NCBI style FASTA identifiers, and extract the GI number to use 239 as the dictionary key. 240 241 Note that this dictionary is essentially read only. You cannot 242 add or change values, pop values, nor clear the dictionary. 243 """
244 - def __init__(self, random_access_proxy, key_function, 245 repr, obj_repr):
246 #Use key_function=None for default value 247 self._proxy = random_access_proxy 248 self._key_function = key_function 249 self._repr = repr 250 self._obj_repr = obj_repr 251 if key_function: 252 offset_iter = ( 253 (key_function(k), o, l) for (k, o, l) in random_access_proxy) 254 else: 255 offset_iter = random_access_proxy 256 offsets = {} 257 for key, offset, length in offset_iter: 258 #Note - we don't store the length because I want to minimise the 259 #memory requirements. With the SQLite backend the length is kept 260 #and is used to speed up the get_raw method (by about 3 times). 261 #The length should be provided by all the current backends except 262 #SFF where there is an existing Roche index we can reuse (very fast 263 #but lacks the record lengths) 264 #assert length or format in ["sff", "sff-trim"], \ 265 # "%s at offset %i given length %r (%s format %s)" \ 266 # % (key, offset, length, filename, format) 267 if key in offsets: 268 self._proxy._handle.close() 269 raise ValueError("Duplicate key '%s'" % key) 270 else: 271 offsets[key] = offset 272 self._offsets = offsets
273
274 - def __repr__(self):
275 return self._repr
276
277 - def __str__(self):
278 #TODO - How best to handle the __str__ for SeqIO and SearchIO? 279 if self: 280 return "{%r : %s(...), ...}" % (list(self.keys())[0], self._obj_repr) 281 else: 282 return "{}"
283
284 - def __contains__(self, key):
285 return key in self._offsets
286
287 - def __len__(self):
288 """How many records are there?""" 289 return len(self._offsets)
290
291 - def items(self):
292 """Iterate over the (key, SeqRecord) items. 293 294 This tries to act like a Python 3 dictionary, and does not return 295 a list of (key, value) pairs due to memory concerns. 296 """ 297 for key in self.__iter__(): 298 yield key, self.__getitem__(key)
299
300 - def values(self):
301 """Iterate over the SeqRecord items. 302 303 This tries to act like a Python 3 dictionary, and does not return 304 a list of value due to memory concerns. 305 """ 306 for key in self.__iter__(): 307 yield self.__getitem__(key)
308
309 - def keys(self):
310 """Iterate over the keys. 311 312 This tries to act like a Python 3 dictionary, and does not return 313 a list of keys due to memory concerns. 314 """ 315 return self.__iter__()
316 317 if hasattr(dict, "iteritems"): 318 #Python 2, also define iteritems etc
319 - def itervalues(self):
320 """Iterate over the SeqRecord) items.""" 321 for key in self.__iter__(): 322 yield self.__getitem__(key)
323
324 - def iteritems(self):
325 """Iterate over the (key, SeqRecord) items.""" 326 for key in self.__iter__(): 327 yield key, self.__getitem__(key)
328
329 - def iterkeys(self):
330 """Iterate over the keys.""" 331 return self.__iter__()
332
333 - def __iter__(self):
334 """Iterate over the keys.""" 335 return iter(self._offsets)
336
337 - def __getitem__(self, key):
338 """x.__getitem__(y) <==> x[y]""" 339 #Pass the offset to the proxy 340 record = self._proxy.get(self._offsets[key]) 341 if self._key_function: 342 key2 = self._key_function(record.id) 343 else: 344 key2 = record.id 345 if key != key2: 346 raise ValueError("Key did not match (%s vs %s)" % (key, key2)) 347 return record
348
349 - def get(self, k, d=None):
350 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" 351 try: 352 return self.__getitem__(k) 353 except KeyError: 354 return d
355
356 - def get_raw(self, key):
357 """Similar to the get method, but returns the record as a raw string. 358 359 If the key is not found, a KeyError exception is raised. 360 361 Note that on Python 3 a bytes string is returned, not a typical 362 unicode string. 363 364 NOTE - This functionality is not supported for every file format. 365 """ 366 #Pass the offset to the proxy 367 return self._proxy.get_raw(self._offsets[key])
368
369 - def __setitem__(self, key, value):
370 """Would allow setting or replacing records, but not implemented.""" 371 raise NotImplementedError("An indexed a sequence file is read only.")
372
373 - def update(self, *args, **kwargs):
374 """Would allow adding more values, but not implemented.""" 375 raise NotImplementedError("An indexed a sequence file is read only.")
376
377 - def pop(self, key, default=None):
378 """Would remove specified record, but not implemented.""" 379 raise NotImplementedError("An indexed a sequence file is read only.")
380
381 - def popitem(self):
382 """Would remove and return a SeqRecord, but not implemented.""" 383 raise NotImplementedError("An indexed a sequence file is read only.")
384
385 - def clear(self):
386 """Would clear dictionary, but not implemented.""" 387 raise NotImplementedError("An indexed a sequence file is read only.")
388
389 - def fromkeys(self, keys, value=None):
390 """A dictionary method which we don't implement.""" 391 raise NotImplementedError("An indexed a sequence file doesn't " 392 "support this.")
393
394 - def copy(self):
395 """A dictionary method which we don't implement.""" 396 raise NotImplementedError("An indexed a sequence file doesn't " 397 "support this.")
398
399 - def close(self):
400 """Close the file handle being used to read the data. 401 402 Once called, further use of the index won't work. The sole purpose 403 of this method is to allow explicit handle closure - for example 404 if you wish to delete the file, on Windows you must first close 405 all open handles to that file. 406 """ 407 self._proxy._handle.close()
408
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool is kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """
    def __init__(self, index_filename, filenames,
                 proxy_factory, format,
                 key_function, repr, max_open=10):
        # Either reuses an existing index database at index_filename, or
        # (if no such file exists) builds a new one from filenames.
        #
        # - index_filename - path of the SQLite database (created if absent)
        # - filenames - iterable of the sequence file names, or None to take
        #   them from an existing index database
        # - proxy_factory - callable giving an _IndexedSeqFileProxy for a
        #   (format, filename) pair; called with format alone it is used as
        #   a truthiness test for format support
        # - format - file format name (may be None when reusing an index)
        # - key_function - optional callback mapping record id to key
        # - repr - string returned by __repr__
        # - max_open - maximum number of proxy file handles kept open
        self._proxy_factory = proxy_factory
        self._repr = repr
        random_access_proxies = {}
        #TODO? - Don't keep filename list in memory (just in DB)?
        #Should save a chunk of memory if dealing with 1000s of files.
        #Furthermore could compare a generator to the DB on reloading
        #(no need to turn it into a list)
        if not _sqlite:
            # Hack for Jython (or if Python is compiled without it)
            from Bio import MissingPythonDependencyError
            raise MissingPythonDependencyError("Requires sqlite3, which is "
                                               "included Python 2.5+")
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator
        if os.path.isfile(index_filename):
            #Reuse the index.
            con = _sqlite.connect(index_filename)
            self._con = con
            #Check the count...
            try:
                count, = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("count",)).fetchone()
                self._length = int(count)
                # A count of -1 is the sentinel written while the index is
                # being built; finding it here means indexing never finished.
                if self._length == -1:
                    con.close()
                    raise ValueError("Unfinished/partial database")
                count, = con.execute(
                    "SELECT COUNT(key) FROM offset_data;").fetchone()
                if self._length != int(count):
                    con.close()
                    raise ValueError("Corrupt database? %i entries not %i"
                                     % (int(count), self._length))
                self._format, = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("format",)).fetchone()
                if format and format != self._format:
                    con.close()
                    raise ValueError("Index file says format %s, not %s"
                                     % (self._format, format))
                self._filenames = [row[0] for row in
                                   con.execute("SELECT name FROM file_data "
                                               "ORDER BY file_number;").fetchall()]
                if filenames and len(filenames) != len(self._filenames):
                    con.close()
                    raise ValueError("Index file says %i files, not %i"
                                     % (len(self._filenames), len(filenames)))
                if filenames and filenames != self._filenames:
                    con.close()
                    raise ValueError("Index file has different filenames")
            except _OperationalError as err:
                # e.g. the expected tables are missing - not our schema.
                con.close()
                raise ValueError("Not a Biopython index database? %s" % err)
            #Now we have the format (from the DB if not given to us),
            if not proxy_factory(self._format):
                con.close()
                raise ValueError("Unsupported format '%s'" % self._format)
        else:
            self._filenames = filenames
            self._format = format
            if not format or not filenames:
                raise ValueError("Filenames to index and format required")
            if not proxy_factory(format):
                raise ValueError("Unsupported format '%s'" % format)
            #Create the index
            con = _sqlite.connect(index_filename)
            self._con = con
            #print("Creating index")
            # Sqlite PRAGMA settings for speed
            con.execute("PRAGMA synchronous=OFF")
            con.execute("PRAGMA locking_mode=EXCLUSIVE")
            #Don't index the key column until the end (faster)
            #con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
            #            "offset INTEGER);")
            con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
            # count = -1 marks the index as under construction; replaced
            # with the real total once indexing completes.
            con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                        ("count", -1))
            con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                        ("format", format))
            #TODO - Record the alphabet?
            #TODO - Record the file size and modified date?
            con.execute(
                "CREATE TABLE file_data (file_number INTEGER, name TEXT);")
            con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
            count = 0
            for i, filename in enumerate(filenames):
                con.execute(
                    "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                    (i, filename))
                random_access_proxy = proxy_factory(format, filename)
                if key_function:
                    offset_iter = ((key_function(
                        k), i, o, l) for (k, o, l) in random_access_proxy)
                else:
                    offset_iter = (
                        (k, i, o, l) for (k, o, l) in random_access_proxy)
                # Insert in batches of 100 offsets to bound memory use.
                while True:
                    batch = list(itertools.islice(offset_iter, 100))
                    if not batch:
                        break
                    #print("Inserting batch of %i offsets, %s ... %s" \
                    #      % (len(batch), batch[0][0], batch[-1][0]))
                    con.executemany(
                        "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                        batch)
                    con.commit()
                    count += len(batch)
                # Keep the handle in the pool if there is room, else close it.
                if len(random_access_proxies) < max_open:
                    random_access_proxies[i] = random_access_proxy
                else:
                    random_access_proxy._handle.close()
            self._length = count
            #print("About to index %i entries" % count)
            try:
                # Creating the UNIQUE index now (rather than up front) is
                # faster, and doubles as the duplicate-key check.
                con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
                            "key_index ON offset_data(key);")
            except _IntegrityError as err:
                self._proxies = random_access_proxies
                self.close()
                con.close()
                raise ValueError("Duplicate key? %s" % err)
            con.execute("PRAGMA locking_mode=NORMAL")
            con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
                        (count, "count"))
            con.commit()
            #print("Index created")
        self._proxies = random_access_proxies
        self._max_open = max_open
        self._index_filename = index_filename
        self._key_function = key_function

    def __repr__(self):
        return self._repr

    def __contains__(self, key):
        return bool(
            self._con.execute("SELECT key FROM offset_data WHERE key=?;",
                              (key,)).fetchone())

    def __len__(self):
        """How many records are there?"""
        # Cached total from the meta_data table; cheaper than COUNT(*).
        return self._length
        #return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]

    def __iter__(self):
        """Iterate over the keys."""
        for row in self._con.execute("SELECT key FROM offset_data;"):
            yield str(row[0])

    if hasattr(dict, "iteritems"):
        #Python 2, use iteritems but not items etc
        #Just need to override this...
        def keys(self):
            """Return a list of all the keys (SeqRecord identifiers)."""
            return [str(row[0]) for row in
                    self._con.execute("SELECT key FROM offset_data;").fetchall()]

    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y]"""
        #Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                #Close an old handle...
                proxies.popitem()[1]._handle.close()
            #Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        # Sanity check that the indexed file has not changed under us.
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError("Key did not match (%s vs %s)" % (key, key2))
        return record

    def get(self, k, d=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self.__getitem__(k)
        except KeyError:
            return d

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string.

        If the key is not found, a KeyError exception is raised.

        Note that on Python 3 a bytes string is returned, not a typical
        unicode string.

        NOTE - This functionality is not supported for every file format.
        """
        #Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                #Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxies[file_number].get_raw(offset)
        else:
            #This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                #Close an old handle...
                proxies.popitem()[1]._handle.close()
            #Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                #Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()