Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._index

  1  # Copyright 2009-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Dictionary like indexing of sequence files (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
  9  public interface for this functionality. 
 10   
 11  The basic idea is that we scan over a sequence file, looking for new record 
 12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
 13  use as the record id, ideally without actually parsing the full record. We 
 14  then use a subclassed Python dictionary to record the file offset for the 
 15  record start against the record id. 
 16   
 17  Note that this means full parsing is on demand, so any invalid or problem 
 18  record may not trigger an exception until it is accessed. This is by design. 
 19   
 20  This means our dictionary like objects have in memory ALL the keys (all the 
 21  record identifiers), which shouldn't be a problem even with second generation 
 22  sequencing. If this is an issue later on, storing the keys and offsets in a 
 23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
 24  """ 
 25   
 26  from __future__ import print_function 
 27   
 28  import re 
 29  from Bio._py3k import StringIO 
 30  from Bio._py3k import _bytes_to_string, _as_bytes 
 31   
 32  from Bio import SeqIO 
 33  from Bio import Alphabet 
 34  from Bio.File import _IndexedSeqFileProxy, _open_for_random_access 
 35   
 36  __docformat__ = "restructuredtext en" 
 37   
 38   
39 -class SeqFileRandomAccess(_IndexedSeqFileProxy):
40 - def __init__(self, filename, format, alphabet):
41 self._handle = _open_for_random_access(filename) 42 self._alphabet = alphabet 43 self._format = format 44 # Load the parser class/function once an avoid the dict lookup in each 45 # __getitem__ call: 46 i = SeqIO._FormatToIterator[format] 47 # The following alphabet code is a bit nasty... duplicates logic in 48 # Bio.SeqIO.parse() 49 if alphabet is None: 50 def _parse(handle): 51 """Dynamically generated parser function (PRIVATE).""" 52 return next(i(handle))
53 else: 54 # TODO - Detect alphabet support ONCE at __init__ 55 def _parse(handle): 56 """Dynamically generated parser function (PRIVATE).""" 57 try: 58 return next(i(handle, alphabet=alphabet)) 59 except TypeError: 60 return next(SeqIO._force_alphabet(i(handle), alphabet))
61 self._parse = _parse 62
63 - def get(self, offset):
64 """Returns SeqRecord.""" 65 # Should be overridden for binary file formats etc: 66 return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
67 68 69 #################### 70 # Special indexers # 71 #################### 72 # Anything where the records cannot be read simply by parsing from 73 # the record start. For example, anything requiring information from 74 # a file header - e.g. SFF files where we would need to know the 75 # number of flows.
76 -class SffRandomAccess(SeqFileRandomAccess):
77 """Random access to a Standard Flowgram Format (SFF) file."""
78 - def __init__(self, filename, format, alphabet):
79 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 80 header_length, index_offset, index_length, number_of_reads, \ 81 self._flows_per_read, self._flow_chars, self._key_sequence \ 82 = SeqIO.SffIO._sff_file_header(self._handle)
83
84 - def __iter__(self):
85 """Load any index block in the file, or build it the slow way (PRIVATE).""" 86 if self._alphabet is None: 87 self._alphabet = Alphabet.generic_dna 88 handle = self._handle 89 handle.seek(0) 90 # Alread did this in __init__ but need handle in right place 91 header_length, index_offset, index_length, number_of_reads, \ 92 self._flows_per_read, self._flow_chars, self._key_sequence \ 93 = SeqIO.SffIO._sff_file_header(handle) 94 if index_offset and index_length: 95 # There is an index provided, try this the fast way: 96 count = 0 97 max_offset = 0 98 try: 99 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle): 100 max_offset = max(max_offset, offset) 101 yield name, offset, 0 102 count += 1 103 assert count == number_of_reads, \ 104 "Indexed %i records, expected %i" \ 105 % (count, number_of_reads) 106 # If that worked, call _check_eof ... 107 except ValueError as err: 108 import warnings 109 from Bio import BiopythonParserWarning 110 warnings.warn("Could not parse the SFF index: %s" % err, 111 BiopythonParserWarning) 112 assert count == 0, "Partially populated index" 113 handle.seek(0) 114 # Drop out to the slow way... 115 else: 116 # Fast way worked, check EOF 117 if index_offset + index_length <= max_offset: 118 # Can have an index at start (or mid-file) 119 handle.seek(max_offset) 120 # Parse the final read, 121 SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read) 122 # Should now be at the end of the file! 123 SeqIO.SffIO._check_eof(handle, index_offset, index_length) 124 return 125 # We used to give a warning in this case, but Ion Torrent's 126 # SFF files don't have an index so that would be annoying. 127 # Fall back on the slow way! 128 count = 0 129 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle): 130 yield name, offset, 0 131 count += 1 132 assert count == number_of_reads, \ 133 "Indexed %i records, expected %i" % (count, number_of_reads) 134 SeqIO.SffIO._check_eof(handle, index_offset, index_length)
135
136 - def get(self, offset):
137 handle = self._handle 138 handle.seek(offset) 139 return SeqIO.SffIO._sff_read_seq_record(handle, 140 self._flows_per_read, 141 self._flow_chars, 142 self._key_sequence, 143 self._alphabet)
144
145 - def get_raw(self, offset):
146 handle = self._handle 147 handle.seek(offset) 148 return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
149 150
151 -class SffTrimedRandomAccess(SffRandomAccess):
152 - def get(self, offset):
153 handle = self._handle 154 handle.seek(offset) 155 return SeqIO.SffIO._sff_read_seq_record(handle, 156 self._flows_per_read, 157 self._flow_chars, 158 self._key_sequence, 159 self._alphabet, 160 trim=True)
161 162 163 ################### 164 # Simple indexers # 165 ################### 166
167 -class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
168 - def __init__(self, filename, format, alphabet):
169 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 170 marker = {"ace": "CO ", 171 "embl": "ID ", 172 "fasta": ">", 173 "genbank": "LOCUS ", 174 "gb": "LOCUS ", 175 "imgt": "ID ", 176 "phd": "BEGIN_SEQUENCE", 177 "pir": ">..;", 178 "qual": ">", 179 "qual": ">", 180 "swiss": "ID ", 181 "uniprot-xml": "<entry ", 182 }[format] 183 self._marker = marker 184 self._marker_re = re.compile(_as_bytes("^%s" % marker))
185
186 - def __iter__(self):
187 """Returns (id,offset) tuples.""" 188 marker_offset = len(self._marker) 189 marker_re = self._marker_re 190 handle = self._handle 191 handle.seek(0) 192 # Skip and header before first record 193 while True: 194 start_offset = handle.tell() 195 line = handle.readline() 196 if marker_re.match(line) or not line: 197 break 198 # Should now be at the start of a record, or end of the file 199 while marker_re.match(line): 200 # Here we can assume the record.id is the first word after the 201 # marker. This is generally fine... but not for GenBank, EMBL, Swiss 202 id = line[marker_offset:].strip().split(None, 1)[0] 203 length = len(line) 204 while True: 205 end_offset = handle.tell() 206 line = handle.readline() 207 if marker_re.match(line) or not line: 208 yield _bytes_to_string(id), start_offset, length 209 start_offset = end_offset 210 break 211 else: 212 # Track this explicitly as can't do file offset difference on BGZF 213 length += len(line) 214 assert not line, repr(line)
215
216 - def get_raw(self, offset):
217 """Similar to the get method, but returns the record as a raw string.""" 218 # For non-trivial file formats this must be over-ridden in the subclass 219 handle = self._handle 220 marker_re = self._marker_re 221 handle.seek(offset) 222 lines = [handle.readline()] 223 while True: 224 line = handle.readline() 225 if marker_re.match(line) or not line: 226 # End of file, or start of next record => end of this record 227 break 228 lines.append(line) 229 return _as_bytes("").join(lines)
230 231 232 ####################################### 233 # Fiddly indexers: GenBank, EMBL, ... # 234 ####################################### 235
236 -class GenBankRandomAccess(SequentialSeqFileRandomAccess):
237 """Indexed dictionary like access to a GenBank file."""
238 - def __iter__(self):
239 handle = self._handle 240 handle.seek(0) 241 marker_re = self._marker_re 242 dot_char = _as_bytes(".") 243 accession_marker = _as_bytes("ACCESSION ") 244 version_marker = _as_bytes("VERSION ") 245 # Skip and header before first record 246 while True: 247 start_offset = handle.tell() 248 line = handle.readline() 249 if marker_re.match(line) or not line: 250 break 251 # Should now be at the start of a record, or end of the file 252 while marker_re.match(line): 253 # We cannot assume the record.id is the first word after LOCUS, 254 # normally the first entry on the VERSION or ACCESSION line is used. 255 key = None 256 length = len(line) 257 while True: 258 end_offset = handle.tell() 259 line = handle.readline() 260 if marker_re.match(line) or not line: 261 if not key: 262 raise ValueError( 263 "Did not find ACCESSION/VERSION lines") 264 yield _bytes_to_string(key), start_offset, length 265 start_offset = end_offset 266 break 267 elif line.startswith(accession_marker): 268 key = line.rstrip().split()[1] 269 elif line.startswith(version_marker): 270 version_id = line.rstrip().split()[1] 271 if version_id.count(dot_char) == 1 and version_id.split(dot_char)[1].isdigit(): 272 # This should mimic the GenBank parser... 273 key = version_id 274 length += len(line) 275 assert not line, repr(line)
276 277
278 -class EmblRandomAccess(SequentialSeqFileRandomAccess):
279 """Indexed dictionary like access to an EMBL file."""
280 - def __iter__(self):
281 handle = self._handle 282 handle.seek(0) 283 marker_re = self._marker_re 284 semi_char = _as_bytes(";") 285 dot_char = _as_bytes(".") 286 sv_marker = _as_bytes("SV ") 287 ac_marker = _as_bytes("AC ") 288 # Skip any header before first record 289 while True: 290 start_offset = handle.tell() 291 line = handle.readline() 292 if marker_re.match(line) or not line: 293 break 294 # Should now be at the start of a record, or end of the file 295 while marker_re.match(line): 296 # We cannot assume the record.id is the first word after ID, 297 # normally the SV line is used. 298 setbysv = False # resets sv as false 299 length = len(line) 300 if line[2:].count(semi_char) == 6: 301 # Looks like the semi colon separated style introduced in 2006 302 parts = line[3:].rstrip().split(semi_char) 303 if parts[1].strip().startswith(sv_marker): 304 # The SV bit gives the version 305 key = parts[0].strip() + dot_char + \ 306 parts[1].strip().split()[1] 307 setbysv = True 308 else: 309 key = parts[0].strip() 310 elif line[2:].count(semi_char) == 3: 311 # Looks like the pre 2006 style, take first word only 312 key = line[3:].strip().split(None, 1)[0] 313 if key.endswith(semi_char): 314 key = key[:-1] 315 else: 316 raise ValueError( 317 'Did not recognise the ID line layout:\n' + line) 318 while True: 319 end_offset = handle.tell() 320 line = handle.readline() 321 if marker_re.match(line) or not line: 322 end_offset = handle.tell() - len(line) 323 yield _bytes_to_string(key), start_offset, length 324 start_offset = end_offset 325 break 326 elif line.startswith(ac_marker) and not setbysv: 327 key = line.rstrip().split()[1] 328 if key.endswith(semi_char): 329 key = key[:-1] 330 elif line.startswith(sv_marker): 331 key = line.rstrip().split()[1] 332 setbysv = True 333 length += len(line) 334 assert not line, repr(line)
335 336
337 -class SwissRandomAccess(SequentialSeqFileRandomAccess):
338 """Random access to a SwissProt file."""
339 - def __iter__(self):
340 handle = self._handle 341 handle.seek(0) 342 marker_re = self._marker_re 343 semi_char = _as_bytes(";") 344 # Skip any header before first record 345 while True: 346 start_offset = handle.tell() 347 line = handle.readline() 348 if marker_re.match(line) or not line: 349 break 350 # Should now be at the start of a record, or end of the file 351 while marker_re.match(line): 352 length = len(line) 353 # We cannot assume the record.id is the first word after ID, 354 # normally the following AC line is used. 355 line = handle.readline() 356 length += len(line) 357 assert line.startswith(_as_bytes("AC ")) 358 key = line[3:].strip().split(semi_char)[0].strip() 359 while True: 360 end_offset = handle.tell() 361 line = handle.readline() 362 if marker_re.match(line) or not line: 363 yield _bytes_to_string(key), start_offset, length 364 start_offset = end_offset 365 break 366 length += len(line) 367 assert not line, repr(line)
368 369
370 -class UniprotRandomAccess(SequentialSeqFileRandomAccess):
371 """Random access to a UniProt XML file."""
372 - def __iter__(self):
373 handle = self._handle 374 handle.seek(0) 375 marker_re = self._marker_re 376 start_acc_marker = _as_bytes("<accession>") 377 end_acc_marker = _as_bytes("</accession>") 378 end_entry_marker = _as_bytes("</entry>") 379 less_than = _as_bytes("<") 380 # Skip any header before first record 381 while True: 382 start_offset = handle.tell() 383 line = handle.readline() 384 if marker_re.match(line) or not line: 385 break 386 # Should now be at the start of a record, or end of the file 387 while marker_re.match(line): 388 length = len(line) 389 # We expect the next line to be <accession>xxx</accession> 390 # (possibly with leading spaces) 391 # but allow it to be later on within the <entry> 392 key = None 393 while True: 394 line = handle.readline() 395 if key is None and start_acc_marker in line: 396 assert end_acc_marker in line, line 397 key = line[line.find( 398 start_acc_marker) + 11:].split(less_than, 1)[0] 399 length += len(line) 400 elif end_entry_marker in line: 401 end_offset = handle.tell() - len(line) \ 402 + line.find(end_entry_marker) + 8 403 break 404 elif marker_re.match(line) or not line: 405 # Start of next record or end of file 406 raise ValueError("Didn't find end of record") 407 else: 408 length += len(line) 409 if not key: 410 raise ValueError("Did not find <accession> line in bytes %i to %i" 411 % (start_offset, end_offset)) 412 yield _bytes_to_string(key), start_offset, length 413 # Find start of next record 414 while not marker_re.match(line) and line: 415 start_offset = handle.tell() 416 line = handle.readline() 417 assert not line, repr(line)
418
419 - def get_raw(self, offset):
420 """Similar to the get method, but returns the record as a raw string.""" 421 handle = self._handle 422 marker_re = self._marker_re 423 end_entry_marker = _as_bytes("</entry>") 424 handle.seek(offset) 425 data = [handle.readline()] 426 while True: 427 line = handle.readline() 428 i = line.find(end_entry_marker) 429 if i != -1: 430 data.append(line[:i + 8]) 431 break 432 if marker_re.match(line) or not line: 433 # End of file, or start of next record 434 raise ValueError("Didn't find end of record") 435 data.append(line) 436 return _as_bytes("").join(data)
437
438 - def get(self, offset):
439 # TODO - Can we handle this directly in the parser? 440 # This is a hack - use get_raw for <entry>...</entry> and wrap it with 441 # the apparently required XML header and footer. 442 data = """<?xml version='1.0' encoding='UTF-8'?> 443 <uniprot xmlns="http://uniprot.org/uniprot" 444 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 445 xsi:schemaLocation="http://uniprot.org/uniprot 446 http://www.uniprot.org/support/docs/uniprot.xsd"> 447 %s 448 </uniprot> 449 """ % _bytes_to_string(self.get_raw(offset)) 450 # TODO - For consistency, this function should not accept a string: 451 return next(SeqIO.UniprotIO.UniprotIterator(data))
452 453
454 -class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
455 """Random access to a IntelliGenetics file."""
456 - def __init__(self, filename, format, alphabet):
457 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 458 self._marker_re = re.compile(_as_bytes("^;"))
459
460 - def __iter__(self):
461 handle = self._handle 462 handle.seek(0) 463 marker_re = self._marker_re 464 semi_char = _as_bytes(";") 465 while True: 466 offset = handle.tell() 467 line = handle.readline() 468 length = len(line) 469 if marker_re.match(line): 470 # Now look for the first line which doesn't start ";" 471 while True: 472 line = handle.readline() 473 if line[0:1] != semi_char and line.strip(): 474 key = line.split()[0] 475 yield _bytes_to_string(key), offset, length 476 break 477 if not line: 478 raise ValueError("Premature end of file?") 479 length += len(line) 480 elif not line: 481 # End of file 482 break
483
484 - def get_raw(self, offset):
485 handle = self._handle 486 handle.seek(offset) 487 marker_re = self._marker_re 488 lines = [] 489 line = handle.readline() 490 semi_char = _as_bytes(";") 491 while line.startswith(semi_char): 492 lines.append(line) 493 line = handle.readline() 494 while line and not line.startswith(semi_char): 495 lines.append(line) 496 line = handle.readline() 497 return _as_bytes("").join(lines)
498 499
500 -class TabRandomAccess(SeqFileRandomAccess):
501 """Random access to a simple tabbed file."""
502 - def __iter__(self):
503 handle = self._handle 504 handle.seek(0) 505 tab_char = _as_bytes("\t") 506 while True: 507 start_offset = handle.tell() 508 line = handle.readline() 509 if not line: 510 break # End of file 511 try: 512 key = line.split(tab_char)[0] 513 except ValueError as err: 514 if not line.strip(): 515 # Ignore blank lines 516 continue 517 else: 518 raise err 519 else: 520 yield _bytes_to_string(key), start_offset, len(line)
521
522 - def get_raw(self, offset):
523 """Like the get method, but returns the record as a raw string.""" 524 handle = self._handle 525 handle.seek(offset) 526 return handle.readline()
527 528 529 ########################## 530 # Now the FASTQ indexers # 531 ########################## 532
533 -class FastqRandomAccess(SeqFileRandomAccess):
534 """Random access to a FASTQ file (any supported variant). 535 536 With FASTQ the records all start with a "@" line, but so can quality lines. 537 Note this will cope with line-wrapped FASTQ files. 538 """
539 - def __iter__(self):
540 handle = self._handle 541 handle.seek(0) 542 id = None 543 start_offset = handle.tell() 544 line = handle.readline() 545 if not line: 546 # Empty file! 547 return 548 at_char = _as_bytes("@") 549 plus_char = _as_bytes("+") 550 if line[0:1] != at_char: 551 raise ValueError("Problem with FASTQ @ line:\n%r" % line) 552 while line: 553 # assert line[0]=="@" 554 # This record seems OK (so far) 555 id = line[1:].rstrip().split(None, 1)[0] 556 # Find the seq line(s) 557 seq_len = 0 558 length = len(line) 559 while line: 560 line = handle.readline() 561 length += len(line) 562 if line.startswith(plus_char): 563 break 564 seq_len += len(line.strip()) 565 if not line: 566 raise ValueError("Premature end of file in seq section") 567 # assert line[0]=="+" 568 # Find the qual line(s) 569 qual_len = 0 570 while line: 571 if seq_len == qual_len: 572 if seq_len == 0: 573 # Special case, quality line should be just "\n" 574 line = handle.readline() 575 if line.strip(): 576 raise ValueError("Expected blank quality line, not %r" % line) 577 # Should be end of record... 578 end_offset = handle.tell() 579 line = handle.readline() 580 if line and line[0:1] != at_char: 581 raise ValueError("Problem with line %r" % line) 582 break 583 else: 584 line = handle.readline() 585 qual_len += len(line.strip()) 586 length += len(line) 587 if seq_len != qual_len: 588 raise ValueError("Problem with quality section") 589 yield _bytes_to_string(id), start_offset, length 590 start_offset = end_offset
591 # print("EOF") 592
593 - def get_raw(self, offset):
594 """Similar to the get method, but returns the record as a raw string.""" 595 # TODO - Refactor this and the __init__ method to reduce code duplication? 596 handle = self._handle 597 handle.seek(offset) 598 line = handle.readline() 599 data = line 600 at_char = _as_bytes("@") 601 plus_char = _as_bytes("+") 602 if line[0:1] != at_char: 603 raise ValueError("Problem with FASTQ @ line:\n%r" % line) 604 # Find the seq line(s) 605 seq_len = 0 606 while line: 607 line = handle.readline() 608 data += line 609 if line.startswith(plus_char): 610 break 611 seq_len += len(line.strip()) 612 if not line: 613 raise ValueError("Premature end of file in seq section") 614 assert line[0:1] == plus_char 615 # Find the qual line(s) 616 qual_len = 0 617 while line: 618 if seq_len == qual_len: 619 if seq_len == 0: 620 # Special case, quality line should be just "\n" 621 line = handle.readline() 622 if line.strip(): 623 raise ValueError("Expected blank quality line, not %r" % line) 624 data += line 625 # Should be end of record... 626 line = handle.readline() 627 if line and line[0:1] != at_char: 628 raise ValueError("Problem with line %r" % line) 629 break 630 else: 631 line = handle.readline() 632 data += line 633 qual_len += len(line.strip()) 634 if seq_len != qual_len: 635 raise ValueError("Problem with quality section") 636 return data
637 638 639 ############################################################################### 640 641 _FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess, 642 "embl": EmblRandomAccess, 643 "fasta": SequentialSeqFileRandomAccess, 644 "fastq": FastqRandomAccess, # Class handles all three variants 645 "fastq-sanger": FastqRandomAccess, # alias of the above 646 "fastq-solexa": FastqRandomAccess, 647 "fastq-illumina": FastqRandomAccess, 648 "genbank": GenBankRandomAccess, 649 "gb": GenBankRandomAccess, # alias of the above 650 "ig": IntelliGeneticsRandomAccess, 651 "imgt": EmblRandomAccess, 652 "phd": SequentialSeqFileRandomAccess, 653 "pir": SequentialSeqFileRandomAccess, 654 "sff": SffRandomAccess, 655 "sff-trim": SffTrimedRandomAccess, 656 "swiss": SwissRandomAccess, 657 "tab": TabRandomAccess, 658 "qual": SequentialSeqFileRandomAccess, 659 "uniprot-xml": UniprotRandomAccess, 660 } 661