Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._index

  1  # Copyright 2009-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Dictionary like indexing of sequence files (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
  9  public interface for this functionality. 
 10   
 11  The basic idea is that we scan over a sequence file, looking for new record 
 12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
 13  use as the record id, ideally without actually parsing the full record. We 
 14  then use a subclassed Python dictionary to record the file offset for the 
 15  record start against the record id. 
 16   
 17  Note that this means full parsing is on demand, so any invalid or problem 
 18  record may not trigger an exception until it is accessed. This is by design. 
 19   
 20  This means our dictionary like objects have in memory ALL the keys (all the 
 21  record identifiers), which shouldn't be a problem even with second generation 
 22  sequencing. If this is an issue later on, storing the keys and offsets in a 
 23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
 24  """ 
 25   
 26  import re 
 27  from StringIO import StringIO 
 28   
 29  from Bio._py3k import _bytes_to_string, _as_bytes 
 30   
 31  from Bio import SeqIO 
 32  from Bio import Alphabet 
 33  from Bio import bgzf 
 34  from Bio.File import _IndexedSeqFileProxy, _open_for_random_access 
 35   
 36   
37 -class SeqFileRandomAccess(_IndexedSeqFileProxy):
38 - def __init__(self, filename, format, alphabet):
39 self._handle = _open_for_random_access(filename) 40 self._alphabet = alphabet 41 self._format = format 42 #Load the parser class/function once an avoid the dict lookup in each 43 #__getitem__ call: 44 i = SeqIO._FormatToIterator[format] 45 #The following alphabet code is a bit nasty... duplicates logic in 46 #Bio.SeqIO.parse() 47 if alphabet is None: 48 def _parse(handle): 49 """Dynamically generated parser function (PRIVATE).""" 50 return i(handle).next()
51 else: 52 #TODO - Detect alphabet support ONCE at __init__ 53 def _parse(handle): 54 """Dynamically generated parser function (PRIVATE).""" 55 try: 56 return i(handle, alphabet=alphabet).next() 57 except TypeError: 58 return SeqIO._force_alphabet(i(handle), 59 alphabet).next()
60 self._parse = _parse 61
62 - def get(self, offset):
63 """Returns SeqRecord.""" 64 #Should be overridden for binary file formats etc: 65 return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
66 67 68 #################### 69 # Special indexers # 70 #################### 71 # Anything where the records cannot be read simply by parsing from 72 # the record start. For example, anything requiring information from 73 # a file header - e.g. SFF files where we would need to know the 74 # number of flows.
75 -class SffRandomAccess(SeqFileRandomAccess):
76 """Random access to a Standard Flowgram Format (SFF) file."""
77 - def __init__(self, filename, format, alphabet):
78 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 79 header_length, index_offset, index_length, number_of_reads, \ 80 self._flows_per_read, self._flow_chars, self._key_sequence \ 81 = SeqIO.SffIO._sff_file_header(self._handle)
82
83 - def __iter__(self):
84 """Load any index block in the file, or build it the slow way (PRIVATE).""" 85 if self._alphabet is None: 86 self._alphabet = Alphabet.generic_dna 87 handle = self._handle 88 handle.seek(0) 89 #Alread did this in __init__ but need handle in right place 90 header_length, index_offset, index_length, number_of_reads, \ 91 self._flows_per_read, self._flow_chars, self._key_sequence \ 92 = SeqIO.SffIO._sff_file_header(handle) 93 if index_offset and index_length: 94 #There is an index provided, try this the fast way: 95 count = 0 96 try: 97 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle): 98 yield name, offset, 0 99 count += 1 100 assert count == number_of_reads, \ 101 "Indexed %i records, expected %i" \ 102 % (count, number_of_reads) 103 return 104 except ValueError, err: 105 import warnings 106 warnings.warn("Could not parse the SFF index: %s" % err) 107 assert count == 0, "Partially populated index" 108 handle.seek(0) 109 #We used to give a warning in this case, but Ion Torrent's 110 #SFF files don't have an index so that would be annoying. 111 #Fall back on the slow way! 112 count = 0 113 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle): 114 yield name, offset, 0 115 count += 1 116 assert count == number_of_reads, \ 117 "Indexed %i records, expected %i" % (count, number_of_reads)
118
119 - def get(self, offset):
120 handle = self._handle 121 handle.seek(offset) 122 return SeqIO.SffIO._sff_read_seq_record(handle, 123 self._flows_per_read, 124 self._flow_chars, 125 self._key_sequence, 126 self._alphabet)
127
128 - def get_raw(self, offset):
129 handle = self._handle 130 handle.seek(offset) 131 return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
132 133
134 -class SffTrimedRandomAccess(SffRandomAccess):
135 - def get(self, offset):
136 handle = self._handle 137 handle.seek(offset) 138 return SeqIO.SffIO._sff_read_seq_record(handle, 139 self._flows_per_read, 140 self._flow_chars, 141 self._key_sequence, 142 self._alphabet, 143 trim=True)
144 145 146 ################### 147 # Simple indexers # 148 ################### 149
150 -class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
151 - def __init__(self, filename, format, alphabet):
152 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 153 marker = {"ace": "CO ", 154 "embl": "ID ", 155 "fasta": ">", 156 "genbank": "LOCUS ", 157 "gb": "LOCUS ", 158 "imgt": "ID ", 159 "phd": "BEGIN_SEQUENCE", 160 "pir": ">..;", 161 "qual": ">", 162 "qual": ">", 163 "swiss": "ID ", 164 "uniprot-xml": "<entry ", 165 }[format] 166 self._marker = marker 167 self._marker_re = re.compile(_as_bytes("^%s" % marker))
168
169 - def __iter__(self):
170 """Returns (id,offset) tuples.""" 171 marker_offset = len(self._marker) 172 marker_re = self._marker_re 173 handle = self._handle 174 handle.seek(0) 175 #Skip and header before first record 176 while True: 177 start_offset = handle.tell() 178 line = handle.readline() 179 if marker_re.match(line) or not line: 180 break 181 #Should now be at the start of a record, or end of the file 182 while marker_re.match(line): 183 #Here we can assume the record.id is the first word after the 184 #marker. This is generally fine... but not for GenBank, EMBL, Swiss 185 id = line[marker_offset:].strip().split(None, 1)[0] 186 length = len(line) 187 while True: 188 end_offset = handle.tell() 189 line = handle.readline() 190 if marker_re.match(line) or not line: 191 yield _bytes_to_string(id), start_offset, length 192 start_offset = end_offset 193 break 194 else: 195 #Track this explicitly as can't do file offset difference on BGZF 196 length += len(line) 197 assert not line, repr(line)
198
199 - def get_raw(self, offset):
200 """Similar to the get method, but returns the record as a raw string.""" 201 #For non-trivial file formats this must be over-ridden in the subclass 202 handle = self._handle 203 marker_re = self._marker_re 204 handle.seek(offset) 205 lines = [handle.readline()] 206 while True: 207 line = handle.readline() 208 if marker_re.match(line) or not line: 209 #End of file, or start of next record => end of this record 210 break 211 lines.append(line) 212 return _as_bytes("").join(lines)
213 214 215 ####################################### 216 # Fiddly indexers: GenBank, EMBL, ... # 217 ####################################### 218
219 -class GenBankRandomAccess(SequentialSeqFileRandomAccess):
220 """Indexed dictionary like access to a GenBank file."""
221 - def __iter__(self):
222 handle = self._handle 223 handle.seek(0) 224 marker_re = self._marker_re 225 dot_char = _as_bytes(".") 226 accession_marker = _as_bytes("ACCESSION ") 227 version_marker = _as_bytes("VERSION ") 228 #Skip and header before first record 229 while True: 230 start_offset = handle.tell() 231 line = handle.readline() 232 if marker_re.match(line) or not line: 233 break 234 #Should now be at the start of a record, or end of the file 235 while marker_re.match(line): 236 #We cannot assume the record.id is the first word after LOCUS, 237 #normally the first entry on the VERSION or ACCESSION line is used. 238 key = None 239 length = len(line) 240 while True: 241 end_offset = handle.tell() 242 line = handle.readline() 243 if marker_re.match(line) or not line: 244 if not key: 245 raise ValueError( 246 "Did not find ACCESSION/VERSION lines") 247 yield _bytes_to_string(key), start_offset, length 248 start_offset = end_offset 249 break 250 elif line.startswith(accession_marker): 251 key = line.rstrip().split()[1] 252 elif line.startswith(version_marker): 253 version_id = line.rstrip().split()[1] 254 if version_id.count(dot_char) == 1 and version_id.split(dot_char)[1].isdigit(): 255 #This should mimic the GenBank parser... 256 key = version_id 257 length += len(line) 258 assert not line, repr(line)
259 260
261 -class EmblRandomAccess(SequentialSeqFileRandomAccess):
262 """Indexed dictionary like access to an EMBL file."""
263 - def __iter__(self):
264 handle = self._handle 265 handle.seek(0) 266 marker_re = self._marker_re 267 semi_char = _as_bytes(";") 268 dot_char = _as_bytes(".") 269 sv_marker = _as_bytes("SV ") 270 #Skip any header before first record 271 while True: 272 start_offset = handle.tell() 273 line = handle.readline() 274 if marker_re.match(line) or not line: 275 break 276 #Should now be at the start of a record, or end of the file 277 while marker_re.match(line): 278 #We cannot assume the record.id is the first word after ID, 279 #normally the SV line is used. 280 length = len(line) 281 if line[2:].count(semi_char) == 6: 282 #Looks like the semi colon separated style introduced in 2006 283 parts = line[3:].rstrip().split(semi_char) 284 if parts[1].strip().startswith(sv_marker): 285 #The SV bit gives the version 286 key = parts[0].strip() + dot_char + \ 287 parts[1].strip().split()[1] 288 else: 289 key = parts[0].strip() 290 elif line[2:].count(semi_char) == 3: 291 #Looks like the pre 2006 style, take first word only 292 key = line[3:].strip().split(None, 1)[0] 293 else: 294 raise ValueError( 295 'Did not recognise the ID line layout:\n' + line) 296 while True: 297 end_offset = handle.tell() 298 line = handle.readline() 299 if marker_re.match(line) or not line: 300 end_offset = handle.tell() - len(line) 301 yield _bytes_to_string(key), start_offset, length 302 start_offset = end_offset 303 break 304 elif line.startswith(sv_marker): 305 key = line.rstrip().split()[1] 306 length += len(line) 307 assert not line, repr(line)
308 309
310 -class SwissRandomAccess(SequentialSeqFileRandomAccess):
311 """Random access to a SwissProt file."""
312 - def __iter__(self):
313 handle = self._handle 314 handle.seek(0) 315 marker_re = self._marker_re 316 semi_char = _as_bytes(";") 317 #Skip any header before first record 318 while True: 319 start_offset = handle.tell() 320 line = handle.readline() 321 if marker_re.match(line) or not line: 322 break 323 #Should now be at the start of a record, or end of the file 324 while marker_re.match(line): 325 length = len(line) 326 #We cannot assume the record.id is the first word after ID, 327 #normally the following AC line is used. 328 line = handle.readline() 329 length += len(line) 330 assert line.startswith(_as_bytes("AC ")) 331 key = line[3:].strip().split(semi_char)[0].strip() 332 while True: 333 end_offset = handle.tell() 334 line = handle.readline() 335 if marker_re.match(line) or not line: 336 yield _bytes_to_string(key), start_offset, length 337 start_offset = end_offset 338 break 339 length += len(line) 340 assert not line, repr(line)
341 342
343 -class UniprotRandomAccess(SequentialSeqFileRandomAccess):
344 """Random access to a UniProt XML file."""
345 - def __iter__(self):
346 handle = self._handle 347 handle.seek(0) 348 marker_re = self._marker_re 349 start_acc_marker = _as_bytes("<accession>") 350 end_acc_marker = _as_bytes("</accession>") 351 end_entry_marker = _as_bytes("</entry>") 352 less_than = _as_bytes("<") 353 #Skip any header before first record 354 while True: 355 start_offset = handle.tell() 356 line = handle.readline() 357 if marker_re.match(line) or not line: 358 break 359 #Should now be at the start of a record, or end of the file 360 while marker_re.match(line): 361 length = len(line) 362 #We expect the next line to be <accession>xxx</accession> 363 #(possibly with leading spaces) 364 #but allow it to be later on within the <entry> 365 key = None 366 while True: 367 line = handle.readline() 368 if key is None and start_acc_marker in line: 369 assert end_acc_marker in line, line 370 key = line[line.find( 371 start_acc_marker) + 11:].split(less_than, 1)[0] 372 length += len(line) 373 elif end_entry_marker in line: 374 end_offset = handle.tell() - len(line) \ 375 + line.find(end_entry_marker) + 8 376 break 377 elif marker_re.match(line) or not line: 378 #Start of next record or end of file 379 raise ValueError("Didn't find end of record") 380 else: 381 length += len(line) 382 if not key: 383 raise ValueError("Did not find <accession> line in bytes %i to %i" 384 % (start_offset, end_offset)) 385 yield _bytes_to_string(key), start_offset, length 386 #Find start of next record 387 while not marker_re.match(line) and line: 388 start_offset = handle.tell() 389 line = handle.readline() 390 assert not line, repr(line)
391
392 - def get_raw(self, offset):
393 """Similar to the get method, but returns the record as a raw string.""" 394 handle = self._handle 395 marker_re = self._marker_re 396 end_entry_marker = _as_bytes("</entry>") 397 handle.seek(offset) 398 data = [handle.readline()] 399 while True: 400 line = handle.readline() 401 i = line.find(end_entry_marker) 402 if i != -1: 403 data.append(line[:i + 8]) 404 break 405 if marker_re.match(line) or not line: 406 #End of file, or start of next record 407 raise ValueError("Didn't find end of record") 408 data.append(line) 409 return _as_bytes("").join(data)
410
411 - def get(self, offset):
412 #TODO - Can we handle this directly in the parser? 413 #This is a hack - use get_raw for <entry>...</entry> and wrap it with 414 #the apparently required XML header and footer. 415 data = """<?xml version='1.0' encoding='UTF-8'?> 416 <uniprot xmlns="http://uniprot.org/uniprot" 417 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 418 xsi:schemaLocation="http://uniprot.org/uniprot 419 http://www.uniprot.org/support/docs/uniprot.xsd"> 420 %s 421 </uniprot> 422 """ % _bytes_to_string(self.get_raw(offset)) 423 #TODO - For consistency, this function should not accept a string: 424 return SeqIO.UniprotIO.UniprotIterator(data).next()
425 426
427 -class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
428 """Random access to a IntelliGenetics file."""
429 - def __init__(self, filename, format, alphabet):
430 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 431 self._marker_re = re.compile(_as_bytes("^;"))
432
433 - def __iter__(self):
434 handle = self._handle 435 handle.seek(0) 436 marker_re = self._marker_re 437 semi_char = _as_bytes(";") 438 while True: 439 offset = handle.tell() 440 line = handle.readline() 441 length = len(line) 442 if marker_re.match(line): 443 #Now look for the first line which doesn't start ";" 444 while True: 445 line = handle.readline() 446 if line[0:1] != semi_char and line.strip(): 447 key = line.split()[0] 448 yield _bytes_to_string(key), offset, length 449 break 450 if not line: 451 raise ValueError("Premature end of file?") 452 length += len(line) 453 elif not line: 454 #End of file 455 break
456
457 - def get_raw(self, offset):
458 handle = self._handle 459 handle.seek(offset) 460 marker_re = self._marker_re 461 lines = [] 462 line = handle.readline() 463 semi_char = _as_bytes(";") 464 while line.startswith(semi_char): 465 lines.append(line) 466 line = handle.readline() 467 while line and not line.startswith(semi_char): 468 lines.append(line) 469 line = handle.readline() 470 return _as_bytes("").join(lines)
471 472
473 -class TabRandomAccess(SeqFileRandomAccess):
474 """Random access to a simple tabbed file."""
475 - def __iter__(self):
476 handle = self._handle 477 handle.seek(0) 478 tab_char = _as_bytes("\t") 479 while True: 480 start_offset = handle.tell() 481 line = handle.readline() 482 if not line: 483 break # End of file 484 try: 485 key = line.split(tab_char)[0] 486 except ValueError, err: 487 if not line.strip(): 488 #Ignore blank lines 489 continue 490 else: 491 raise err 492 else: 493 yield _bytes_to_string(key), start_offset, len(line)
494
495 - def get_raw(self, offset):
496 """Like the get method, but returns the record as a raw string.""" 497 handle = self._handle 498 handle.seek(offset) 499 return handle.readline()
500 501 502 ########################## 503 # Now the FASTQ indexers # 504 ########################## 505
506 -class FastqRandomAccess(SeqFileRandomAccess):
507 """Random access to a FASTQ file (any supported variant). 508 509 With FASTQ the records all start with a "@" line, but so can quality lines. 510 Note this will cope with line-wrapped FASTQ files. 511 """
512 - def __iter__(self):
513 handle = self._handle 514 handle.seek(0) 515 id = None 516 start_offset = handle.tell() 517 line = handle.readline() 518 if not line: 519 #Empty file! 520 return 521 at_char = _as_bytes("@") 522 plus_char = _as_bytes("+") 523 if line[0:1] != at_char: 524 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line)) 525 while line: 526 #assert line[0]=="@" 527 #This record seems OK (so far) 528 id = line[1:].rstrip().split(None, 1)[0] 529 #Find the seq line(s) 530 seq_len = 0 531 length = len(line) 532 while line: 533 line = handle.readline() 534 length += len(line) 535 if line.startswith(plus_char): 536 break 537 seq_len += len(line.strip()) 538 if not line: 539 raise ValueError("Premature end of file in seq section") 540 #assert line[0]=="+" 541 #Find the qual line(s) 542 qual_len = 0 543 while line: 544 if seq_len == qual_len: 545 #Should be end of record... 546 end_offset = handle.tell() 547 line = handle.readline() 548 if line and line[0:1] != at_char: 549 ValueError("Problem with line %s" % repr(line)) 550 break 551 else: 552 line = handle.readline() 553 qual_len += len(line.strip()) 554 length += len(line) 555 if seq_len != qual_len: 556 raise ValueError("Problem with quality section") 557 yield _bytes_to_string(id), start_offset, length 558 start_offset = end_offset
559 #print "EOF" 560
561 - def get_raw(self, offset):
562 """Similar to the get method, but returns the record as a raw string.""" 563 #TODO - Refactor this and the __init__ method to reduce code duplication? 564 handle = self._handle 565 handle.seek(offset) 566 line = handle.readline() 567 data = line 568 at_char = _as_bytes("@") 569 plus_char = _as_bytes("+") 570 if line[0:1] != at_char: 571 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line)) 572 #Find the seq line(s) 573 seq_len = 0 574 while line: 575 line = handle.readline() 576 data += line 577 if line.startswith(plus_char): 578 break 579 seq_len += len(line.strip()) 580 if not line: 581 raise ValueError("Premature end of file in seq section") 582 assert line[0:1] == plus_char 583 #Find the qual line(s) 584 qual_len = 0 585 while line: 586 if seq_len == qual_len: 587 #Should be end of record... 588 line = handle.readline() 589 if line and line[0:1] != at_char: 590 ValueError("Problem with line %s" % repr(line)) 591 break 592 else: 593 line = handle.readline() 594 data += line 595 qual_len += len(line.strip()) 596 if seq_len != qual_len: 597 raise ValueError("Problem with quality section") 598 return data
599 600 601 ############################################################################### 602 603 _FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess, 604 "embl": EmblRandomAccess, 605 "fasta": SequentialSeqFileRandomAccess, 606 "fastq": FastqRandomAccess, # Class handles all three variants 607 "fastq-sanger": FastqRandomAccess, # alias of the above 608 "fastq-solexa": FastqRandomAccess, 609 "fastq-illumina": FastqRandomAccess, 610 "genbank": GenBankRandomAccess, 611 "gb": GenBankRandomAccess, # alias of the above 612 "ig": IntelliGeneticsRandomAccess, 613 "imgt": EmblRandomAccess, 614 "phd": SequentialSeqFileRandomAccess, 615 "pir": SequentialSeqFileRandomAccess, 616 "sff": SffRandomAccess, 617 "sff-trim": SffTrimedRandomAccess, 618 "swiss": SwissRandomAccess, 619 "tab": TabRandomAccess, 620 "qual": SequentialSeqFileRandomAccess, 621 "uniprot-xml": UniprotRandomAccess, 622 } 623