Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23   
  24  Classes: 
  25  LowQualityBlastError     Except that indicates low quality query sequences. 
  26  BlastParser              Parses output from blast. 
  27  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  28  PSIBlastParser           Parses output from psi-blast. 
  29  Iterator                 Iterates over a file of blast results. 
  30   
  31  _Scanner                 Scans output from standalone BLAST. 
  32  _BlastConsumer           Consumes output from blast. 
  33  _PSIBlastConsumer        Consumes output from psi-blast. 
  34  _HeaderConsumer          Consumes header information. 
  35  _DescriptionConsumer     Consumes description information. 
  36  _AlignmentConsumer       Consumes alignment information. 
  37  _HSPConsumer             Consumes hsp information. 
  38  _DatabaseReportConsumer  Consumes database report information. 
  39  _ParametersConsumer      Consumes parameters information. 
  40   
  41  """ 
  42   
  43  from __future__ import print_function 
  44   
  45  from Bio import BiopythonDeprecationWarning 
  46  import warnings 
  47  warnings.warn("This module has been deprecated. Consider Bio.SearchIO for " 
  48                "parsing BLAST output instead.", BiopythonDeprecationWarning) 
  49   
  50  import os 
  51  import re 
  52  from Bio._py3k import StringIO 
  53   
  54  from Bio import File 
  55  from Bio.ParserSupport import * 
  56  from Bio.Blast import Record 
  57  from Bio.Application import _escape_filename 
  58   
  59   
60 -class LowQualityBlastError(Exception):
61 """Error caused by running a low quality sequence through BLAST. 62 63 When low quality sequences (like GenBank entries containing only 64 stretches of a single nucleotide) are BLASTed, they will result in 65 BLAST generating an error and not being able to perform the BLAST. 66 search. This error should be raised for the BLAST reports produced 67 in this case. 68 """ 69 pass
70 71
72 -class ShortQueryBlastError(Exception):
73 """Error caused by running a short query sequence through BLAST. 74 75 If the query sequence is too short, BLAST outputs warnings and errors: 76 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 77 [blastall] ERROR: [000.000] AT1G08320: Blast: 78 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 79 done 80 81 This exception is raised when that condition is detected. 82 83 """ 84 pass
85 86
87 -class _Scanner(object):
88 """Scan BLAST output from blastall or blastpgp. 89 90 Tested with blastall and blastpgp v2.0.10, v2.0.11 91 92 Methods: 93 feed Feed data into the scanner. 94 """
95 - def feed(self, handle, consumer):
96 """S.feed(handle, consumer) 97 98 Feed in a BLAST report for scanning. handle is a file-like 99 object that contains the BLAST report. consumer is a Consumer 100 object that will receive events as the report is scanned. 101 102 """ 103 if isinstance(handle, File.UndoHandle): 104 uhandle = handle 105 else: 106 uhandle = File.UndoHandle(handle) 107 108 # Try to fast-forward to the beginning of the blast report. 109 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 110 # Now scan the BLAST report. 111 self._scan_header(uhandle, consumer) 112 self._scan_rounds(uhandle, consumer) 113 self._scan_database_report(uhandle, consumer) 114 self._scan_parameters(uhandle, consumer)
115
116 - def _scan_header(self, uhandle, consumer):
117 # BLASTP 2.0.10 [Aug-26-1999] 118 # 119 # 120 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 121 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 122 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 123 # programs", Nucleic Acids Res. 25:3389-3402. 124 # 125 # Query= test 126 # (140 letters) 127 # 128 # Database: sdqib40-1.35.seg.fa 129 # 1323 sequences; 223,339 total letters 130 # 131 # ======================================================== 132 # This next example is from the online version of Blast, 133 # note there are TWO references, an RID line, and also 134 # the database is BEFORE the query line. 135 # Note there possibleuse of non-ASCII in the author names. 136 # ======================================================== 137 # 138 # BLASTP 2.2.15 [Oct-15-2006] 139 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 140 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 141 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 142 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 143 # 144 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 145 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 146 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 147 # protein database searches with composition-based statistics 148 # and other refinements", Nucleic Acids Res. 29:2994-3005. 149 # 150 # RID: 1166022616-19998-65316425856.BLASTQ1 151 # 152 # 153 # Database: All non-redundant GenBank CDS 154 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 155 # 4,254,166 sequences; 1,462,033,012 total letters 156 # Query= gi:16127998 157 # Length=428 158 # 159 160 consumer.start_header() 161 162 read_and_call(uhandle, consumer.version, contains='BLAST') 163 read_and_call_while(uhandle, consumer.noevent, blank=1) 164 165 # There might be a <pre> line, for qblast output. 166 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 167 168 # Read the reference(s) 169 while attempt_read_and_call(uhandle, 170 consumer.reference, start='Reference'): 171 # References are normally multiline terminated by a blank line 172 # (or, based on the old code, the RID line) 173 while True: 174 line = uhandle.readline() 175 if is_blank_line(line): 176 consumer.noevent(line) 177 break 178 elif line.startswith("RID"): 179 break 180 else: 181 #More of the reference 182 consumer.reference(line) 183 184 #Deal with the optional RID: ... 185 read_and_call_while(uhandle, consumer.noevent, blank=1) 186 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 187 read_and_call_while(uhandle, consumer.noevent, blank=1) 188 189 # blastpgp may have a reference for compositional score matrix 190 # adjustment (see Bug 2502): 191 if attempt_read_and_call( 192 uhandle, consumer.reference, start="Reference"): 193 read_and_call_until(uhandle, consumer.reference, blank=1) 194 read_and_call_while(uhandle, consumer.noevent, blank=1) 195 196 # blastpgp has a Reference for composition-based statistics. 197 if attempt_read_and_call( 198 uhandle, consumer.reference, start="Reference"): 199 read_and_call_until(uhandle, consumer.reference, blank=1) 200 read_and_call_while(uhandle, consumer.noevent, blank=1) 201 202 line = uhandle.peekline() 203 assert line.strip() != "" 204 assert not line.startswith("RID:") 205 if line.startswith("Query="): 206 #This is an old style query then database... 207 208 # Read the Query lines and the following blank line. 209 read_and_call(uhandle, consumer.query_info, start='Query=') 210 read_and_call_until(uhandle, consumer.query_info, blank=1) 211 read_and_call_while(uhandle, consumer.noevent, blank=1) 212 213 # Read the database lines and the following blank line. 214 read_and_call_until(uhandle, consumer.database_info, end='total letters') 215 read_and_call(uhandle, consumer.database_info, contains='sequences') 216 read_and_call_while(uhandle, consumer.noevent, blank=1) 217 elif line.startswith("Database:"): 218 #This is a new style database then query... 219 read_and_call_until(uhandle, consumer.database_info, end='total letters') 220 read_and_call(uhandle, consumer.database_info, contains='sequences') 221 read_and_call_while(uhandle, consumer.noevent, blank=1) 222 223 # Read the Query lines and the following blank line. 224 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 225 # the "... Score E" line instead. 226 read_and_call(uhandle, consumer.query_info, start='Query=') 227 # BLAST 2.2.25+ has a blank line before Length= 228 read_and_call_until(uhandle, consumer.query_info, start='Length=') 229 while True: 230 line = uhandle.peekline() 231 if not line.strip() or "Score E" in line: 232 break 233 #It is more of the query (and its length) 234 read_and_call(uhandle, consumer.query_info) 235 read_and_call_while(uhandle, consumer.noevent, blank=1) 236 else: 237 raise ValueError("Invalid header?") 238 239 consumer.end_header()
240
241 - def _scan_rounds(self, uhandle, consumer):
242 # Scan a bunch of rounds. 243 # Each round begins with either a "Searching......" line 244 # or a 'Score E' line followed by descriptions and alignments. 245 # The email server doesn't give the "Searching....." line. 246 # If there is no 'Searching.....' line then you'll first see a 247 # 'Results from round' line 248 249 while not self._eof(uhandle): 250 line = safe_peekline(uhandle) 251 if not line.startswith('Searching') and \ 252 not line.startswith('Results from round') and \ 253 re.search(r"Score +E", line) is None and \ 254 'No hits found' not in line: 255 break 256 self._scan_descriptions(uhandle, consumer) 257 self._scan_alignments(uhandle, consumer)
258
259 - def _scan_descriptions(self, uhandle, consumer):
260 # Searching..................................................done 261 # Results from round 2 262 # 263 # 264 # Sc 265 # Sequences producing significant alignments: (b 266 # Sequences used in model and found again: 267 # 268 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 269 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 270 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 271 # 272 # Sequences not found previously or not previously below threshold: 273 # 274 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 275 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 276 # 277 278 # If PSI-BLAST, may also have: 279 # 280 # CONVERGED! 281 282 consumer.start_descriptions() 283 284 # Read 'Searching' 285 # This line seems to be missing in BLASTN 2.1.2 (others?) 286 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 287 288 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 289 # If this happens, the handle will yield no more information. 290 if not uhandle.peekline(): 291 raise ValueError("Unexpected end of blast report. " + 292 "Looks suspiciously like a PSI-BLAST crash.") 293 294 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 295 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 296 # [blastall] ERROR: [000.000] AT1G08320: Blast: 297 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 298 # done 299 # Reported by David Weisman. 300 # Check for these error lines and ignore them for now. Let 301 # the BlastErrorParser deal with them. 302 line = uhandle.peekline() 303 if "ERROR:" in line or line.startswith("done"): 304 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 305 read_and_call(uhandle, consumer.noevent, start="done") 306 307 # Check to see if this is PSI-BLAST. 308 # If it is, the 'Searching' line will be followed by: 309 # (version 2.0.10) 310 # Searching............................. 311 # Results from round 2 312 # or (version 2.0.11) 313 # Searching............................. 314 # 315 # 316 # Results from round 2 317 318 # Skip a bunch of blank lines. 319 read_and_call_while(uhandle, consumer.noevent, blank=1) 320 # Check for the results line if it's there. 321 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 322 read_and_call_while(uhandle, consumer.noevent, blank=1) 323 324 # Three things can happen here: 325 # 1. line contains 'Score E' 326 # 2. line contains "No hits found" 327 # 3. no descriptions 328 # The first one begins a bunch of descriptions. The last two 329 # indicates that no descriptions follow, and we should go straight 330 # to the alignments. 331 if not attempt_read_and_call( 332 uhandle, consumer.description_header, 333 has_re=re.compile(r'Score +E')): 334 # Either case 2 or 3. Look for "No hits found". 335 attempt_read_and_call(uhandle, consumer.no_hits, 336 contains='No hits found') 337 try: 338 read_and_call_while(uhandle, consumer.noevent, blank=1) 339 except ValueError as err: 340 if str(err) != "Unexpected end of stream.": 341 raise err 342 343 consumer.end_descriptions() 344 # Stop processing. 345 return 346 347 # Read the score header lines 348 read_and_call(uhandle, consumer.description_header, 349 start='Sequences producing') 350 351 # If PSI-BLAST, read the 'Sequences used in model' line. 352 attempt_read_and_call(uhandle, consumer.model_sequences, 353 start='Sequences used in model') 354 read_and_call_while(uhandle, consumer.noevent, blank=1) 355 356 # In BLAT, rather than a "No hits found" line, we just 357 # get no descriptions (and no alignments). This can be 358 # spotted because the next line is the database block: 359 if safe_peekline(uhandle).startswith(" Database:"): 360 consumer.end_descriptions() 361 # Stop processing. 362 return 363 364 # Read the descriptions and the following blank lines, making 365 # sure that there are descriptions. 366 if not uhandle.peekline().startswith('Sequences not found'): 367 read_and_call_until(uhandle, consumer.description, blank=1) 368 read_and_call_while(uhandle, consumer.noevent, blank=1) 369 370 # If PSI-BLAST, read the 'Sequences not found' line followed 371 # by more descriptions. However, I need to watch out for the 372 # case where there were no sequences not found previously, in 373 # which case there will be no more descriptions. 374 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 375 start='Sequences not found'): 376 # Read the descriptions and the following blank lines. 377 read_and_call_while(uhandle, consumer.noevent, blank=1) 378 l = safe_peekline(uhandle) 379 # Brad -- added check for QUERY. On some PSI-BLAST outputs 380 # there will be a 'Sequences not found' line followed by no 381 # descriptions. Check for this case since the first thing you'll 382 # get is a blank line and then 'QUERY' 383 if not l.startswith('CONVERGED') and l[0] != '>' \ 384 and not l.startswith('QUERY'): 385 read_and_call_until(uhandle, consumer.description, blank=1) 386 read_and_call_while(uhandle, consumer.noevent, blank=1) 387 388 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 389 read_and_call_while(uhandle, consumer.noevent, blank=1) 390 391 consumer.end_descriptions()
392
393 - def _scan_alignments(self, uhandle, consumer):
394 if self._eof(uhandle): 395 return 396 397 # qblast inserts a helpful line here. 398 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 399 400 # First, check to see if I'm at the database report. 401 line = safe_peekline(uhandle) 402 if not line: 403 #EOF 404 return 405 elif line.startswith(' Database') or line.startswith("Lambda"): 406 return 407 elif line[0] == '>': 408 # XXX make a better check here between pairwise and masterslave 409 self._scan_pairwise_alignments(uhandle, consumer) 410 elif line.startswith('Effective'): 411 return 412 else: 413 # XXX put in a check to make sure I'm in a masterslave alignment 414 self._scan_masterslave_alignment(uhandle, consumer)
415
416 - def _scan_pairwise_alignments(self, uhandle, consumer):
417 while not self._eof(uhandle): 418 line = safe_peekline(uhandle) 419 if line[0] != '>': 420 break 421 self._scan_one_pairwise_alignment(uhandle, consumer)
422
423 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
424 if self._eof(uhandle): 425 return 426 consumer.start_alignment() 427 428 self._scan_alignment_header(uhandle, consumer) 429 430 # Scan a bunch of score/alignment pairs. 431 while True: 432 if self._eof(uhandle): 433 #Shouldn't have issued that _scan_alignment_header event... 434 break 435 line = safe_peekline(uhandle) 436 if not line.startswith(' Score'): 437 break 438 self._scan_hsp(uhandle, consumer) 439 consumer.end_alignment()
440
441 - def _scan_alignment_header(self, uhandle, consumer):
442 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 443 # stearothermophilus] 444 # Length = 81 445 # 446 # Or, more recently with different white space: 447 # 448 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 449 # gi|15829258|ref|NP_308031.1| threonine synthase 450 # ... 451 # Length=428 452 read_and_call(uhandle, consumer.title, start='>') 453 while True: 454 line = safe_readline(uhandle) 455 if line.lstrip().startswith('Length =') \ 456 or line.lstrip().startswith('Length='): 457 consumer.length(line) 458 break 459 elif is_blank_line(line): 460 # Check to make sure I haven't missed the Length line 461 raise ValueError("I missed the Length in an alignment header") 462 consumer.title(line) 463 464 # Older versions of BLAST will have a line with some spaces. 465 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 466 if not attempt_read_and_call(uhandle, consumer.noevent, 467 start=' '): 468 read_and_call(uhandle, consumer.noevent, blank=1)
469
470 - def _scan_hsp(self, uhandle, consumer):
471 consumer.start_hsp() 472 self._scan_hsp_header(uhandle, consumer) 473 self._scan_hsp_alignment(uhandle, consumer) 474 consumer.end_hsp()
475
476 - def _scan_hsp_header(self, uhandle, consumer):
477 # Score = 22.7 bits (47), Expect = 2.5 478 # Identities = 10/36 (27%), Positives = 18/36 (49%) 479 # Strand = Plus / Plus 480 # Frame = +3 481 # 482 483 read_and_call(uhandle, consumer.score, start=' Score') 484 read_and_call(uhandle, consumer.identities, start=' Identities') 485 # BLASTN 486 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 487 # BLASTX, TBLASTN, TBLASTX 488 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 489 read_and_call(uhandle, consumer.noevent, blank=1)
490
491 - def _scan_hsp_alignment(self, uhandle, consumer):
492 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 493 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 494 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 495 # 496 # Query: 64 AEKILIKR 71 497 # I +K 498 # Sbjct: 70 PNIIQLKD 77 499 # 500 501 while True: 502 # Blastn adds an extra line filled with spaces before Query 503 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 504 read_and_call(uhandle, consumer.query, start='Query') 505 read_and_call(uhandle, consumer.align, start=' ') 506 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 507 try: 508 read_and_call_while(uhandle, consumer.noevent, blank=1) 509 except ValueError as err: 510 if str(err) != "Unexpected end of stream.": 511 raise err 512 # End of File (well, it looks like it with recent versions 513 # of BLAST for multiple queries after the Iterator class 514 # has broken up the whole file into chunks). 515 break 516 line = safe_peekline(uhandle) 517 # Alignment continues if I see a 'Query' or the spaces for Blastn. 518 if not (line.startswith('Query') or line.startswith(' ')): 519 break
520
521 - def _scan_masterslave_alignment(self, uhandle, consumer):
522 consumer.start_alignment() 523 while True: 524 line = safe_readline(uhandle) 525 # Check to see whether I'm finished reading the alignment. 526 # This is indicated by 1) database section, 2) next psi-blast 527 # round, which can also be a 'Results from round' if no 528 # searching line is present 529 # patch by chapmanb 530 if line.startswith('Searching') or \ 531 line.startswith('Results from round'): 532 uhandle.saveline(line) 533 break 534 elif line.startswith(' Database'): 535 uhandle.saveline(line) 536 break 537 elif is_blank_line(line): 538 consumer.noevent(line) 539 else: 540 consumer.multalign(line) 541 read_and_call_while(uhandle, consumer.noevent, blank=1) 542 consumer.end_alignment()
543
544 - def _eof(self, uhandle):
545 try: 546 line = safe_peekline(uhandle) 547 except ValueError as err: 548 if str(err) != "Unexpected end of stream.": 549 raise err 550 line = "" 551 return not line
552
553 - def _scan_database_report(self, uhandle, consumer):
554 # Database: sdqib40-1.35.seg.fa 555 # Posted date: Nov 1, 1999 4:25 PM 556 # Number of letters in database: 223,339 557 # Number of sequences in database: 1323 558 # 559 # Lambda K H 560 # 0.322 0.133 0.369 561 # 562 # Gapped 563 # Lambda K H 564 # 0.270 0.0470 0.230 565 # 566 ########################################## 567 # Or, more recently Blast 2.2.15 gives less blank lines 568 ########################################## 569 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 570 # environmental samples 571 # Posted date: Dec 12, 2006 5:51 PM 572 # Number of letters in database: 667,088,753 573 # Number of sequences in database: 2,094,974 574 # Lambda K H 575 # 0.319 0.136 0.395 576 # Gapped 577 # Lambda K H 578 # 0.267 0.0410 0.140 579 580 if self._eof(uhandle): 581 return 582 583 consumer.start_database_report() 584 585 # Subset of the database(s) listed below 586 # Number of letters searched: 562,618,960 587 # Number of sequences searched: 228,924 588 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 589 read_and_call(uhandle, consumer.noevent, contains="letters") 590 read_and_call(uhandle, consumer.noevent, contains="sequences") 591 read_and_call(uhandle, consumer.noevent, start=" ") 592 593 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 594 # was missing the "Database" stanza completely. 595 while attempt_read_and_call(uhandle, consumer.database, 596 start=' Database'): 597 # BLAT output ends abruptly here, without any of the other 598 # information. Check to see if this is the case. If so, 599 # then end the database report here gracefully. 600 if not uhandle.peekline().strip() \ 601 or uhandle.peekline().startswith("BLAST"): 602 consumer.end_database_report() 603 return 604 605 # Database can span multiple lines. 606 read_and_call_until(uhandle, consumer.database, start=' Posted') 607 read_and_call(uhandle, consumer.posted_date, start=' Posted') 608 read_and_call(uhandle, consumer.num_letters_in_database, 609 start=' Number of letters') 610 read_and_call(uhandle, consumer.num_sequences_in_database, 611 start=' Number of sequences') 612 #There may not be a line starting with spaces... 613 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 614 615 line = safe_readline(uhandle) 616 uhandle.saveline(line) 617 if 'Lambda' in line: 618 break 619 620 try: 621 read_and_call(uhandle, consumer.noevent, start='Lambda') 622 read_and_call(uhandle, consumer.ka_params) 623 except: 624 pass 625 626 #This blank line is optional: 627 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 628 629 # not BLASTP 630 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 631 # not TBLASTX 632 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 633 read_and_call(uhandle, consumer.ka_params_gap) 634 635 # Blast 2.2.4 can sometimes skip the whole parameter section. 636 # Thus, I need to be careful not to read past the end of the 637 # file. 638 try: 639 read_and_call_while(uhandle, consumer.noevent, blank=1) 640 except ValueError as x: 641 if str(x) != "Unexpected end of stream.": 642 raise 643 consumer.end_database_report()
644
645 - def _scan_parameters(self, uhandle, consumer):
646 # Matrix: BLOSUM62 647 # Gap Penalties: Existence: 11, Extension: 1 648 # Number of Hits to DB: 50604 649 # Number of Sequences: 1323 650 # Number of extensions: 1526 651 # Number of successful extensions: 6 652 # Number of sequences better than 10.0: 5 653 # Number of HSP's better than 10.0 without gapping: 5 654 # Number of HSP's successfully gapped in prelim test: 0 655 # Number of HSP's that attempted gapping in prelim test: 1 656 # Number of HSP's gapped (non-prelim): 5 657 # length of query: 140 658 # length of database: 223,339 659 # effective HSP length: 39 660 # effective length of query: 101 661 # effective length of database: 171,742 662 # effective search space: 17345942 663 # effective search space used: 17345942 664 # T: 11 665 # A: 40 666 # X1: 16 ( 7.4 bits) 667 # X2: 38 (14.8 bits) 668 # X3: 64 (24.9 bits) 669 # S1: 41 (21.9 bits) 670 # S2: 42 (20.8 bits) 671 ########################################## 672 # Or, more recently Blast(x) 2.2.15 gives 673 ########################################## 674 # Matrix: BLOSUM62 675 # Gap Penalties: Existence: 11, Extension: 1 676 # Number of Sequences: 4535438 677 # Number of Hits to DB: 2,588,844,100 678 # Number of extensions: 60427286 679 # Number of successful extensions: 126433 680 # Number of sequences better than 2.0: 30 681 # Number of HSP's gapped: 126387 682 # Number of HSP's successfully gapped: 35 683 # Length of query: 291 684 # Length of database: 1,573,298,872 685 # Length adjustment: 130 686 # Effective length of query: 161 687 # Effective length of database: 983,691,932 688 # Effective search space: 158374401052 689 # Effective search space used: 158374401052 690 # Neighboring words threshold: 12 691 # Window for multiple hits: 40 692 # X1: 16 ( 7.3 bits) 693 # X2: 38 (14.6 bits) 694 # X3: 64 (24.7 bits) 695 # S1: 41 (21.7 bits) 696 # S2: 32 (16.9 bits) 697 698 # Blast 2.2.4 can sometimes skip the whole parameter section. 699 # BLAT also skips the whole parameter section. 700 # Thus, check to make sure that the parameter section really 701 # exists. 702 if not uhandle.peekline().strip(): 703 return 704 705 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 706 # "Number of Sequences" lines. 707 consumer.start_parameters() 708 709 # Matrix line may be missing in BLASTN 2.2.9 710 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 711 # not TBLASTX 712 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 713 714 attempt_read_and_call(uhandle, consumer.num_sequences, 715 start='Number of Sequences') 716 attempt_read_and_call(uhandle, consumer.num_hits, 717 start='Number of Hits') 718 attempt_read_and_call(uhandle, consumer.num_sequences, 719 start='Number of Sequences') 720 attempt_read_and_call(uhandle, consumer.num_extends, 721 start='Number of extensions') 722 attempt_read_and_call(uhandle, consumer.num_good_extends, 723 start='Number of successful') 724 725 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 726 start='Number of sequences') 727 728 # not BLASTN, TBLASTX 729 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 730 start="Number of HSP's better"): 731 # BLASTN 2.2.9 732 if attempt_read_and_call(uhandle, consumer.noevent, 733 start="Number of HSP's gapped:"): 734 read_and_call(uhandle, consumer.noevent, 735 start="Number of HSP's successfully") 736 #This is omitted in 2.2.15 737 attempt_read_and_call(uhandle, consumer.noevent, 738 start="Number of extra gapped extensions") 739 else: 740 read_and_call(uhandle, consumer.hsps_prelim_gapped, 741 start="Number of HSP's successfully") 742 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 743 start="Number of HSP's that") 744 read_and_call(uhandle, consumer.hsps_gapped, 745 start="Number of HSP's gapped") 746 #e.g. BLASTX 2.2.15 where the "better" line is missing 747 elif attempt_read_and_call(uhandle, consumer.noevent, 748 start="Number of HSP's gapped"): 749 read_and_call(uhandle, consumer.noevent, 750 start="Number of HSP's successfully") 751 752 # not in blastx 2.2.1 753 attempt_read_and_call(uhandle, consumer.query_length, 754 has_re=re.compile(r"[Ll]ength of query")) 755 # Not in BLASTX 2.2.22+ 756 attempt_read_and_call(uhandle, consumer.database_length, 757 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 758 759 # BLASTN 2.2.9 760 attempt_read_and_call(uhandle, consumer.noevent, 761 start="Length adjustment") 762 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 763 start='effective HSP') 764 # Not in blastx 2.2.1 765 attempt_read_and_call( 766 uhandle, consumer.effective_query_length, 767 has_re=re.compile(r'[Ee]ffective length of query')) 768 769 # This is not in BLASTP 2.2.15 770 attempt_read_and_call( 771 uhandle, consumer.effective_database_length, 772 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 773 # Not in blastx 2.2.1, added a ':' to distinguish between 774 # this and the 'effective search space used' line 775 attempt_read_and_call( 776 uhandle, consumer.effective_search_space, 777 has_re=re.compile(r'[Ee]ffective search space:')) 778 # Does not appear in BLASTP 2.0.5 779 attempt_read_and_call( 780 uhandle, consumer.effective_search_space_used, 781 has_re=re.compile(r'[Ee]ffective search space used')) 782 783 # BLASTX, TBLASTN, TBLASTX 784 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 785 786 # not in BLASTN 2.2.9 787 attempt_read_and_call(uhandle, consumer.threshold, start='T') 788 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 789 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 790 791 # not in BLASTX 2.2.15 792 attempt_read_and_call(uhandle, consumer.window_size, start='A') 793 # get this instead: "Window for multiple hits: 40" 794 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 795 796 # not in BLASTX 2.2.22+ 797 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 798 # not TBLASTN 799 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 800 801 # not BLASTN, TBLASTX 802 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 803 start='X3') 804 805 # not TBLASTN 806 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 807 # not in blastx 2.2.1 808 # first we make sure we have additional lines to work with, if 809 # not then the file is done and we don't have a final S2 810 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 811 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 812 813 consumer.end_parameters()
814 815
816 -class BlastParser(AbstractParser):
817 """Parses BLAST data into a Record.Blast object. 818 819 """
820 - def __init__(self):
821 """__init__(self)""" 822 self._scanner = _Scanner() 823 self._consumer = _BlastConsumer()
824
825 - def parse(self, handle):
826 """parse(self, handle)""" 827 self._scanner.feed(handle, self._consumer) 828 return self._consumer.data
829 830
831 -class PSIBlastParser(AbstractParser):
832 """Parses BLAST data into a Record.PSIBlast object. 833 834 """
835 - def __init__(self):
836 """__init__(self)""" 837 self._scanner = _Scanner() 838 self._consumer = _PSIBlastConsumer()
839
840 - def parse(self, handle):
841 """parse(self, handle)""" 842 self._scanner.feed(handle, self._consumer) 843 return self._consumer.data
844 845
846 -class _HeaderConsumer(object):
847 - def start_header(self):
848 self._header = Record.Header()
849
850 - def version(self, line):
851 c = line.split() 852 self._header.application = c[0] 853 self._header.version = c[1] 854 if len(c) > 2: 855 #The date is missing in the new C++ output from blastx 2.2.22+ 856 #Just get "BLASTX 2.2.22+\n" and that's all. 857 self._header.date = c[2][1:-1]
858
859 - def reference(self, line):
860 if line.startswith('Reference: '): 861 self._header.reference = line[11:] 862 else: 863 self._header.reference = self._header.reference + line
864
865 - def query_info(self, line):
866 if line.startswith('Query= '): 867 self._header.query = line[7:].lstrip() 868 elif line.startswith('Length='): 869 #New style way to give the query length in BLAST 2.2.22+ (the C++ code) 870 self._header.query_letters = _safe_int(line[7:].strip()) 871 elif not line.startswith(' '): # continuation of query_info 872 self._header.query = "%s%s" % (self._header.query, line) 873 else: 874 #Hope it is the old style way to give the query length: 875 letters, = _re_search( 876 r"([0-9,]+) letters", line, 877 "I could not find the number of letters in line\n%s" % line) 878 self._header.query_letters = _safe_int(letters)
879
880 - def database_info(self, line):
881 line = line.rstrip() 882 if line.startswith('Database: '): 883 self._header.database = line[10:] 884 elif not line.endswith('total letters'): 885 if self._header.database: 886 #Need to include a space when merging multi line datase descr 887 self._header.database = self._header.database + " " + line.strip() 888 else: 889 self._header.database = line.strip() 890 else: 891 sequences, letters = _re_search( 892 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 893 "I could not find the sequences and letters in line\n%s" %line) 894 self._header.database_sequences = _safe_int(sequences) 895 self._header.database_letters = _safe_int(letters)
896
897 - def end_header(self):
898 # Get rid of the trailing newlines 899 self._header.reference = self._header.reference.rstrip() 900 self._header.query = self._header.query.rstrip()
901 902
903 -class _DescriptionConsumer(object):
904 - def start_descriptions(self):
905 self._descriptions = [] 906 self._model_sequences = [] 907 self._nonmodel_sequences = [] 908 self._converged = 0 909 self._type = None 910 self._roundnum = None 911 912 self.__has_n = 0 # Does the description line contain an N value?
913
914 - def description_header(self, line):
915 if line.startswith('Sequences producing'): 916 cols = line.split() 917 if cols[-1] == 'N': 918 self.__has_n = 1
919
920 - def description(self, line):
921 dh = self._parse(line) 922 if self._type == 'model': 923 self._model_sequences.append(dh) 924 elif self._type == 'nonmodel': 925 self._nonmodel_sequences.append(dh) 926 else: 927 self._descriptions.append(dh)
928
929 - def model_sequences(self, line):
930 self._type = 'model'
931
932 - def nonmodel_sequences(self, line):
933 self._type = 'nonmodel'
934
935 - def converged(self, line):
936 self._converged = 1
937
938 - def no_hits(self, line):
939 pass
940
941 - def round(self, line):
942 if not line.startswith('Results from round'): 943 raise ValueError("I didn't understand the round line\n%s" % line) 944 self._roundnum = _safe_int(line[18:].strip())
945
946 - def end_descriptions(self):
947 pass
948
949 - def _parse(self, description_line):
950 line = description_line # for convenience 951 dh = Record.Description() 952 953 # I need to separate the score and p-value from the title. 954 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 955 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 956 # special cases to handle: 957 # - title must be preserved exactly (including whitespaces) 958 # - score could be equal to e-value (not likely, but what if??) 959 # - sometimes there's an "N" score of '1'. 960 cols = line.split() 961 if len(cols) < 3: 962 raise ValueError( 963 "Line does not appear to contain description:\n%s" % line) 964 if self.__has_n: 965 i = line.rfind(cols[-1]) # find start of N 966 i = line.rfind(cols[-2], 0, i) # find start of p-value 967 i = line.rfind(cols[-3], 0, i) # find start of score 968 else: 969 i = line.rfind(cols[-1]) # find start of p-value 970 i = line.rfind(cols[-2], 0, i) # find start of score 971 if self.__has_n: 972 dh.title, dh.score, dh.e, dh.num_alignments = \ 973 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 974 else: 975 dh.title, dh.score, dh.e, dh.num_alignments = \ 976 line[:i].rstrip(), cols[-2], cols[-1], 1 977 dh.num_alignments = _safe_int(dh.num_alignments) 978 dh.score = _safe_int(dh.score) 979 dh.e = _safe_float(dh.e) 980 return dh
981 982
983 -class _AlignmentConsumer(object):
984 # This is a little bit tricky. An alignment can either be a 985 # pairwise alignment or a multiple alignment. Since it's difficult 986 # to know a-priori which one the blast record will contain, I'm going 987 # to make one class that can parse both of them.
988 - def start_alignment(self):
989 self._alignment = Record.Alignment() 990 self._multiple_alignment = Record.MultipleAlignment()
991
992 - def title(self, line):
993 if self._alignment.title: 994 self._alignment.title += " " 995 self._alignment.title += line.strip()
996
997 - def length(self, line):
998 #e.g. "Length = 81" or more recently, "Length=428" 999 parts = line.replace(" ", "").split("=") 1000 assert len(parts)==2, "Unrecognised format length line" 1001 self._alignment.length = parts[1] 1002 self._alignment.length = _safe_int(self._alignment.length)
1003
1004 - def multalign(self, line):
1005 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 1006 if line.startswith('QUERY') or line.startswith('blast_tmp'): 1007 # If this is the first line of the multiple alignment, 1008 # then I need to figure out how the line is formatted. 1009 1010 # Format of line is: 1011 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 1012 try: 1013 name, start, seq, end = line.split() 1014 except ValueError: 1015 raise ValueError("I do not understand the line\n%s" % line) 1016 self._start_index = line.index(start, len(name)) 1017 self._seq_index = line.index(seq, 1018 self._start_index+len(start)) 1019 # subtract 1 for the space 1020 self._name_length = self._start_index - 1 1021 self._start_length = self._seq_index - self._start_index - 1 1022 self._seq_length = line.rfind(end) - self._seq_index - 1 1023 1024 #self._seq_index = line.index(seq) 1025 ## subtract 1 for the space 1026 #self._seq_length = line.rfind(end) - self._seq_index - 1 1027 #self._start_index = line.index(start) 1028 #self._start_length = self._seq_index - self._start_index - 1 1029 #self._name_length = self._start_index 1030 1031 # Extract the information from the line 1032 name = line[:self._name_length] 1033 name = name.rstrip() 1034 start = line[self._start_index:self._start_index+self._start_length] 1035 start = start.rstrip() 1036 if start: 1037 start = _safe_int(start) 1038 end = line[self._seq_index+self._seq_length:].rstrip() 1039 if end: 1040 end = _safe_int(end) 1041 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 1042 # right pad the sequence with spaces if necessary 1043 if len(seq) < self._seq_length: 1044 seq = seq + ' '*(self._seq_length-len(seq)) 1045 1046 # I need to make sure the sequence is aligned correctly with the query. 1047 # First, I will find the length of the query. Then, if necessary, 1048 # I will pad my current sequence with spaces so that they will line 1049 # up correctly. 1050 1051 # Two possible things can happen: 1052 # QUERY 1053 # 504 1054 # 1055 # QUERY 1056 # 403 1057 # 1058 # Sequence 504 will need padding at the end. Since I won't know 1059 # this until the end of the alignment, this will be handled in 1060 # end_alignment. 1061 # Sequence 403 will need padding before being added to the alignment. 1062 1063 align = self._multiple_alignment.alignment # for convenience 1064 align.append((name, start, seq, end))
1065 1066 # This is old code that tried to line up all the sequences 1067 # in a multiple alignment by using the sequence title's as 1068 # identifiers. The problem with this is that BLAST assigns 1069 # different HSP's from the same sequence the same id. Thus, 1070 # in one alignment block, there may be multiple sequences with 1071 # the same id. I'm not sure how to handle this, so I'm not 1072 # going to. 1073 1074 # # If the sequence is the query, then just add it. 1075 # if name == 'QUERY': 1076 # if len(align) == 0: 1077 # align.append((name, start, seq)) 1078 # else: 1079 # aname, astart, aseq = align[0] 1080 # if name != aname: 1081 # raise ValueError, "Query is not the first sequence" 1082 # aseq = aseq + seq 1083 # align[0] = aname, astart, aseq 1084 # else: 1085 # if len(align) == 0: 1086 # raise ValueError, "I could not find the query sequence" 1087 # qname, qstart, qseq = align[0] 1088 # 1089 # # Now find my sequence in the multiple alignment. 1090 # for i in range(1, len(align)): 1091 # aname, astart, aseq = align[i] 1092 # if name == aname: 1093 # index = i 1094 # break 1095 # else: 1096 # # If I couldn't find it, then add a new one. 1097 # align.append((None, None, None)) 1098 # index = len(align)-1 1099 # # Make sure to left-pad it. 1100 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1101 # 1102 # if len(qseq) != len(aseq) + len(seq): 1103 # # If my sequences are shorter than the query sequence, 1104 # # then I will need to pad some spaces to make them line up. 1105 # # Since I've already right padded seq, that means aseq 1106 # # must be too short. 1107 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1108 # aseq = aseq + seq 1109 # if astart is None: 1110 # astart = start 1111 # align[index] = aname, astart, aseq 1112
1113 - def end_alignment(self):
1114 # Remove trailing newlines 1115 if self._alignment: 1116 self._alignment.title = self._alignment.title.rstrip() 1117 1118 # This code is also obsolete. See note above. 1119 # If there's a multiple alignment, I will need to make sure 1120 # all the sequences are aligned. That is, I may need to 1121 # right-pad the sequences. 1122 # if self._multiple_alignment is not None: 1123 # align = self._multiple_alignment.alignment 1124 # seqlen = None 1125 # for i in range(len(align)): 1126 # name, start, seq = align[i] 1127 # if seqlen is None: 1128 # seqlen = len(seq) 1129 # else: 1130 # if len(seq) < seqlen: 1131 # seq = seq + ' '*(seqlen - len(seq)) 1132 # align[i] = name, start, seq 1133 # elif len(seq) > seqlen: 1134 # raise ValueError, \ 1135 # "Sequence %s is longer than the query" % name 1136 1137 # Clean up some variables, if they exist. 1138 try: 1139 del self._seq_index 1140 del self._seq_length 1141 del self._start_index 1142 del self._start_length 1143 del self._name_length 1144 except AttributeError: 1145 pass
1146 1147
1148 -class _HSPConsumer(object):
1149 - def start_hsp(self):
1150 self._hsp = Record.HSP()
1151
1152 - def score(self, line):
1153 self._hsp.bits, self._hsp.score = _re_search( 1154 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1155 "I could not find the score in line\n%s" % line) 1156 self._hsp.score = _safe_float(self._hsp.score) 1157 self._hsp.bits = _safe_float(self._hsp.bits) 1158 1159 x, y = _re_search( 1160 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1161 "I could not find the expect in line\n%s" % line) 1162 if x: 1163 self._hsp.num_alignments = _safe_int(x) 1164 else: 1165 self._hsp.num_alignments = 1 1166 self._hsp.expect = _safe_float(y)
1167
1168 - def identities(self, line):
1169 x, y = _re_search( 1170 r"Identities = (\d+)\/(\d+)", line, 1171 "I could not find the identities in line\n%s" % line) 1172 self._hsp.identities = _safe_int(x), _safe_int(y) 1173 self._hsp.align_length = _safe_int(y) 1174 1175 if 'Positives' in line: 1176 x, y = _re_search( 1177 r"Positives = (\d+)\/(\d+)", line, 1178 "I could not find the positives in line\n%s" % line) 1179 self._hsp.positives = _safe_int(x), _safe_int(y) 1180 assert self._hsp.align_length == _safe_int(y) 1181 1182 if 'Gaps' in line: 1183 x, y = _re_search( 1184 r"Gaps = (\d+)\/(\d+)", line, 1185 "I could not find the gaps in line\n%s" % line) 1186 self._hsp.gaps = _safe_int(x), _safe_int(y) 1187 assert self._hsp.align_length == _safe_int(y)
1188
1189 - def strand(self, line):
1190 self._hsp.strand = _re_search( 1191 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1192 "I could not find the strand in line\n%s" % line)
1193
1194 - def frame(self, line):
1195 # Frame can be in formats: 1196 # Frame = +1 1197 # Frame = +2 / +2 1198 if '/' in line: 1199 self._hsp.frame = _re_search( 1200 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1201 "I could not find the frame in line\n%s" % line) 1202 else: 1203 self._hsp.frame = _re_search( 1204 r"Frame = ([-+][123])", line, 1205 "I could not find the frame in line\n%s" % line)
1206 1207 # Match a space, if one is available. Masahir Ishikawa found a 1208 # case where there's no space between the start and the sequence: 1209 # Query: 100tt 101 1210 # line below modified by Yair Benita, Sep 2004 1211 # Note that the colon is not always present. 2006 1212 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1213
1214 - def query(self, line):
1215 m = self._query_re.search(line) 1216 if m is None: 1217 raise ValueError("I could not find the query in line\n%s" % line) 1218 1219 # line below modified by Yair Benita, Sep 2004. 1220 # added the end attribute for the query 1221 colon, start, seq, end = m.groups() 1222 self._hsp.query = self._hsp.query + seq 1223 if self._hsp.query_start is None: 1224 self._hsp.query_start = _safe_int(start) 1225 1226 # line below added by Yair Benita, Sep 2004. 1227 # added the end attribute for the query 1228 self._hsp.query_end = _safe_int(end) 1229 1230 #Get index for sequence start (regular expression element 3) 1231 self._query_start_index = m.start(3) 1232 self._query_len = len(seq)
1233
1234 - def align(self, line):
1235 seq = line[self._query_start_index:].rstrip() 1236 if len(seq) < self._query_len: 1237 # Make sure the alignment is the same length as the query 1238 seq = seq + ' ' * (self._query_len-len(seq)) 1239 elif len(seq) < self._query_len: 1240 raise ValueError("Match is longer than the query in line\n%s" 1241 % line) 1242 self._hsp.match = self._hsp.match + seq
1243 1244 # To match how we do the query, cache the regular expression. 1245 # Note that the colon is not always present. 1246 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1247
1248 - def sbjct(self, line):
1249 m = self._sbjct_re.search(line) 1250 if m is None: 1251 raise ValueError("I could not find the sbjct in line\n%s" % line) 1252 colon, start, seq, end = m.groups() 1253 #mikep 26/9/00 1254 #On occasion, there is a blast hit with no subject match 1255 #so far, it only occurs with 1-line short "matches" 1256 #I have decided to let these pass as they appear 1257 if not seq.strip(): 1258 seq = ' ' * self._query_len 1259 self._hsp.sbjct = self._hsp.sbjct + seq 1260 if self._hsp.sbjct_start is None: 1261 self._hsp.sbjct_start = _safe_int(start) 1262 1263 self._hsp.sbjct_end = _safe_int(end) 1264 if len(seq) != self._query_len: 1265 raise ValueError( 1266 "QUERY and SBJCT sequence lengths don't match in line\n%s" 1267 % line) 1268 1269 del self._query_start_index # clean up unused variables 1270 del self._query_len
1271
1272 - def end_hsp(self):
1273 pass
1274 1275
1276 -class _DatabaseReportConsumer(object):
1277
1278 - def start_database_report(self):
1279 self._dr = Record.DatabaseReport()
1280
1281 - def database(self, line):
1282 m = re.search(r"Database: (.+)$", line) 1283 if m: 1284 self._dr.database_name.append(m.group(1)) 1285 elif self._dr.database_name: 1286 # This must be a continuation of the previous name. 1287 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1288 line.strip())
1289
1290 - def posted_date(self, line):
1291 self._dr.posted_date.append(_re_search( 1292 r"Posted date:\s*(.+)$", line, 1293 "I could not find the posted date in line\n%s" % line))
1294
1295 - def num_letters_in_database(self, line):
1296 letters, = _get_cols( 1297 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1298 self._dr.num_letters_in_database.append(_safe_int(letters))
1299
1300 - def num_sequences_in_database(self, line):
1301 sequences, = _get_cols( 1302 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1303 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1304
1305 - def ka_params(self, line):
1306 self._dr.ka_params = [_safe_float(x) for x in line.split()]
1307
1308 - def gapped(self, line):
1309 self._dr.gapped = 1
1310
1311 - def ka_params_gap(self, line):
1312 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1313
1314 - def end_database_report(self):
1315 pass
1316 1317
1318 -class _ParametersConsumer(object):
1319 - def start_parameters(self):
1320 self._params = Record.Parameters()
1321
1322 - def matrix(self, line):
1323 self._params.matrix = line[8:].rstrip()
1324
1325 - def gap_penalties(self, line):
1326 self._params.gap_penalties = [_safe_float(x) for x in _get_cols( 1327 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"})]
1328
1329 - def num_hits(self, line):
1330 if '1st pass' in line: 1331 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1332 self._params.num_hits = _safe_int(x) 1333 else: 1334 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1335 self._params.num_hits = _safe_int(x)
1336
1337 - def num_sequences(self, line):
1338 if '1st pass' in line: 1339 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1340 self._params.num_sequences = _safe_int(x) 1341 else: 1342 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1343 self._params.num_sequences = _safe_int(x)
1344
1345 - def num_extends(self, line):
1346 if '1st pass' in line: 1347 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1348 self._params.num_extends = _safe_int(x) 1349 else: 1350 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1351 self._params.num_extends = _safe_int(x)
1352
1353 - def num_good_extends(self, line):
1354 if '1st pass' in line: 1355 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1356 self._params.num_good_extends = _safe_int(x) 1357 else: 1358 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1359 self._params.num_good_extends = _safe_int(x)
1360
1361 - def num_seqs_better_e(self, line):
1362 self._params.num_seqs_better_e, = _get_cols( 1363 line, (-1,), ncols=7, expected={2:"sequences"}) 1364 self._params.num_seqs_better_e = _safe_int( 1365 self._params.num_seqs_better_e)
1366
1367 - def hsps_no_gap(self, line):
1368 self._params.hsps_no_gap, = _get_cols( 1369 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1370 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1371
1372 - def hsps_prelim_gapped(self, line):
1373 self._params.hsps_prelim_gapped, = _get_cols( 1374 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1375 self._params.hsps_prelim_gapped = _safe_int( 1376 self._params.hsps_prelim_gapped)
1377
1378 - def hsps_prelim_gapped_attempted(self, line):
1379 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1380 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1381 self._params.hsps_prelim_gapped_attempted = _safe_int( 1382 self._params.hsps_prelim_gapped_attempted)
1383
1384 - def hsps_gapped(self, line):
1385 self._params.hsps_gapped, = _get_cols( 1386 line, (-1,), ncols=6, expected={3:"gapped"}) 1387 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1388
1389 - def query_length(self, line):
1390 self._params.query_length, = _get_cols( 1391 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1392 self._params.query_length = _safe_int(self._params.query_length)
1393
1394 - def database_length(self, line):
1395 self._params.database_length, = _get_cols( 1396 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1397 self._params.database_length = _safe_int(self._params.database_length)
1398
1399 - def effective_hsp_length(self, line):
1400 self._params.effective_hsp_length, = _get_cols( 1401 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1402 self._params.effective_hsp_length = _safe_int( 1403 self._params.effective_hsp_length)
1404
1405 - def effective_query_length(self, line):
1406 self._params.effective_query_length, = _get_cols( 1407 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1408 self._params.effective_query_length = _safe_int( 1409 self._params.effective_query_length)
1410
1411 - def effective_database_length(self, line):
1412 self._params.effective_database_length, = _get_cols( 1413 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1414 self._params.effective_database_length = _safe_int( 1415 self._params.effective_database_length)
1416
1417 - def effective_search_space(self, line):
1418 self._params.effective_search_space, = _get_cols( 1419 line, (-1,), ncols=4, expected={1:"search"}) 1420 self._params.effective_search_space = _safe_int( 1421 self._params.effective_search_space)
1422
1423 - def effective_search_space_used(self, line):
1424 self._params.effective_search_space_used, = _get_cols( 1425 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1426 self._params.effective_search_space_used = _safe_int( 1427 self._params.effective_search_space_used)
1428
1429 - def frameshift(self, line):
1430 self._params.frameshift = _get_cols( 1431 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1432
1433 - def threshold(self, line):
1434 if line[:2] == "T:": 1435 #Assume its an old stlye line like "T: 123" 1436 self._params.threshold, = _get_cols( 1437 line, (1,), ncols=2, expected={0:"T:"}) 1438 elif line[:28] == "Neighboring words threshold:": 1439 self._params.threshold, = _get_cols( 1440 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1441 else: 1442 raise ValueError("Unrecognised threshold line:\n%s" % line) 1443 self._params.threshold = _safe_int(self._params.threshold)
1444
1445 - def window_size(self, line):
1446 if line[:2] == "A:": 1447 self._params.window_size, = _get_cols( 1448 line, (1,), ncols=2, expected={0:"A:"}) 1449 elif line[:25] == "Window for multiple hits:": 1450 self._params.window_size, = _get_cols( 1451 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1452 else: 1453 raise ValueError("Unrecognised window size line:\n%s" % line) 1454 self._params.window_size = _safe_int(self._params.window_size)
1455
1456 - def dropoff_1st_pass(self, line):
1457 score, bits = _re_search( 1458 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1459 "I could not find the dropoff in line\n%s" % line) 1460 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1461
1462 - def gap_x_dropoff(self, line):
1463 score, bits = _re_search( 1464 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1465 "I could not find the gap dropoff in line\n%s" % line) 1466 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1467
1468 - def gap_x_dropoff_final(self, line):
1469 score, bits = _re_search( 1470 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1471 "I could not find the gap dropoff final in line\n%s" % line) 1472 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1473
1474 - def gap_trigger(self, line):
1475 score, bits = _re_search( 1476 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1477 "I could not find the gap trigger in line\n%s" % line) 1478 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1479
1480 - def blast_cutoff(self, line):
1481 score, bits = _re_search( 1482 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1483 "I could not find the blast cutoff in line\n%s" % line) 1484 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1485
1486 - def end_parameters(self):
1487 pass
1488 1489
1490 -class _BlastConsumer(AbstractConsumer, 1491 _HeaderConsumer, 1492 _DescriptionConsumer, 1493 _AlignmentConsumer, 1494 _HSPConsumer, 1495 _DatabaseReportConsumer, 1496 _ParametersConsumer 1497 ):
1498 # This Consumer is inherits from many other consumer classes that handle 1499 # the actual dirty work. An alternate way to do it is to create objects 1500 # of those classes and then delegate the parsing tasks to them in a 1501 # decorator-type pattern. The disadvantage of that is that the method 1502 # names will need to be resolved in this classes. However, using 1503 # a decorator will retain more control in this class (which may or 1504 # may not be a bad thing). In addition, having each sub-consumer as 1505 # its own object prevents this object's dictionary from being cluttered 1506 # with members and reduces the chance of member collisions.
1507 - def __init__(self):
1508 self.data = None
1509
1510 - def round(self, line):
1511 # Make sure nobody's trying to pass me PSI-BLAST data! 1512 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1513
1514 - def start_header(self):
1515 self.data = Record.Blast() 1516 _HeaderConsumer.start_header(self)
1517
1518 - def end_header(self):
1519 _HeaderConsumer.end_header(self) 1520 self.data.__dict__.update(self._header.__dict__)
1521
1522 - def end_descriptions(self):
1523 self.data.descriptions = self._descriptions
1524
1525 - def end_alignment(self):
1526 _AlignmentConsumer.end_alignment(self) 1527 if self._alignment.hsps: 1528 self.data.alignments.append(self._alignment) 1529 if self._multiple_alignment.alignment: 1530 self.data.multiple_alignment = self._multiple_alignment
1531
1532 - def end_hsp(self):
1533 _HSPConsumer.end_hsp(self) 1534 try: 1535 self._alignment.hsps.append(self._hsp) 1536 except AttributeError: 1537 raise ValueError("Found an HSP before an alignment")
1538
1539 - def end_database_report(self):
1540 _DatabaseReportConsumer.end_database_report(self) 1541 self.data.__dict__.update(self._dr.__dict__)
1542
1543 - def end_parameters(self):
1544 _ParametersConsumer.end_parameters(self) 1545 self.data.__dict__.update(self._params.__dict__)
1546 1547
1548 -class _PSIBlastConsumer(AbstractConsumer, 1549 _HeaderConsumer, 1550 _DescriptionConsumer, 1551 _AlignmentConsumer, 1552 _HSPConsumer, 1553 _DatabaseReportConsumer, 1554 _ParametersConsumer 1555 ):
1556 - def __init__(self):
1557 self.data = None
1558
1559 - def start_header(self):
1560 self.data = Record.PSIBlast() 1561 _HeaderConsumer.start_header(self)
1562
1563 - def end_header(self):
1564 _HeaderConsumer.end_header(self) 1565 self.data.__dict__.update(self._header.__dict__)
1566
1567 - def start_descriptions(self):
1568 self._round = Record.Round() 1569 self.data.rounds.append(self._round) 1570 _DescriptionConsumer.start_descriptions(self)
1571
1572 - def end_descriptions(self):
1573 _DescriptionConsumer.end_descriptions(self) 1574 self._round.number = self._roundnum 1575 if self._descriptions: 1576 self._round.new_seqs.extend(self._descriptions) 1577 self._round.reused_seqs.extend(self._model_sequences) 1578 self._round.new_seqs.extend(self._nonmodel_sequences) 1579 if self._converged: 1580 self.data.converged = 1
1581
1582 - def end_alignment(self):
1583 _AlignmentConsumer.end_alignment(self) 1584 if self._alignment.hsps: 1585 self._round.alignments.append(self._alignment) 1586 if self._multiple_alignment: 1587 self._round.multiple_alignment = self._multiple_alignment
1588
1589 - def end_hsp(self):
1590 _HSPConsumer.end_hsp(self) 1591 try: 1592 self._alignment.hsps.append(self._hsp) 1593 except AttributeError: 1594 raise ValueError("Found an HSP before an alignment")
1595
1596 - def end_database_report(self):
1597 _DatabaseReportConsumer.end_database_report(self) 1598 self.data.__dict__.update(self._dr.__dict__)
1599
1600 - def end_parameters(self):
1601 _ParametersConsumer.end_parameters(self) 1602 self.data.__dict__.update(self._params.__dict__)
1603 1604
1605 -class Iterator(object):
1606 """Iterates over a file of multiple BLAST results. 1607 1608 Methods: 1609 next Return the next record from the stream, or None. 1610 1611 """
1612 - def __init__(self, handle, parser=None):
1613 """__init__(self, handle, parser=None) 1614 1615 Create a new iterator. handle is a file-like object. parser 1616 is an optional Parser object to change the results into another form. 1617 If set to None, then the raw contents of the file will be returned. 1618 1619 """ 1620 try: 1621 handle.readline 1622 except AttributeError: 1623 raise ValueError( 1624 "I expected a file handle or file-like object, got %s" 1625 % type(handle)) 1626 self._uhandle = File.UndoHandle(handle) 1627 self._parser = parser 1628 self._header = []
1629
1630 - def __next__(self):
1631 """next(self) -> object 1632 1633 Return the next Blast record from the file. If no more records, 1634 return None. 1635 1636 """ 1637 lines = [] 1638 query = False 1639 while True: 1640 line = self._uhandle.readline() 1641 if not line: 1642 break 1643 # If I've reached the next one, then put the line back and stop. 1644 if lines and (line.startswith('BLAST') 1645 or line.startswith('BLAST', 1) 1646 or line.startswith('<?xml ')): 1647 self._uhandle.saveline(line) 1648 break 1649 # New style files omit the BLAST line to mark a new query: 1650 if line.startswith("Query="): 1651 if not query: 1652 if not self._header: 1653 self._header = lines[:] 1654 query = True 1655 else: 1656 #Start of another record 1657 self._uhandle.saveline(line) 1658 break 1659 lines.append(line) 1660 1661 if query and "BLAST" not in lines[0]: 1662 #Cheat and re-insert the header 1663 #print "-"*50 1664 #print "".join(self._header) 1665 #print "-"*50 1666 #print "".join(lines) 1667 #print "-"*50 1668 lines = self._header + lines 1669 1670 if not lines: 1671 return None 1672 1673 data = ''.join(lines) 1674 if self._parser is not None: 1675 return self._parser.parse(StringIO(data)) 1676 return data
1677 1678 if sys.version_info[0] < 3:
1679 - def next(self):
1680 """Python 2 style alias for Python 3 style __next__ method.""" 1681 return self.__next__()
1682
1683 - def __iter__(self):
1684 return iter(self.__next__, None)
1685 1686
1687 -def _re_search(regex, line, error_msg):
1688 m = re.search(regex, line) 1689 if not m: 1690 raise ValueError(error_msg) 1691 return m.groups()
1692 1693
1694 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1695 cols = line.split() 1696 1697 # Check to make sure number of columns is correct 1698 if ncols is not None and len(cols) != ncols: 1699 raise ValueError("I expected %d columns (got %d) in line\n%s" 1700 % (ncols, len(cols), line)) 1701 1702 # Check to make sure columns contain the correct data 1703 for k in expected: 1704 if cols[k] != expected[k]: 1705 raise ValueError("I expected '%s' in column %d in line\n%s" 1706 % (expected[k], k, line)) 1707 1708 # Construct the answer tuple 1709 results = [] 1710 for c in cols_to_get: 1711 results.append(cols[c]) 1712 return tuple(results)
1713 1714
1715 -def _safe_int(str):
1716 try: 1717 return int(str) 1718 except ValueError: 1719 # Something went wrong. Try to clean up the string. 1720 # Remove all commas from the string 1721 str = str.replace(',', '') 1722 # try again after removing commas. 1723 # Note int() will return a long rather than overflow 1724 try: 1725 return int(str) 1726 except ValueError: 1727 pass 1728 # Call float to handle things like "54.3", note could lose precision, e.g. 1729 # >>> int("5399354557888517312") 1730 # 5399354557888517312 1731 # >>> int(float("5399354557888517312")) 1732 # 5399354557888517120 1733 return int(float(str))
1734 1735
1736 -def _safe_float(str):
1737 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1738 # float('e-172') does not produce an error on his platform. Thus, 1739 # we need to check the string for this condition. 1740 1741 # Sometimes BLAST leaves of the '1' in front of an exponent. 1742 if str and str[0] in ['E', 'e']: 1743 str = '1' + str 1744 try: 1745 return float(str) 1746 except ValueError: 1747 # Remove all commas from the string 1748 str = str.replace(',', '') 1749 # try again. 1750 return float(str)
1751 1752
1753 -class _BlastErrorConsumer(_BlastConsumer):
1754 - def __init__(self):
1756
1757 - def noevent(self, line):
1758 if 'Query must be at least wordsize' in line: 1759 raise ShortQueryBlastError("Query must be at least wordsize") 1760 # Now pass the line back up to the superclass. 1761 method = getattr(_BlastConsumer, 'noevent', 1762 _BlastConsumer.__getattr__(self, 'noevent')) 1763 method(line)
1764 1765
1766 -class BlastErrorParser(AbstractParser):
1767 """Attempt to catch and diagnose BLAST errors while parsing. 1768 1769 This utilizes the BlastParser module but adds an additional layer 1770 of complexity on top of it by attempting to diagnose ValueErrors 1771 that may actually indicate problems during BLAST parsing. 1772 1773 Current BLAST problems this detects are: 1774 o LowQualityBlastError - When BLASTing really low quality sequences 1775 (ie. some GenBank entries which are just short streches of a single 1776 nucleotide), BLAST will report an error with the sequence and be 1777 unable to search with this. This will lead to a badly formatted 1778 BLAST report that the parsers choke on. The parser will convert the 1779 ValueError to a LowQualityBlastError and attempt to provide useful 1780 information. 1781 """
1782 - def __init__(self, bad_report_handle = None):
1783 """Initialize a parser that tries to catch BlastErrors. 1784 1785 Arguments: 1786 o bad_report_handle - An optional argument specifying a handle 1787 where bad reports should be sent. This would allow you to save 1788 all of the bad reports to a file, for instance. If no handle 1789 is specified, the bad reports will not be saved. 1790 """ 1791 self._bad_report_handle = bad_report_handle 1792 1793 #self._b_parser = BlastParser() 1794 self._scanner = _Scanner() 1795 self._consumer = _BlastErrorConsumer()
1796
1797 - def parse(self, handle):
1798 """Parse a handle, attempting to diagnose errors. 1799 """ 1800 results = handle.read() 1801 1802 try: 1803 self._scanner.feed(StringIO(results), self._consumer) 1804 except ValueError: 1805 # if we have a bad_report_file, save the info to it first 1806 if self._bad_report_handle: 1807 # send the info to the error handle 1808 self._bad_report_handle.write(