Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (OBSOLETE). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23   
  24  Classes: 
  25  LowQualityBlastError     Except that indicates low quality query sequences. 
  26  BlastParser              Parses output from blast. 
  27  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  28  PSIBlastParser           Parses output from psi-blast. 
  29  Iterator                 Iterates over a file of blast results. 
  30   
  31  _Scanner                 Scans output from standalone BLAST. 
  32  _BlastConsumer           Consumes output from blast. 
  33  _PSIBlastConsumer        Consumes output from psi-blast. 
  34  _HeaderConsumer          Consumes header information. 
  35  _DescriptionConsumer     Consumes description information. 
  36  _AlignmentConsumer       Consumes alignment information. 
  37  _HSPConsumer             Consumes hsp information. 
  38  _DatabaseReportConsumer  Consumes database report information. 
  39  _ParametersConsumer      Consumes parameters information. 
  40   
  41  Functions: 
  42  blastall        Execute blastall (OBSOLETE). 
  43  blastpgp        Execute blastpgp (OBSOLETE). 
  44  rpsblast        Execute rpsblast (OBSOLETE). 
  45   
  46  For calling the BLAST command line tools, we encourage you to use the 
  47  command line wrappers in Bio.Blast.Applications - the three functions 
  48  blastall, blastpgp and rpsblast are considered to be obsolete now, and 
  49  are likely to be deprecated and then removed in future releases. 
  50  """ 
  51   
  52  import warnings 
  53  warnings.warn("The plain text parser in this module still works at the time of writing, but is considered obsolete and updating it to cope with the latest versions of BLAST is not a priority for us.", PendingDeprecationWarning) 
  54   
  55  from Bio import BiopythonDeprecationWarning 
  56   
  57  import os 
  58  import re 
  59  import StringIO 
  60   
  61  from Bio import File 
  62  from Bio.ParserSupport import * 
  63  from Bio.Blast import Record 
  64  from Bio.Application import _escape_filename 
  65   
  66   
67 -class LowQualityBlastError(Exception):
68 """Error caused by running a low quality sequence through BLAST. 69 70 When low quality sequences (like GenBank entries containing only 71 stretches of a single nucleotide) are BLASTed, they will result in 72 BLAST generating an error and not being able to perform the BLAST. 73 search. This error should be raised for the BLAST reports produced 74 in this case. 75 """ 76 pass
77 78
79 -class ShortQueryBlastError(Exception):
80 """Error caused by running a short query sequence through BLAST. 81 82 If the query sequence is too short, BLAST outputs warnings and errors: 83 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 84 [blastall] ERROR: [000.000] AT1G08320: Blast: 85 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 86 done 87 88 This exception is raised when that condition is detected. 89 90 """ 91 pass
92 93
94 -class _Scanner(object):
95 """Scan BLAST output from blastall or blastpgp. 96 97 Tested with blastall and blastpgp v2.0.10, v2.0.11 98 99 Methods: 100 feed Feed data into the scanner. 101 """
102 - def feed(self, handle, consumer):
103 """S.feed(handle, consumer) 104 105 Feed in a BLAST report for scanning. handle is a file-like 106 object that contains the BLAST report. consumer is a Consumer 107 object that will receive events as the report is scanned. 108 109 """ 110 if isinstance(handle, File.UndoHandle): 111 uhandle = handle 112 else: 113 uhandle = File.UndoHandle(handle) 114 115 # Try to fast-forward to the beginning of the blast report. 116 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 117 # Now scan the BLAST report. 118 self._scan_header(uhandle, consumer) 119 self._scan_rounds(uhandle, consumer) 120 self._scan_database_report(uhandle, consumer) 121 self._scan_parameters(uhandle, consumer)
122
123 - def _scan_header(self, uhandle, consumer):
124 # BLASTP 2.0.10 [Aug-26-1999] 125 # 126 # 127 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 128 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 129 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 130 # programs", Nucleic Acids Res. 25:3389-3402. 131 # 132 # Query= test 133 # (140 letters) 134 # 135 # Database: sdqib40-1.35.seg.fa 136 # 1323 sequences; 223,339 total letters 137 # 138 # ======================================================== 139 # This next example is from the online version of Blast, 140 # note there are TWO references, an RID line, and also 141 # the database is BEFORE the query line. 142 # Note there possibleuse of non-ASCII in the author names. 143 # ======================================================== 144 # 145 # BLASTP 2.2.15 [Oct-15-2006] 146 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 147 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 148 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 149 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 150 # 151 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 152 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 153 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 154 # protein database searches with composition-based statistics 155 # and other refinements", Nucleic Acids Res. 29:2994-3005. 156 # 157 # RID: 1166022616-19998-65316425856.BLASTQ1 158 # 159 # 160 # Database: All non-redundant GenBank CDS 161 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 162 # 4,254,166 sequences; 1,462,033,012 total letters 163 # Query= gi:16127998 164 # Length=428 165 # 166 167 consumer.start_header() 168 169 read_and_call(uhandle, consumer.version, contains='BLAST') 170 read_and_call_while(uhandle, consumer.noevent, blank=1) 171 172 # There might be a <pre> line, for qblast output. 173 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 174 175 # Read the reference(s) 176 while attempt_read_and_call(uhandle, 177 consumer.reference, start='Reference'): 178 # References are normally multiline terminated by a blank line 179 # (or, based on the old code, the RID line) 180 while 1: 181 line = uhandle.readline() 182 if is_blank_line(line): 183 consumer.noevent(line) 184 break 185 elif line.startswith("RID"): 186 break 187 else: 188 #More of the reference 189 consumer.reference(line) 190 191 #Deal with the optional RID: ... 192 read_and_call_while(uhandle, consumer.noevent, blank=1) 193 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 194 read_and_call_while(uhandle, consumer.noevent, blank=1) 195 196 # blastpgp may have a reference for compositional score matrix 197 # adjustment (see Bug 2502): 198 if attempt_read_and_call( 199 uhandle, consumer.reference, start="Reference"): 200 read_and_call_until(uhandle, consumer.reference, blank=1) 201 read_and_call_while(uhandle, consumer.noevent, blank=1) 202 203 # blastpgp has a Reference for composition-based statistics. 204 if attempt_read_and_call( 205 uhandle, consumer.reference, start="Reference"): 206 read_and_call_until(uhandle, consumer.reference, blank=1) 207 read_and_call_while(uhandle, consumer.noevent, blank=1) 208 209 line = uhandle.peekline() 210 assert line.strip() != "" 211 assert not line.startswith("RID:") 212 if line.startswith("Query="): 213 #This is an old style query then database... 214 215 # Read the Query lines and the following blank line. 216 read_and_call(uhandle, consumer.query_info, start='Query=') 217 read_and_call_until(uhandle, consumer.query_info, blank=1) 218 read_and_call_while(uhandle, consumer.noevent, blank=1) 219 220 # Read the database lines and the following blank line. 221 read_and_call_until(uhandle, consumer.database_info, end='total letters') 222 read_and_call(uhandle, consumer.database_info, contains='sequences') 223 read_and_call_while(uhandle, consumer.noevent, blank=1) 224 elif line.startswith("Database:"): 225 #This is a new style database then query... 226 read_and_call_until(uhandle, consumer.database_info, end='total letters') 227 read_and_call(uhandle, consumer.database_info, contains='sequences') 228 read_and_call_while(uhandle, consumer.noevent, blank=1) 229 230 # Read the Query lines and the following blank line. 231 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 232 # the "... Score E" line instead. 233 read_and_call(uhandle, consumer.query_info, start='Query=') 234 # BLAST 2.2.25+ has a blank line before Length= 235 read_and_call_until(uhandle, consumer.query_info, start='Length=') 236 while True: 237 line = uhandle.peekline() 238 if not line.strip() or "Score E" in line: 239 break 240 #It is more of the query (and its length) 241 read_and_call(uhandle, consumer.query_info) 242 read_and_call_while(uhandle, consumer.noevent, blank=1) 243 else: 244 raise ValueError("Invalid header?") 245 246 consumer.end_header()
247
248 - def _scan_rounds(self, uhandle, consumer):
249 # Scan a bunch of rounds. 250 # Each round begins with either a "Searching......" line 251 # or a 'Score E' line followed by descriptions and alignments. 252 # The email server doesn't give the "Searching....." line. 253 # If there is no 'Searching.....' line then you'll first see a 254 # 'Results from round' line 255 256 while not self._eof(uhandle): 257 line = safe_peekline(uhandle) 258 if not line.startswith('Searching') and \ 259 not line.startswith('Results from round') and \ 260 re.search(r"Score +E", line) is None and \ 261 'No hits found' not in line: 262 break 263 self._scan_descriptions(uhandle, consumer) 264 self._scan_alignments(uhandle, consumer)
265
266 - def _scan_descriptions(self, uhandle, consumer):
267 # Searching..................................................done 268 # Results from round 2 269 # 270 # 271 # Sc 272 # Sequences producing significant alignments: (b 273 # Sequences used in model and found again: 274 # 275 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 276 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 277 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 278 # 279 # Sequences not found previously or not previously below threshold: 280 # 281 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 282 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 283 # 284 285 # If PSI-BLAST, may also have: 286 # 287 # CONVERGED! 288 289 consumer.start_descriptions() 290 291 # Read 'Searching' 292 # This line seems to be missing in BLASTN 2.1.2 (others?) 293 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 294 295 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 296 # If this happens, the handle will yield no more information. 297 if not uhandle.peekline(): 298 raise ValueError("Unexpected end of blast report. " + 299 "Looks suspiciously like a PSI-BLAST crash.") 300 301 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 302 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 303 # [blastall] ERROR: [000.000] AT1G08320: Blast: 304 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 305 # done 306 # Reported by David Weisman. 307 # Check for these error lines and ignore them for now. Let 308 # the BlastErrorParser deal with them. 309 line = uhandle.peekline() 310 if "ERROR:" in line or line.startswith("done"): 311 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 312 read_and_call(uhandle, consumer.noevent, start="done") 313 314 # Check to see if this is PSI-BLAST. 315 # If it is, the 'Searching' line will be followed by: 316 # (version 2.0.10) 317 # Searching............................. 318 # Results from round 2 319 # or (version 2.0.11) 320 # Searching............................. 321 # 322 # 323 # Results from round 2 324 325 # Skip a bunch of blank lines. 326 read_and_call_while(uhandle, consumer.noevent, blank=1) 327 # Check for the results line if it's there. 328 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 329 read_and_call_while(uhandle, consumer.noevent, blank=1) 330 331 # Three things can happen here: 332 # 1. line contains 'Score E' 333 # 2. line contains "No hits found" 334 # 3. no descriptions 335 # The first one begins a bunch of descriptions. The last two 336 # indicates that no descriptions follow, and we should go straight 337 # to the alignments. 338 if not attempt_read_and_call( 339 uhandle, consumer.description_header, 340 has_re=re.compile(r'Score +E')): 341 # Either case 2 or 3. Look for "No hits found". 342 attempt_read_and_call(uhandle, consumer.no_hits, 343 contains='No hits found') 344 try: 345 read_and_call_while(uhandle, consumer.noevent, blank=1) 346 except ValueError, err: 347 if str(err) != "Unexpected end of stream.": 348 raise err 349 350 consumer.end_descriptions() 351 # Stop processing. 352 return 353 354 # Read the score header lines 355 read_and_call(uhandle, consumer.description_header, 356 start='Sequences producing') 357 358 # If PSI-BLAST, read the 'Sequences used in model' line. 359 attempt_read_and_call(uhandle, consumer.model_sequences, 360 start='Sequences used in model') 361 read_and_call_while(uhandle, consumer.noevent, blank=1) 362 363 # In BLAT, rather than a "No hits found" line, we just 364 # get no descriptions (and no alignments). This can be 365 # spotted because the next line is the database block: 366 if safe_peekline(uhandle).startswith(" Database:"): 367 consumer.end_descriptions() 368 # Stop processing. 369 return 370 371 # Read the descriptions and the following blank lines, making 372 # sure that there are descriptions. 373 if not uhandle.peekline().startswith('Sequences not found'): 374 read_and_call_until(uhandle, consumer.description, blank=1) 375 read_and_call_while(uhandle, consumer.noevent, blank=1) 376 377 # If PSI-BLAST, read the 'Sequences not found' line followed 378 # by more descriptions. However, I need to watch out for the 379 # case where there were no sequences not found previously, in 380 # which case there will be no more descriptions. 381 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 382 start='Sequences not found'): 383 # Read the descriptions and the following blank lines. 384 read_and_call_while(uhandle, consumer.noevent, blank=1) 385 l = safe_peekline(uhandle) 386 # Brad -- added check for QUERY. On some PSI-BLAST outputs 387 # there will be a 'Sequences not found' line followed by no 388 # descriptions. Check for this case since the first thing you'll 389 # get is a blank line and then 'QUERY' 390 if not l.startswith('CONVERGED') and l[0] != '>' \ 391 and not l.startswith('QUERY'): 392 read_and_call_until(uhandle, consumer.description, blank=1) 393 read_and_call_while(uhandle, consumer.noevent, blank=1) 394 395 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 396 read_and_call_while(uhandle, consumer.noevent, blank=1) 397 398 consumer.end_descriptions()
399
400 - def _scan_alignments(self, uhandle, consumer):
401 if self._eof(uhandle): 402 return 403 404 # qblast inserts a helpful line here. 405 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 406 407 # First, check to see if I'm at the database report. 408 line = safe_peekline(uhandle) 409 if not line: 410 #EOF 411 return 412 elif line.startswith(' Database') or line.startswith("Lambda"): 413 return 414 elif line[0] == '>': 415 # XXX make a better check here between pairwise and masterslave 416 self._scan_pairwise_alignments(uhandle, consumer) 417 elif line.startswith('Effective'): 418 return 419 else: 420 # XXX put in a check to make sure I'm in a masterslave alignment 421 self._scan_masterslave_alignment(uhandle, consumer)
422
423 - def _scan_pairwise_alignments(self, uhandle, consumer):
424 while not self._eof(uhandle): 425 line = safe_peekline(uhandle) 426 if line[0] != '>': 427 break 428 self._scan_one_pairwise_alignment(uhandle, consumer)
429
430 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
431 if self._eof(uhandle): 432 return 433 consumer.start_alignment() 434 435 self._scan_alignment_header(uhandle, consumer) 436 437 # Scan a bunch of score/alignment pairs. 438 while 1: 439 if self._eof(uhandle): 440 #Shouldn't have issued that _scan_alignment_header event... 441 break 442 line = safe_peekline(uhandle) 443 if not line.startswith(' Score'): 444 break 445 self._scan_hsp(uhandle, consumer) 446 consumer.end_alignment()
447
448 - def _scan_alignment_header(self, uhandle, consumer):
449 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 450 # stearothermophilus] 451 # Length = 81 452 # 453 # Or, more recently with different white space: 454 # 455 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 456 # gi|15829258|ref|NP_308031.1| threonine synthase 457 # ... 458 # Length=428 459 read_and_call(uhandle, consumer.title, start='>') 460 while 1: 461 line = safe_readline(uhandle) 462 if line.lstrip().startswith('Length =') \ 463 or line.lstrip().startswith('Length='): 464 consumer.length(line) 465 break 466 elif is_blank_line(line): 467 # Check to make sure I haven't missed the Length line 468 raise ValueError("I missed the Length in an alignment header") 469 consumer.title(line) 470 471 # Older versions of BLAST will have a line with some spaces. 472 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 473 if not attempt_read_and_call(uhandle, consumer.noevent, 474 start=' '): 475 read_and_call(uhandle, consumer.noevent, blank=1)
476
477 - def _scan_hsp(self, uhandle, consumer):
478 consumer.start_hsp() 479 self._scan_hsp_header(uhandle, consumer) 480 self._scan_hsp_alignment(uhandle, consumer) 481 consumer.end_hsp()
482
483 - def _scan_hsp_header(self, uhandle, consumer):
484 # Score = 22.7 bits (47), Expect = 2.5 485 # Identities = 10/36 (27%), Positives = 18/36 (49%) 486 # Strand = Plus / Plus 487 # Frame = +3 488 # 489 490 read_and_call(uhandle, consumer.score, start=' Score') 491 read_and_call(uhandle, consumer.identities, start=' Identities') 492 # BLASTN 493 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 494 # BLASTX, TBLASTN, TBLASTX 495 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 496 read_and_call(uhandle, consumer.noevent, blank=1)
497
498 - def _scan_hsp_alignment(self, uhandle, consumer):
499 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 500 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 501 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 502 # 503 # Query: 64 AEKILIKR 71 504 # I +K 505 # Sbjct: 70 PNIIQLKD 77 506 # 507 508 while 1: 509 # Blastn adds an extra line filled with spaces before Query 510 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 511 read_and_call(uhandle, consumer.query, start='Query') 512 read_and_call(uhandle, consumer.align, start=' ') 513 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 514 try: 515 read_and_call_while(uhandle, consumer.noevent, blank=1) 516 except ValueError, err: 517 if str(err) != "Unexpected end of stream.": 518 raise err 519 # End of File (well, it looks like it with recent versions 520 # of BLAST for multiple queries after the Iterator class 521 # has broken up the whole file into chunks). 522 break 523 line = safe_peekline(uhandle) 524 # Alignment continues if I see a 'Query' or the spaces for Blastn. 525 if not (line.startswith('Query') or line.startswith(' ')): 526 break
527
528 - def _scan_masterslave_alignment(self, uhandle, consumer):
529 consumer.start_alignment() 530 while 1: 531 line = safe_readline(uhandle) 532 # Check to see whether I'm finished reading the alignment. 533 # This is indicated by 1) database section, 2) next psi-blast 534 # round, which can also be a 'Results from round' if no 535 # searching line is present 536 # patch by chapmanb 537 if line.startswith('Searching') or \ 538 line.startswith('Results from round'): 539 uhandle.saveline(line) 540 break 541 elif line.startswith(' Database'): 542 uhandle.saveline(line) 543 break 544 elif is_blank_line(line): 545 consumer.noevent(line) 546 else: 547 consumer.multalign(line) 548 read_and_call_while(uhandle, consumer.noevent, blank=1) 549 consumer.end_alignment()
550
551 - def _eof(self, uhandle):
552 try: 553 line = safe_peekline(uhandle) 554 except ValueError, err: 555 if str(err) != "Unexpected end of stream.": 556 raise err 557 line = "" 558 return not line
559
560 - def _scan_database_report(self, uhandle, consumer):
561 # Database: sdqib40-1.35.seg.fa 562 # Posted date: Nov 1, 1999 4:25 PM 563 # Number of letters in database: 223,339 564 # Number of sequences in database: 1323 565 # 566 # Lambda K H 567 # 0.322 0.133 0.369 568 # 569 # Gapped 570 # Lambda K H 571 # 0.270 0.0470 0.230 572 # 573 ########################################## 574 # Or, more recently Blast 2.2.15 gives less blank lines 575 ########################################## 576 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 577 # environmental samples 578 # Posted date: Dec 12, 2006 5:51 PM 579 # Number of letters in database: 667,088,753 580 # Number of sequences in database: 2,094,974 581 # Lambda K H 582 # 0.319 0.136 0.395 583 # Gapped 584 # Lambda K H 585 # 0.267 0.0410 0.140 586 587 if self._eof(uhandle): 588 return 589 590 consumer.start_database_report() 591 592 # Subset of the database(s) listed below 593 # Number of letters searched: 562,618,960 594 # Number of sequences searched: 228,924 595 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 596 read_and_call(uhandle, consumer.noevent, contains="letters") 597 read_and_call(uhandle, consumer.noevent, contains="sequences") 598 read_and_call(uhandle, consumer.noevent, start=" ") 599 600 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 601 # was missing the "Database" stanza completely. 602 while attempt_read_and_call(uhandle, consumer.database, 603 start=' Database'): 604 # BLAT output ends abruptly here, without any of the other 605 # information. Check to see if this is the case. If so, 606 # then end the database report here gracefully. 607 if not uhandle.peekline().strip() \ 608 or uhandle.peekline().startswith("BLAST"): 609 consumer.end_database_report() 610 return 611 612 # Database can span multiple lines. 613 read_and_call_until(uhandle, consumer.database, start=' Posted') 614 read_and_call(uhandle, consumer.posted_date, start=' Posted') 615 read_and_call(uhandle, consumer.num_letters_in_database, 616 start=' Number of letters') 617 read_and_call(uhandle, consumer.num_sequences_in_database, 618 start=' Number of sequences') 619 #There may not be a line starting with spaces... 620 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 621 622 line = safe_readline(uhandle) 623 uhandle.saveline(line) 624 if 'Lambda' in line: 625 break 626 627 try: 628 read_and_call(uhandle, consumer.noevent, start='Lambda') 629 read_and_call(uhandle, consumer.ka_params) 630 except: 631 pass 632 633 #This blank line is optional: 634 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 635 636 # not BLASTP 637 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 638 # not TBLASTX 639 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 640 read_and_call(uhandle, consumer.ka_params_gap) 641 642 # Blast 2.2.4 can sometimes skip the whole parameter section. 643 # Thus, I need to be careful not to read past the end of the 644 # file. 645 try: 646 read_and_call_while(uhandle, consumer.noevent, blank=1) 647 except ValueError, x: 648 if str(x) != "Unexpected end of stream.": 649 raise 650 consumer.end_database_report()
651
652 - def _scan_parameters(self, uhandle, consumer):
653 # Matrix: BLOSUM62 654 # Gap Penalties: Existence: 11, Extension: 1 655 # Number of Hits to DB: 50604 656 # Number of Sequences: 1323 657 # Number of extensions: 1526 658 # Number of successful extensions: 6 659 # Number of sequences better than 10.0: 5 660 # Number of HSP's better than 10.0 without gapping: 5 661 # Number of HSP's successfully gapped in prelim test: 0 662 # Number of HSP's that attempted gapping in prelim test: 1 663 # Number of HSP's gapped (non-prelim): 5 664 # length of query: 140 665 # length of database: 223,339 666 # effective HSP length: 39 667 # effective length of query: 101 668 # effective length of database: 171,742 669 # effective search space: 17345942 670 # effective search space used: 17345942 671 # T: 11 672 # A: 40 673 # X1: 16 ( 7.4 bits) 674 # X2: 38 (14.8 bits) 675 # X3: 64 (24.9 bits) 676 # S1: 41 (21.9 bits) 677 # S2: 42 (20.8 bits) 678 ########################################## 679 # Or, more recently Blast(x) 2.2.15 gives 680 ########################################## 681 # Matrix: BLOSUM62 682 # Gap Penalties: Existence: 11, Extension: 1 683 # Number of Sequences: 4535438 684 # Number of Hits to DB: 2,588,844,100 685 # Number of extensions: 60427286 686 # Number of successful extensions: 126433 687 # Number of sequences better than 2.0: 30 688 # Number of HSP's gapped: 126387 689 # Number of HSP's successfully gapped: 35 690 # Length of query: 291 691 # Length of database: 1,573,298,872 692 # Length adjustment: 130 693 # Effective length of query: 161 694 # Effective length of database: 983,691,932 695 # Effective search space: 158374401052 696 # Effective search space used: 158374401052 697 # Neighboring words threshold: 12 698 # Window for multiple hits: 40 699 # X1: 16 ( 7.3 bits) 700 # X2: 38 (14.6 bits) 701 # X3: 64 (24.7 bits) 702 # S1: 41 (21.7 bits) 703 # S2: 32 (16.9 bits) 704 705 # Blast 2.2.4 can sometimes skip the whole parameter section. 706 # BLAT also skips the whole parameter section. 707 # Thus, check to make sure that the parameter section really 708 # exists. 709 if not uhandle.peekline().strip(): 710 return 711 712 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 713 # "Number of Sequences" lines. 714 consumer.start_parameters() 715 716 # Matrix line may be missing in BLASTN 2.2.9 717 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 718 # not TBLASTX 719 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 720 721 attempt_read_and_call(uhandle, consumer.num_sequences, 722 start='Number of Sequences') 723 attempt_read_and_call(uhandle, consumer.num_hits, 724 start='Number of Hits') 725 attempt_read_and_call(uhandle, consumer.num_sequences, 726 start='Number of Sequences') 727 attempt_read_and_call(uhandle, consumer.num_extends, 728 start='Number of extensions') 729 attempt_read_and_call(uhandle, consumer.num_good_extends, 730 start='Number of successful') 731 732 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 733 start='Number of sequences') 734 735 # not BLASTN, TBLASTX 736 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 737 start="Number of HSP's better"): 738 # BLASTN 2.2.9 739 if attempt_read_and_call(uhandle, consumer.noevent, 740 start="Number of HSP's gapped:"): 741 read_and_call(uhandle, consumer.noevent, 742 start="Number of HSP's successfully") 743 #This is omitted in 2.2.15 744 attempt_read_and_call(uhandle, consumer.noevent, 745 start="Number of extra gapped extensions") 746 else: 747 read_and_call(uhandle, consumer.hsps_prelim_gapped, 748 start="Number of HSP's successfully") 749 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 750 start="Number of HSP's that") 751 read_and_call(uhandle, consumer.hsps_gapped, 752 start="Number of HSP's gapped") 753 #e.g. BLASTX 2.2.15 where the "better" line is missing 754 elif attempt_read_and_call(uhandle, consumer.noevent, 755 start="Number of HSP's gapped"): 756 read_and_call(uhandle, consumer.noevent, 757 start="Number of HSP's successfully") 758 759 # not in blastx 2.2.1 760 attempt_read_and_call(uhandle, consumer.query_length, 761 has_re=re.compile(r"[Ll]ength of query")) 762 # Not in BLASTX 2.2.22+ 763 attempt_read_and_call(uhandle, consumer.database_length, 764 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 765 766 # BLASTN 2.2.9 767 attempt_read_and_call(uhandle, consumer.noevent, 768 start="Length adjustment") 769 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 770 start='effective HSP') 771 # Not in blastx 2.2.1 772 attempt_read_and_call( 773 uhandle, consumer.effective_query_length, 774 has_re=re.compile(r'[Ee]ffective length of query')) 775 776 # This is not in BLASTP 2.2.15 777 attempt_read_and_call( 778 uhandle, consumer.effective_database_length, 779 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 780 # Not in blastx 2.2.1, added a ':' to distinguish between 781 # this and the 'effective search space used' line 782 attempt_read_and_call( 783 uhandle, consumer.effective_search_space, 784 has_re=re.compile(r'[Ee]ffective search space:')) 785 # Does not appear in BLASTP 2.0.5 786 attempt_read_and_call( 787 uhandle, consumer.effective_search_space_used, 788 has_re=re.compile(r'[Ee]ffective search space used')) 789 790 # BLASTX, TBLASTN, TBLASTX 791 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 792 793 # not in BLASTN 2.2.9 794 attempt_read_and_call(uhandle, consumer.threshold, start='T') 795 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 796 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 797 798 # not in BLASTX 2.2.15 799 attempt_read_and_call(uhandle, consumer.window_size, start='A') 800 # get this instead: "Window for multiple hits: 40" 801 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 802 803 # not in BLASTX 2.2.22+ 804 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 805 # not TBLASTN 806 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 807 808 # not BLASTN, TBLASTX 809 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 810 start='X3') 811 812 # not TBLASTN 813 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 814 # not in blastx 2.2.1 815 # first we make sure we have additional lines to work with, if 816 # not then the file is done and we don't have a final S2 817 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 818 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 819 820 consumer.end_parameters()
821 822
823 -class BlastParser(AbstractParser):
824 """Parses BLAST data into a Record.Blast object. 825 826 """
827 - def __init__(self):
828 """__init__(self)""" 829 self._scanner = _Scanner() 830 self._consumer = _BlastConsumer()
831
832 - def parse(self, handle):
833 """parse(self, handle)""" 834 self._scanner.feed(handle, self._consumer) 835 return self._consumer.data
836 837
838 -class PSIBlastParser(AbstractParser):
839 """Parses BLAST data into a Record.PSIBlast object. 840 841 """
842 - def __init__(self):
843 """__init__(self)""" 844 self._scanner = _Scanner() 845 self._consumer = _PSIBlastConsumer()
846
847 - def parse(self, handle):
848 """parse(self, handle)""" 849 self._scanner.feed(handle, self._consumer) 850 return self._consumer.data
851 852
853 -class _HeaderConsumer(object):
854 - def start_header(self):
855 self._header = Record.Header()
856
857 - def version(self, line):
858 c = line.split() 859 self._header.application = c[0] 860 self._header.version = c[1] 861 if len(c) > 2: 862 #The date is missing in the new C++ output from blastx 2.2.22+ 863 #Just get "BLASTX 2.2.22+\n" and that's all. 864 self._header.date = c[2][1:-1]
865
866 - def reference(self, line):
867 if line.startswith('Reference: '): 868 self._header.reference = line[11:] 869 else: 870 self._header.reference = self._header.reference + line
871
872 - def query_info(self, line):
873 if line.startswith('Query= '): 874 self._header.query = line[7:].lstrip() 875 elif line.startswith('Length='): 876 #New style way to give the query length in BLAST 2.2.22+ (the C++ code) 877 self._header.query_letters = _safe_int(line[7:].strip()) 878 elif not line.startswith(' '): # continuation of query_info 879 self._header.query = "%s%s" % (self._header.query, line) 880 else: 881 #Hope it is the old style way to give the query length: 882 letters, = _re_search( 883 r"([0-9,]+) letters", line, 884 "I could not find the number of letters in line\n%s" % line) 885 self._header.query_letters = _safe_int(letters)
886
887 - def database_info(self, line):
888 line = line.rstrip() 889 if line.startswith('Database: '): 890 self._header.database = line[10:] 891 elif not line.endswith('total letters'): 892 if self._header.database: 893 #Need to include a space when merging multi line datase descr 894 self._header.database = self._header.database + " " + line.strip() 895 else: 896 self._header.database = line.strip() 897 else: 898 sequences, letters =_re_search( 899 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 900 "I could not find the sequences and letters in line\n%s" %line) 901 self._header.database_sequences = _safe_int(sequences) 902 self._header.database_letters = _safe_int(letters)
903
904 - def end_header(self):
905 # Get rid of the trailing newlines 906 self._header.reference = self._header.reference.rstrip() 907 self._header.query = self._header.query.rstrip()
908 909
910 -class _DescriptionConsumer(object):
911 - def start_descriptions(self):
912 self._descriptions = [] 913 self._model_sequences = [] 914 self._nonmodel_sequences = [] 915 self._converged = 0 916 self._type = None 917 self._roundnum = None 918 919 self.__has_n = 0 # Does the description line contain an N value?
920
921 - def description_header(self, line):
922 if line.startswith('Sequences producing'): 923 cols = line.split() 924 if cols[-1] == 'N': 925 self.__has_n = 1
926
927 - def description(self, line):
928 dh = self._parse(line) 929 if self._type == 'model': 930 self._model_sequences.append(dh) 931 elif self._type == 'nonmodel': 932 self._nonmodel_sequences.append(dh) 933 else: 934 self._descriptions.append(dh)
935
936 - def model_sequences(self, line):
937 self._type = 'model'
938
939 - def nonmodel_sequences(self, line):
940 self._type = 'nonmodel'
941
942 - def converged(self, line):
943 self._converged = 1
944
945 - def no_hits(self, line):
946 pass
947
948 - def round(self, line):
949 if not line.startswith('Results from round'): 950 raise ValueError("I didn't understand the round line\n%s" % line) 951 self._roundnum = _safe_int(line[18:].strip())
952
953 - def end_descriptions(self):
954 pass
955
956 - def _parse(self, description_line):
957 line = description_line # for convenience 958 dh = Record.Description() 959 960 # I need to separate the score and p-value from the title. 961 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 962 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 963 # special cases to handle: 964 # - title must be preserved exactly (including whitespaces) 965 # - score could be equal to e-value (not likely, but what if??) 966 # - sometimes there's an "N" score of '1'. 967 cols = line.split() 968 if len(cols) < 3: 969 raise ValueError( 970 "Line does not appear to contain description:\n%s" % line) 971 if self.__has_n: 972 i = line.rfind(cols[-1]) # find start of N 973 i = line.rfind(cols[-2], 0, i) # find start of p-value 974 i = line.rfind(cols[-3], 0, i) # find start of score 975 else: 976 i = line.rfind(cols[-1]) # find start of p-value 977 i = line.rfind(cols[-2], 0, i) # find start of score 978 if self.__has_n: 979 dh.title, dh.score, dh.e, dh.num_alignments = \ 980 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 981 else: 982 dh.title, dh.score, dh.e, dh.num_alignments = \ 983 line[:i].rstrip(), cols[-2], cols[-1], 1 984 dh.num_alignments = _safe_int(dh.num_alignments) 985 dh.score = _safe_int(dh.score) 986 dh.e = _safe_float(dh.e) 987 return dh
988 989
990 -class _AlignmentConsumer(object):
991 # This is a little bit tricky. An alignment can either be a 992 # pairwise alignment or a multiple alignment. Since it's difficult 993 # to know a-priori which one the blast record will contain, I'm going 994 # to make one class that can parse both of them.
995 - def start_alignment(self):
996 self._alignment = Record.Alignment() 997 self._multiple_alignment = Record.MultipleAlignment()
998
999 - def title(self, line):
1000 if self._alignment.title: 1001 self._alignment.title += " " 1002 self._alignment.title += line.strip()
1003
1004 - def length(self, line):
1005 #e.g. "Length = 81" or more recently, "Length=428" 1006 parts = line.replace(" ","").split("=") 1007 assert len(parts)==2, "Unrecognised format length line" 1008 self._alignment.length = parts[1] 1009 self._alignment.length = _safe_int(self._alignment.length)
1010
1011 - def multalign(self, line):
1012 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 1013 if line.startswith('QUERY') or line.startswith('blast_tmp'): 1014 # If this is the first line of the multiple alignment, 1015 # then I need to figure out how the line is formatted. 1016 1017 # Format of line is: 1018 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 1019 try: 1020 name, start, seq, end = line.split() 1021 except ValueError: 1022 raise ValueError("I do not understand the line\n%s" % line) 1023 self._start_index = line.index(start, len(name)) 1024 self._seq_index = line.index(seq, 1025 self._start_index+len(start)) 1026 # subtract 1 for the space 1027 self._name_length = self._start_index - 1 1028 self._start_length = self._seq_index - self._start_index - 1 1029 self._seq_length = line.rfind(end) - self._seq_index - 1 1030 1031 #self._seq_index = line.index(seq) 1032 ## subtract 1 for the space 1033 #self._seq_length = line.rfind(end) - self._seq_index - 1 1034 #self._start_index = line.index(start) 1035 #self._start_length = self._seq_index - self._start_index - 1 1036 #self._name_length = self._start_index 1037 1038 # Extract the information from the line 1039 name = line[:self._name_length] 1040 name = name.rstrip() 1041 start = line[self._start_index:self._start_index+self._start_length] 1042 start = start.rstrip() 1043 if start: 1044 start = _safe_int(start) 1045 end = line[self._seq_index+self._seq_length:].rstrip() 1046 if end: 1047 end = _safe_int(end) 1048 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 1049 # right pad the sequence with spaces if necessary 1050 if len(seq) < self._seq_length: 1051 seq = seq + ' '*(self._seq_length-len(seq)) 1052 1053 # I need to make sure the sequence is aligned correctly with the query. 1054 # First, I will find the length of the query. Then, if necessary, 1055 # I will pad my current sequence with spaces so that they will line 1056 # up correctly. 1057 1058 # Two possible things can happen: 1059 # QUERY 1060 # 504 1061 # 1062 # QUERY 1063 # 403 1064 # 1065 # Sequence 504 will need padding at the end. Since I won't know 1066 # this until the end of the alignment, this will be handled in 1067 # end_alignment. 1068 # Sequence 403 will need padding before being added to the alignment. 1069 1070 align = self._multiple_alignment.alignment # for convenience 1071 align.append((name, start, seq, end))
1072 1073 # This is old code that tried to line up all the sequences 1074 # in a multiple alignment by using the sequence title's as 1075 # identifiers. The problem with this is that BLAST assigns 1076 # different HSP's from the same sequence the same id. Thus, 1077 # in one alignment block, there may be multiple sequences with 1078 # the same id. I'm not sure how to handle this, so I'm not 1079 # going to. 1080 1081 # # If the sequence is the query, then just add it. 1082 # if name == 'QUERY': 1083 # if len(align) == 0: 1084 # align.append((name, start, seq)) 1085 # else: 1086 # aname, astart, aseq = align[0] 1087 # if name != aname: 1088 # raise ValueError, "Query is not the first sequence" 1089 # aseq = aseq + seq 1090 # align[0] = aname, astart, aseq 1091 # else: 1092 # if len(align) == 0: 1093 # raise ValueError, "I could not find the query sequence" 1094 # qname, qstart, qseq = align[0] 1095 # 1096 # # Now find my sequence in the multiple alignment. 1097 # for i in range(1, len(align)): 1098 # aname, astart, aseq = align[i] 1099 # if name == aname: 1100 # index = i 1101 # break 1102 # else: 1103 # # If I couldn't find it, then add a new one. 1104 # align.append((None, None, None)) 1105 # index = len(align)-1 1106 # # Make sure to left-pad it. 1107 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1108 # 1109 # if len(qseq) != len(aseq) + len(seq): 1110 # # If my sequences are shorter than the query sequence, 1111 # # then I will need to pad some spaces to make them line up. 1112 # # Since I've already right padded seq, that means aseq 1113 # # must be too short. 1114 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1115 # aseq = aseq + seq 1116 # if astart is None: 1117 # astart = start 1118 # align[index] = aname, astart, aseq 1119
1120 - def end_alignment(self):
1121 # Remove trailing newlines 1122 if self._alignment: 1123 self._alignment.title = self._alignment.title.rstrip() 1124 1125 # This code is also obsolete. See note above. 1126 # If there's a multiple alignment, I will need to make sure 1127 # all the sequences are aligned. That is, I may need to 1128 # right-pad the sequences. 1129 # if self._multiple_alignment is not None: 1130 # align = self._multiple_alignment.alignment 1131 # seqlen = None 1132 # for i in range(len(align)): 1133 # name, start, seq = align[i] 1134 # if seqlen is None: 1135 # seqlen = len(seq) 1136 # else: 1137 # if len(seq) < seqlen: 1138 # seq = seq + ' '*(seqlen - len(seq)) 1139 # align[i] = name, start, seq 1140 # elif len(seq) > seqlen: 1141 # raise ValueError, \ 1142 # "Sequence %s is longer than the query" % name 1143 1144 # Clean up some variables, if they exist. 1145 try: 1146 del self._seq_index 1147 del self._seq_length 1148 del self._start_index 1149 del self._start_length 1150 del self._name_length 1151 except AttributeError: 1152 pass
1153 1154
1155 -class _HSPConsumer(object):
1156 - def start_hsp(self):
1157 self._hsp = Record.HSP()
1158
1159 - def score(self, line):
1160 self._hsp.bits, self._hsp.score = _re_search( 1161 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1162 "I could not find the score in line\n%s" % line) 1163 self._hsp.score = _safe_float(self._hsp.score) 1164 self._hsp.bits = _safe_float(self._hsp.bits) 1165 1166 x, y = _re_search( 1167 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1168 "I could not find the expect in line\n%s" % line) 1169 if x: 1170 self._hsp.num_alignments = _safe_int(x) 1171 else: 1172 self._hsp.num_alignments = 1 1173 self._hsp.expect = _safe_float(y)
1174
1175 - def identities(self, line):
1176 x, y = _re_search( 1177 r"Identities = (\d+)\/(\d+)", line, 1178 "I could not find the identities in line\n%s" % line) 1179 self._hsp.identities = _safe_int(x), _safe_int(y) 1180 self._hsp.align_length = _safe_int(y) 1181 1182 if 'Positives' in line: 1183 x, y = _re_search( 1184 r"Positives = (\d+)\/(\d+)", line, 1185 "I could not find the positives in line\n%s" % line) 1186 self._hsp.positives = _safe_int(x), _safe_int(y) 1187 assert self._hsp.align_length == _safe_int(y) 1188 1189 if 'Gaps' in line: 1190 x, y = _re_search( 1191 r"Gaps = (\d+)\/(\d+)", line, 1192 "I could not find the gaps in line\n%s" % line) 1193 self._hsp.gaps = _safe_int(x), _safe_int(y) 1194 assert self._hsp.align_length == _safe_int(y)
1195
1196 - def strand(self, line):
1197 self._hsp.strand = _re_search( 1198 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1199 "I could not find the strand in line\n%s" % line)
1200
1201 - def frame(self, line):
1202 # Frame can be in formats: 1203 # Frame = +1 1204 # Frame = +2 / +2 1205 if '/' in line: 1206 self._hsp.frame = _re_search( 1207 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1208 "I could not find the frame in line\n%s" % line) 1209 else: 1210 self._hsp.frame = _re_search( 1211 r"Frame = ([-+][123])", line, 1212 "I could not find the frame in line\n%s" % line)
1213 1214 # Match a space, if one is available. Masahir Ishikawa found a 1215 # case where there's no space between the start and the sequence: 1216 # Query: 100tt 101 1217 # line below modified by Yair Benita, Sep 2004 1218 # Note that the colon is not always present. 2006 1219 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1220
1221 - def query(self, line):
1222 m = self._query_re.search(line) 1223 if m is None: 1224 raise ValueError("I could not find the query in line\n%s" % line) 1225 1226 # line below modified by Yair Benita, Sep 2004. 1227 # added the end attribute for the query 1228 colon, start, seq, end = m.groups() 1229 self._hsp.query = self._hsp.query + seq 1230 if self._hsp.query_start is None: 1231 self._hsp.query_start = _safe_int(start) 1232 1233 # line below added by Yair Benita, Sep 2004. 1234 # added the end attribute for the query 1235 self._hsp.query_end = _safe_int(end) 1236 1237 #Get index for sequence start (regular expression element 3) 1238 self._query_start_index = m.start(3) 1239 self._query_len = len(seq)
1240
1241 - def align(self, line):
1242 seq = line[self._query_start_index:].rstrip() 1243 if len(seq) < self._query_len: 1244 # Make sure the alignment is the same length as the query 1245 seq = seq + ' ' * (self._query_len-len(seq)) 1246 elif len(seq) < self._query_len: 1247 raise ValueError("Match is longer than the query in line\n%s" 1248 % line) 1249 self._hsp.match = self._hsp.match + seq
1250 1251 # To match how we do the query, cache the regular expression. 1252 # Note that the colon is not always present. 1253 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1254
1255 - def sbjct(self, line):
1256 m = self._sbjct_re.search(line) 1257 if m is None: 1258 raise ValueError("I could not find the sbjct in line\n%s" % line) 1259 colon, start, seq, end = m.groups() 1260 #mikep 26/9/00 1261 #On occasion, there is a blast hit with no subject match 1262 #so far, it only occurs with 1-line short "matches" 1263 #I have decided to let these pass as they appear 1264 if not seq.strip(): 1265 seq = ' ' * self._query_len 1266 self._hsp.sbjct = self._hsp.sbjct + seq 1267 if self._hsp.sbjct_start is None: 1268 self._hsp.sbjct_start = _safe_int(start) 1269 1270 self._hsp.sbjct_end = _safe_int(end) 1271 if len(seq) != self._query_len: 1272 raise ValueError( 1273 "QUERY and SBJCT sequence lengths don't match in line\n%s" 1274 % line) 1275 1276 del self._query_start_index # clean up unused variables 1277 del self._query_len
1278
1279 - def end_hsp(self):
1280 pass
1281 1282
1283 -class _DatabaseReportConsumer(object):
1284
1285 - def start_database_report(self):
1286 self._dr = Record.DatabaseReport()
1287
1288 - def database(self, line):
1289 m = re.search(r"Database: (.+)$", line) 1290 if m: 1291 self._dr.database_name.append(m.group(1)) 1292 elif self._dr.database_name: 1293 # This must be a continuation of the previous name. 1294 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1295 line.strip())
1296
1297 - def posted_date(self, line):
1298 self._dr.posted_date.append(_re_search( 1299 r"Posted date:\s*(.+)$", line, 1300 "I could not find the posted date in line\n%s" % line))
1301
1302 - def num_letters_in_database(self, line):
1303 letters, = _get_cols( 1304 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1305 self._dr.num_letters_in_database.append(_safe_int(letters))
1306
1307 - def num_sequences_in_database(self, line):
1308 sequences, = _get_cols( 1309 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1310 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1311
1312 - def ka_params(self, line):
1313 x = line.split() 1314 self._dr.ka_params = map(_safe_float, x)
1315
1316 - def gapped(self, line):
1317 self._dr.gapped = 1
1318
1319 - def ka_params_gap(self, line):
1320 x = line.split() 1321 self._dr.ka_params_gap = map(_safe_float, x)
1322
1323 - def end_database_report(self):
1324 pass
1325 1326
1327 -class _ParametersConsumer(object):
1328 - def start_parameters(self):
1329 self._params = Record.Parameters()
1330
1331 - def matrix(self, line):
1332 self._params.matrix = line[8:].rstrip()
1333
1334 - def gap_penalties(self, line):
1335 x = _get_cols( 1336 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1337 self._params.gap_penalties = map(_safe_float, x)
1338
1339 - def num_hits(self, line):
1340 if '1st pass' in line: 1341 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1342 self._params.num_hits = _safe_int(x) 1343 else: 1344 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1345 self._params.num_hits = _safe_int(x)
1346
1347 - def num_sequences(self, line):
1348 if '1st pass' in line: 1349 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1350 self._params.num_sequences = _safe_int(x) 1351 else: 1352 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1353 self._params.num_sequences = _safe_int(x)
1354
1355 - def num_extends(self, line):
1356 if '1st pass' in line: 1357 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1358 self._params.num_extends = _safe_int(x) 1359 else: 1360 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1361 self._params.num_extends = _safe_int(x)
1362
1363 - def num_good_extends(self, line):
1364 if '1st pass' in line: 1365 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1366 self._params.num_good_extends = _safe_int(x) 1367 else: 1368 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1369 self._params.num_good_extends = _safe_int(x)
1370
1371 - def num_seqs_better_e(self, line):
1372 self._params.num_seqs_better_e, = _get_cols( 1373 line, (-1,), ncols=7, expected={2:"sequences"}) 1374 self._params.num_seqs_better_e = _safe_int( 1375 self._params.num_seqs_better_e)
1376
1377 - def hsps_no_gap(self, line):
1378 self._params.hsps_no_gap, = _get_cols( 1379 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1380 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1381
1382 - def hsps_prelim_gapped(self, line):
1383 self._params.hsps_prelim_gapped, = _get_cols( 1384 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1385 self._params.hsps_prelim_gapped = _safe_int( 1386 self._params.hsps_prelim_gapped)
1387
1388 - def hsps_prelim_gapped_attempted(self, line):
1389 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1390 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1391 self._params.hsps_prelim_gapped_attempted = _safe_int( 1392 self._params.hsps_prelim_gapped_attempted)
1393
1394 - def hsps_gapped(self, line):
1395 self._params.hsps_gapped, = _get_cols( 1396 line, (-1,), ncols=6, expected={3:"gapped"}) 1397 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1398
1399 - def query_length(self, line):
1400 self._params.query_length, = _get_cols( 1401 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1402 self._params.query_length = _safe_int(self._params.query_length)
1403
1404 - def database_length(self, line):
1405 self._params.database_length, = _get_cols( 1406 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1407 self._params.database_length = _safe_int(self._params.database_length)
1408
1409 - def effective_hsp_length(self, line):
1410 self._params.effective_hsp_length, = _get_cols( 1411 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1412 self._params.effective_hsp_length = _safe_int( 1413 self._params.effective_hsp_length)
1414
1415 - def effective_query_length(self, line):
1416 self._params.effective_query_length, = _get_cols( 1417 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1418 self._params.effective_query_length = _safe_int( 1419 self._params.effective_query_length)
1420
1421 - def effective_database_length(self, line):
1422 self._params.effective_database_length, = _get_cols( 1423 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1424 self._params.effective_database_length = _safe_int( 1425 self._params.effective_database_length)
1426
1427 - def effective_search_space(self, line):
1428 self._params.effective_search_space, = _get_cols( 1429 line, (-1,), ncols=4, expected={1:"search"}) 1430 self._params.effective_search_space = _safe_int( 1431 self._params.effective_search_space)
1432
1433 - def effective_search_space_used(self, line):
1434 self._params.effective_search_space_used, = _get_cols( 1435 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1436 self._params.effective_search_space_used = _safe_int( 1437 self._params.effective_search_space_used)
1438
1439 - def frameshift(self, line):
1440 self._params.frameshift = _get_cols( 1441 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1442
1443 - def threshold(self, line):
1444 if line[:2] == "T:": 1445 #Assume its an old stlye line like "T: 123" 1446 self._params.threshold, = _get_cols( 1447 line, (1,), ncols=2, expected={0:"T:"}) 1448 elif line[:28] == "Neighboring words threshold:": 1449 self._params.threshold, = _get_cols( 1450 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1451 else: 1452 raise ValueError("Unrecognised threshold line:\n%s" % line) 1453 self._params.threshold = _safe_int(self._params.threshold)
1454
1455 - def window_size(self, line):
1456 if line[:2] == "A:": 1457 self._params.window_size, = _get_cols( 1458 line, (1,), ncols=2, expected={0:"A:"}) 1459 elif line[:25] == "Window for multiple hits:": 1460 self._params.window_size, = _get_cols( 1461 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1462 else: 1463 raise ValueError("Unrecognised window size line:\n%s" % line) 1464 self._params.window_size = _safe_int(self._params.window_size)
1465
1466 - def dropoff_1st_pass(self, line):
1467 score, bits = _re_search( 1468 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1469 "I could not find the dropoff in line\n%s" % line) 1470 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1471
1472 - def gap_x_dropoff(self, line):
1473 score, bits = _re_search( 1474 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1475 "I could not find the gap dropoff in line\n%s" % line) 1476 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1477
1478 - def gap_x_dropoff_final(self, line):
1479 score, bits = _re_search( 1480 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1481 "I could not find the gap dropoff final in line\n%s" % line) 1482 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1483
1484 - def gap_trigger(self, line):
1485 score, bits = _re_search( 1486 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1487 "I could not find the gap trigger in line\n%s" % line) 1488 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1489
1490 - def blast_cutoff(self, line):
1491 score, bits = _re_search( 1492 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1493 "I could not find the blast cutoff in line\n%s" % line) 1494 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1495
1496 - def end_parameters(self):
1497 pass
1498 1499
1500 -class _BlastConsumer(AbstractConsumer, 1501 _HeaderConsumer, 1502 _DescriptionConsumer, 1503 _AlignmentConsumer, 1504 _HSPConsumer, 1505 _DatabaseReportConsumer, 1506 _ParametersConsumer 1507 ):
1508 # This Consumer is inherits from many other consumer classes that handle 1509 # the actual dirty work. An alternate way to do it is to create objects 1510 # of those classes and then delegate the parsing tasks to them in a 1511 # decorator-type pattern. The disadvantage of that is that the method 1512 # names will need to be resolved in this classes. However, using 1513 # a decorator will retain more control in this class (which may or 1514 # may not be a bad thing). In addition, having each sub-consumer as 1515 # its own object prevents this object's dictionary from being cluttered 1516 # with members and reduces the chance of member collisions.
1517 - def __init__(self):
1518 self.data = None
1519
1520 - def round(self, line):
1521 # Make sure nobody's trying to pass me PSI-BLAST data! 1522 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1523
1524 - def start_header(self):
1525 self.data = Record.Blast() 1526 _HeaderConsumer.start_header(self)
1527
1528 - def end_header(self):
1529 _HeaderConsumer.end_header(self) 1530 self.data.__dict__.update(self._header.__dict__)
1531
1532 - def end_descriptions(self):
1533 self.data.descriptions = self._descriptions
1534
1535 - def end_alignment(self):
1536 _AlignmentConsumer.end_alignment(self) 1537 if self._alignment.hsps: 1538 self.data.alignments.append(self._alignment) 1539 if self._multiple_alignment.alignment: 1540 self.data.multiple_alignment = self._multiple_alignment
1541
1542 - def end_hsp(self):
1543 _HSPConsumer.end_hsp(self) 1544 try: 1545 self._alignment.hsps.append(self._hsp) 1546 except AttributeError: 1547 raise ValueError("Found an HSP before an alignment")
1548
1549 - def end_database_report(self):
1550 _DatabaseReportConsumer.end_database_report(self) 1551 self.data.__dict__.update(self._dr.__dict__)
1552
1553 - def end_parameters(self):
1554 _ParametersConsumer.end_parameters(self) 1555 self.data.__dict__.update(self._params.__dict__)
1556 1557
1558 -class _PSIBlastConsumer(AbstractConsumer, 1559 _HeaderConsumer, 1560 _DescriptionConsumer, 1561 _AlignmentConsumer, 1562 _HSPConsumer, 1563 _DatabaseReportConsumer, 1564 _ParametersConsumer 1565 ):
1566 - def __init__(self):
1567 self.data = None
1568
1569 - def start_header(self):
1570 self.data = Record.PSIBlast() 1571 _HeaderConsumer.start_header(self)
1572
1573 - def end_header(self):
1574 _HeaderConsumer.end_header(self) 1575 self.data.__dict__.update(self._header.__dict__)
1576
1577 - def start_descriptions(self):
1578 self._round = Record.Round() 1579 self.data.rounds.append(self._round) 1580 _DescriptionConsumer.start_descriptions(self)
1581
1582 - def end_descriptions(self):
1583 _DescriptionConsumer.end_descriptions(self) 1584 self._round.number = self._roundnum 1585 if self._descriptions: 1586 self._round.new_seqs.extend(self._descriptions) 1587 self._round.reused_seqs.extend(self._model_sequences) 1588 self._round.new_seqs.extend(self._nonmodel_sequences) 1589 if self._converged: 1590 self.data.converged = 1
1591
1592 - def end_alignment(self):
1593 _AlignmentConsumer.end_alignment(self) 1594 if self._alignment.hsps: 1595 self._round.alignments.append(self._alignment) 1596 if self._multiple_alignment: 1597 self._round.multiple_alignment = self._multiple_alignment
1598
1599 - def end_hsp(self):
1600 _HSPConsumer.end_hsp(self) 1601 try: 1602 self._alignment.hsps.append(self._hsp) 1603 except AttributeError: 1604 raise ValueError("Found an HSP before an alignment")
1605
1606 - def end_database_report(self):
1607 _DatabaseReportConsumer.end_database_report(self) 1608 self.data.__dict__.update(self._dr.__dict__)
1609
1610 - def end_parameters(self):
1611 _ParametersConsumer.end_parameters(self) 1612 self.data.__dict__.update(self._params.__dict__)
1613 1614
1615 -class Iterator(object):
1616 """Iterates over a file of multiple BLAST results. 1617 1618 Methods: 1619 next Return the next record from the stream, or None. 1620 1621 """
1622 - def __init__(self, handle, parser=None):
1623 """__init__(self, handle, parser=None) 1624 1625 Create a new iterator. handle is a file-like object. parser 1626 is an optional Parser object to change the results into another form. 1627 If set to None, then the raw contents of the file will be returned. 1628 1629 """ 1630 try: 1631 handle.readline 1632 except AttributeError: 1633 raise ValueError( 1634 "I expected a file handle or file-like object, got %s" 1635 % type(handle)) 1636 self._uhandle = File.UndoHandle(handle) 1637 self._parser = parser 1638 self._header = []
1639
1640 - def next(self):
1641 """next(self) -> object 1642 1643 Return the next Blast record from the file. If no more records, 1644 return None. 1645 1646 """ 1647 lines = [] 1648 query = False 1649 while 1: 1650 line = self._uhandle.readline() 1651 if not line: 1652 break 1653 # If I've reached the next one, then put the line back and stop. 1654 if lines and (line.startswith('BLAST') 1655 or line.startswith('BLAST', 1) 1656 or line.startswith('<?xml ')): 1657 self._uhandle.saveline(line) 1658 break 1659 # New style files omit the BLAST line to mark a new query: 1660 if line.startswith("Query="): 1661 if not query: 1662 if not self._header: 1663 self._header = lines[:] 1664 query = True 1665 else: 1666 #Start of another record 1667 self._uhandle.saveline(line) 1668 break 1669 lines.append(line) 1670 1671 if query and "BLAST" not in lines[0]: 1672 #Cheat and re-insert the header 1673 #print "-"*50 1674 #print "".join(self._header) 1675 #print "-"*50 1676 #print "".join(lines) 1677 #print "-"*50 1678 lines = self._header + lines 1679 1680 if not lines: 1681 return None 1682 1683 data = ''.join(lines) 1684 if self._parser is not None: 1685 return self._parser.parse(StringIO.StringIO(data)) 1686 return data
1687
1688 - def __iter__(self):
1689 return iter(self.next, None)
1690 1691
1692 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1693 """Execute and retrieve data from standalone BLASTPALL as handles (DEPRECATED). 1694 1695 NOTE - This function is deprecated, you are encouraged to the command 1696 line wrapper Bio.Blast.Applications.BlastallCommandline instead, or 1697 better the BLAST+ command line wrappers in Bio.Blast.Applications. 1698 1699 Execute and retrieve data from blastall. blastcmd is the command 1700 used to launch the 'blastall' executable. program is the blast program 1701 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1702 to search against. infile is the path to the file containing 1703 the sequence to search with. 1704 1705 The return values are two handles, for standard output and standard error. 1706 1707 You may pass more parameters to **keywds to change the behavior of 1708 the search. Otherwise, optional values will be chosen by blastall. 1709 The Blast output is by default in XML format. Use the align_view keyword 1710 for output in a different format. 1711 1712 Scoring 1713 matrix Matrix to use. 1714 gap_open Gap open penalty. 1715 gap_extend Gap extension penalty. 1716 nuc_match Nucleotide match reward. (BLASTN) 1717 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1718 query_genetic_code Genetic code for Query. 1719 db_genetic_code Genetic code for database. (TBLAST[NX]) 1720 1721 Algorithm 1722 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1723 expectation Expectation value cutoff. 1724 wordsize Word size. 1725 strands Query strands to search against database.([T]BLAST[NX]) 1726 keep_hits Number of best hits from a region to keep. 1727 xdrop Dropoff value (bits) for gapped alignments. 1728 hit_extend Threshold for extending hits. 1729 region_length Length of region used to judge hits. 1730 db_length Effective database length. 1731 search_length Effective length of search space. 1732 1733 Processing 1734 filter Filter query sequence for low complexity (with SEG)? T/F 1735 believe_query Believe the query defline. T/F 1736 restrict_gi Restrict search to these GI's. 1737 nprocessors Number of processors to use. 1738 oldengine Force use of old engine T/F 1739 1740 Formatting 1741 html Produce HTML output? T/F 1742 descriptions Number of one-line descriptions. 1743 alignments Number of alignments. 1744 align_view Alignment view. Integer 0-11, 1745 passed as a string or integer. 1746 show_gi Show GI's in deflines? T/F 1747 seqalign_file seqalign file to output. 1748 outfile Output file for report. Filename to write to, if 1749 omitted standard output is used (which you can access 1750 from the returned handles). 1751 """ 1752 1753 _security_check_parameters(keywds) 1754 1755 att2param = { 1756 'matrix' : '-M', 1757 'gap_open' : '-G', 1758 'gap_extend' : '-E', 1759 'nuc_match' : '-r', 1760 'nuc_mismatch' : '-q', 1761 'query_genetic_code' : '-Q', 1762 'db_genetic_code' : '-D', 1763 1764 'gapped' : '-g', 1765 'expectation' : '-e', 1766 'wordsize' : '-W', 1767 'strands' : '-S', 1768 'keep_hits' : '-K', 1769 'xdrop' : '-X', 1770 'hit_extend' : '-f', 1771 'region_length' : '-L', 1772 'db_length' : '-z', 1773 'search_length' : '-Y', 1774 1775 'program' : '-p', 1776 'database' : '-d', 1777 'infile' : '-i', 1778 'filter' : '-F', 1779 'believe_query' : '-J', 1780 'restrict_gi' : '-l', 1781 'nprocessors' : '-a', 1782 'oldengine' : '-V', 1783 1784 'html' : '-T', 1785 'descriptions' : '-v', 1786 'alignments' : '-b', 1787 'align_view' : '-m', 1788 'show_gi' : '-I', 1789 'seqalign_file' : '-O', 1790 'outfile' : '-o', 1791 } 1792 warnings.warn("This function is deprecated; you are encouraged to the command line wrapper Bio.Blast.Applications.BlastallCommandline instead.", BiopythonDeprecationWarning) 1793 from Applications import BlastallCommandline 1794 cline = BlastallCommandline(blastcmd) 1795 cline.set_parameter(att2param['program'], program) 1796 cline.set_parameter(att2param['database'], database) 1797 cline.set_parameter(att2param['infile'], infile) 1798 cline.set_parameter(att2param['align_view'], str(align_view)) 1799 for key, value in keywds.iteritems(): 1800 cline.set_parameter(att2param[key], str(value)) 1801 return _invoke_blast(cline)
1802 1803
1804 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1805 """Execute and retrieve data from standalone BLASTPGP as handles (DEPRECATED). 1806 1807 NOTE - This function is deprecated, you are encouraged to the command 1808 line wrapper Bio.Blast.Applications.BlastpgpCommandline instead, or 1809 better the BLAST+ tool psiblast via the NcbipsiblastCommandline wrapper. 1810 1811 Execute and retrieve data from blastpgp. blastcmd is the command 1812 used to launch the 'blastpgp' executable. database is the path to the 1813 database to search against. infile is the path to the file containing 1814 the sequence to search with. 1815 1816 The return values are two handles, for standard output and standard error. 1817 1818 You may pass more parameters to **keywds to change the behavior of 1819 the search. Otherwise, optional values will be chosen by blastpgp. 1820 The Blast output is by default in XML format. Use the align_view keyword 1821 for output in a different format. 1822 1823 Scoring 1824 matrix Matrix to use. 1825 gap_open Gap open penalty. 1826 gap_extend Gap extension penalty. 1827 window_size Multiple hits window size. 1828 npasses Number of passes. 1829 passes Hits/passes. Integer 0-2. 1830 1831 Algorithm 1832 gapped Whether to do a gapped alignment. T/F 1833 expectation Expectation value cutoff. 1834 wordsize Word size. 1835 keep_hits Number of beset hits from a region to keep. 1836 xdrop Dropoff value (bits) for gapped alignments. 1837 hit_extend Threshold for extending hits. 1838 region_length Length of region used to judge hits. 1839 db_length Effective database length. 1840 search_length Effective length of search space. 1841 nbits_gapping Number of bits to trigger gapping. 1842 pseudocounts Pseudocounts constants for multiple passes. 1843 xdrop_final X dropoff for final gapped alignment. 1844 xdrop_extension Dropoff for blast extensions. 1845 model_threshold E-value threshold to include in multipass model. 1846 required_start Start of required region in query. 1847 required_end End of required region in query. 1848 1849 Processing 1850 XXX should document default values 1851 program The blast program to use. (PHI-BLAST) 1852 filter Filter query sequence for low complexity (with SEG)? T/F 1853 believe_query Believe the query defline? T/F 1854 nprocessors Number of processors to use. 1855 1856 Formatting 1857 html Produce HTML output? T/F 1858 descriptions Number of one-line descriptions. 1859 alignments Number of alignments. 1860 align_view Alignment view. Integer 0-11, 1861 passed as a string or integer. 1862 show_gi Show GI's in deflines? T/F 1863 seqalign_file seqalign file to output. 1864 align_outfile Output file for alignment. 1865 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1866 restart_infile Input file for PSI-BLAST restart. 1867 hit_infile Hit file for PHI-BLAST. 1868 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1869 align_outfile Output file for alignment. Filename to write to, if 1870 omitted standard output is used (which you can access 1871 from the returned handles). 1872 1873 align_infile Input alignment file for PSI-BLAST restart. 1874 """ 1875 1876 warnings.warn("This function is deprecated; you are encouraged to the command line wrapper Bio.Blast.Applications.BlastpgpCommandline instead.", BiopythonDeprecationWarning) 1877 _security_check_parameters(keywds) 1878 1879 att2param = { 1880 'matrix' : '-M', 1881 'gap_open' : '-G', 1882 'gap_extend' : '-E', 1883 'window_size' : '-A', 1884 'npasses' : '-j', 1885 'passes' : '-P', 1886 1887 'gapped' : '-g', 1888 'expectation' : '-e', 1889 'wordsize' : '-W', 1890 'keep_hits' : '-K', 1891 'xdrop' : '-X', 1892 'hit_extend' : '-f', 1893 'region_length' : '-L', 1894 'db_length' : '-Z', 1895 'search_length' : '-Y', 1896 'nbits_gapping' : '-N', 1897 'pseudocounts' : '-c', 1898 'xdrop_final' : '-Z', 1899 'xdrop_extension' : '-y', 1900 'model_threshold' : '-h', 1901 'required_start' : '-S', 1902 'required_end' : '-H', 1903 1904 'program' : '-p', 1905 'database' : '-d', 1906 'infile' : '-i', 1907 'filter' : '-F', 1908 'believe_query' : '-J', 1909 'nprocessors' : '-a', 1910 1911 'html' : '-T', 1912 'descriptions' : '-v', 1913 'alignments' : '-b', 1914 'align_view' : '-m', 1915 'show_gi' : '-I', 1916 'seqalign_file' : '-O', 1917 'align_outfile' : '-o', 1918 'checkpoint_outfile' : '-C', 1919 'restart_infile' : '-R', 1920 'hit_infile' : '-k', 1921 'matrix_outfile' : '-Q', 1922 'align_infile' : '-B', 1923 } 1924 from Applications import BlastpgpCommandline 1925 cline = BlastpgpCommandline(blastcmd) 1926 cline.set_parameter(att2param['database'], database) 1927 cline.set_parameter(att2param['infile'], infile) 1928 cline.set_parameter(att2param['align_view'], str(align_view)) 1929 for key, value in keywds.iteritems(): 1930 cline.set_parameter(att2param[key], str(value)) 1931 return _invoke_blast(cline)
1932 1933
1934 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1935 """Execute and retrieve data from standalone RPS-BLAST as handles (DEPRECATED). 1936 1937 NOTE - This function is deprecated, you are encouraged to the command 1938 line wrapper Bio.Blast.Applications.RpsBlastCommandline instead, or 1939 better the BLAST+ rpsblast wrapper NcbirpsblastCommandline. 1940 1941 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1942 command used to launch the 'rpsblast' executable. database is the path 1943 to the database to search against. infile is the path to the file 1944 containing the sequence to search with. 1945 1946 The return values are two handles, for standard output and standard error. 1947 1948 You may pass more parameters to **keywds to change the behavior of 1949 the search. Otherwise, optional values will be chosen by rpsblast. 1950 1951 Please note that this function will give XML output by default, by 1952 setting align_view to seven (i.e. command line option -m 7). 1953 You should use the NCBIXML.parse() function to read the resulting output. 1954 This is because NCBIStandalone.BlastParser() does not understand the 1955 plain text output format from rpsblast. 1956 1957 WARNING - The following text and associated parameter handling has not 1958 received extensive testing. Please report any errors we might have made... 1959 1960 Algorithm/Scoring 1961 gapped Whether to do a gapped alignment. T/F 1962 multihit 0 for multiple hit (default), 1 for single hit 1963 expectation Expectation value cutoff. 1964 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1965 0 in 'start' refers to the beginning of the sequence 1966 0 in 'stop' refers to the end of the sequence 1967 Default = 0,0 1968 xdrop Dropoff value (bits) for gapped alignments. 1969 xdrop_final X dropoff for final gapped alignment (in bits). 1970 xdrop_extension Dropoff for blast extensions (in bits). 1971 search_length Effective length of search space. 1972 nbits_gapping Number of bits to trigger gapping. 1973 protein Query sequence is protein. T/F 1974 db_length Effective database length. 1975 1976 Processing 1977 filter Filter query sequence for low complexity? T/F 1978 case_filter Use lower case filtering of FASTA sequence T/F, default F 1979 believe_query Believe the query defline. T/F 1980 nprocessors Number of processors to use. 1981 logfile Name of log file to use, default rpsblast.log 1982 1983 Formatting 1984 html Produce HTML output? T/F 1985 descriptions Number of one-line descriptions. 1986 alignments Number of alignments. 1987 align_view Alignment view. Integer 0-11, 1988 passed as a string or integer. 1989 show_gi Show GI's in deflines? T/F 1990 seqalign_file seqalign file to output. 1991 align_outfile Output file for alignment. Filename to write to, if 1992 omitted standard output is used (which you can access 1993 from the returned handles). 1994 """ 1995 1996 warnings.warn("This function is deprecated; you are encouraged to the command line wrapper Bio.Blast.Applications.BlastrpsCommandline instead.", BiopythonDeprecationWarning) 1997 _security_check_parameters(keywds) 1998 1999 att2param = { 2000 'multihit' : '-P', 2001 'gapped' : '-g', 2002 'expectation' : '-e', 2003 'range_restriction' : '-L', 2004 'xdrop' : '-X', 2005 'xdrop_final' : '-Z', 2006 'xdrop_extension' : '-y', 2007 'search_length' : '-Y', 2008 'nbits_gapping' : '-N', 2009 'protein' : '-p', 2010 'db_length' : '-z', 2011 2012 'database' : '-d', 2013 'infile' : '-i', 2014 'filter' : '-F', 2015 'case_filter' : '-U', 2016 'believe_query' : '-J', 2017 'nprocessors' : '-a', 2018 'logfile' : '-l', 2019 2020 'html' : '-T', 2021 'descriptions' : '-v', 2022 'alignments' : '-b', 2023 'align_view' : '-m', 2024 'show_gi' : '-I', 2025 'seqalign_file' : '-O', 2026 'align_outfile' : '-o', 2027 } 2028 2029 from Applications import RpsBlastCommandline 2030 cline = RpsBlastCommandline(blastcmd) 2031 cline.set_parameter(att2param['database'], database) 2032 cline.set_parameter(att2param['infile'], infile) 2033 cline.set_parameter(att2param['align_view'], str(align_view)) 2034 for key, value in keywds.iteritems(): 2035 cline.set_parameter(att2param[key], str(value)) 2036 return _invoke_blast(cline)
2037 2038
2039 -def _re_search(regex, line, error_msg):
2040 m = re.search(regex, line) 2041 if not m: 2042 raise ValueError(error_msg) 2043 return m.groups()
2044 2045
2046 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
2047 cols = line.split() 2048 2049 # Check to make sure number of columns is correct 2050 if ncols is not None and len(cols) != ncols: 2051 raise ValueError("I expected %d columns (got %d) in line\n%s" 2052 % (ncols, len(cols), line)) 2053 2054 # Check to make sure columns contain the correct data 2055 for k in expected: 2056 if cols[k] != expected[k]: 2057 raise ValueError("I expected '%s' in column %d in line\n%s" 2058 % (expected[k], k, line)) 2059 2060 # Construct the answer tuple 2061 results = [] 2062 for c in cols_to_get: 2063 results.append(cols[c]) 2064 return tuple(results)
2065 2066
2067 -def _safe_int(str):
2068 try: 2069 return int(str) 2070 except ValueError: 2071 # Something went wrong. Try to clean up the string. 2072 # Remove all commas from the string 2073 str = str.replace(',', '') 2074 # try again after removing commas. 2075 # Note int() will return a long rather than overflow 2076 try: 2077 return int(str) 2078 except ValueError: 2079 pass 2080 # Call float to handle things like "54.3", note could lose precision, e.g. 2081 # >>> int("5399354557888517312") 2082 # 5399354557888517312 2083 # >>> int(float("5399354557888517312")) 2084 # 5399354557888517120 2085 return int(float(str))
2086 2087
2088 -def _safe_float(str):
2089 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 2090 # float('e-172') does not produce an error on his platform. Thus, 2091 # we need to check the string for this condition. 2092 2093 # Sometimes BLAST leaves of the '1' in front of an exponent. 2094 if str and str[0] in ['E', 'e']: 2095 str = '1' + str 2096 try: 2097 return float(str) 2098 except ValueError: 2099 # Remove all commas from the string 2100 str = str.replace(',', '') 2101 # try again. 2102 return float(str)
2103 2104
2105 -def _invoke_blast(cline):
2106 """Start BLAST and returns handles for stdout and stderr (PRIVATE). 2107 2108 Expects a command line wrapper object from Bio.Blast.Applications 2109 """ 2110 import subprocess 2111 import sys 2112 blast_cmd = cline.program_name 2113 if not os.path.exists(blast_cmd): 2114 raise ValueError("BLAST executable does not exist at %s" % blast_cmd) 2115 #We don't need to supply any piped input, but we setup the 2116 #standard input pipe anyway as a work around for a python 2117 #bug if this is called from a Windows GUI program. For 2118 #details, see http://bugs.python.org/issue1124861 2119 blast_process = subprocess.Popen(str(cline), 2120 stdin=subprocess.PIPE, 2121 stdout=subprocess.PIPE, 2122 stderr=subprocess.PIPE, 2123 universal_newlines=True, 2124 shell=(sys.platform!="win32")) 2125 blast_process.stdin.close() 2126 return blast_process.stdout, blast_process.stderr
2127 2128
2129 -def _security_check_parameters(param_dict):
2130 """Look for any attempt to insert a command into a parameter. 2131 2132 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 2133 2134 Looks for ";" or "&&" in the strings (Unix and Windows syntax 2135 for appending a command line), or ">", "<" or "|" (redirection) 2136 and if any are found raises an exception. 2137 """ 2138 for key, value in param_dict.iteritems(): 2139 str_value = str(value) # Could easily be an int or a float 2140 for bad_str in [";", "&&", ">", "<", "|"]: 2141 if bad_str in str_value: 2142 raise ValueError("Rejecting suspicious argument for %s" % key)
2143 2144
2145 -class _BlastErrorConsumer(_BlastConsumer):
2146 - def __init__(self):
2148
2149 - def noevent(self, line):
2150 if 'Query must be at least wordsize' in line: 2151 raise ShortQueryBlastError("Query must be at least wordsize") 2152 # Now pass the line back up to the superclass. 2153 method = getattr(_BlastConsumer, 'noevent', 2154 _BlastConsumer.__getattr__(self, 'noevent')) 2155 method(line)
2156 2157
2158 -class BlastErrorParser(AbstractParser):
2159 """Attempt to catch and diagnose BLAST errors while parsing. 2160 2161 This utilizes the BlastParser module but adds an additional layer 2162 of complexity on top of it by attempting to diagnose ValueErrors 2163 that may actually indicate problems during BLAST parsing. 2164 2165 Current BLAST problems this detects are: 2166 o LowQualityBlastError - When BLASTing really low quality sequences 2167 (ie. some GenBank entries which are just short streches of a single 2168 nucleotide), BLAST will report an error with the sequence and be 2169 unable to search with this. This will lead to a badly formatted 2170 BLAST report that the parsers choke on. The parser will convert the 2171 ValueError to a LowQualityBlastError and attempt to provide useful 2172 information. 2173 """
2174 - def __init__(self, bad_report_handle = None):
2175 """Initialize a parser that tries to catch BlastErrors. 2176 2177 Arguments: 2178 o bad_report_handle - An optional argument specifying a handle 2179 where bad reports should be sent. This would allow you to save 2180 all of the bad reports to a file, for instance. If no handle <