| Trees | Indices | Help |
|
|---|
|
|
1 # Copyright 1999-2000 by Jeffrey Chang. All rights reserved.
2 # This code is part of the Biopython distribution and governed by its
3 # license. Please see the LICENSE file that should have been included
4 # as part of this package.
5 # Patches by Mike Poidinger to support multiple databases.
6 # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15
7
8 """Code for calling standalone BLAST and parsing plain text output (OBSOLETE).
9
10 Rather than parsing the human readable plain text BLAST output (which seems to
11 change with every update to BLAST), we and the NBCI recommend you parse the
12 XML output instead. The plain text parser in this module still works at the
13 time of writing, but is considered obsolete and updating it to cope with the
14 latest versions of BLAST is not a priority for us.
15
16 This module also provides code to work with the "legacy" standalone version of
17 NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of
18 the same name. These functions are very limited for dealing with the output as
19 files rather than handles, for which the wrappers in Bio.Blast.Applications are
20 prefered. Furthermore, the NCBI themselves regard these command line tools as
21 "legacy", and encourage using the new BLAST+ tools instead. Biopython has
22 wrappers for these under Bio.Blast.Applications (see the tutorial).
23
24 Classes:
25 LowQualityBlastError Except that indicates low quality query sequences.
26 BlastParser Parses output from blast.
27 BlastErrorParser Parses output and tries to diagnose possible errors.
28 PSIBlastParser Parses output from psi-blast.
29 Iterator Iterates over a file of blast results.
30
31 _Scanner Scans output from standalone BLAST.
32 _BlastConsumer Consumes output from blast.
33 _PSIBlastConsumer Consumes output from psi-blast.
34 _HeaderConsumer Consumes header information.
35 _DescriptionConsumer Consumes description information.
36 _AlignmentConsumer Consumes alignment information.
37 _HSPConsumer Consumes hsp information.
38 _DatabaseReportConsumer Consumes database report information.
39 _ParametersConsumer Consumes parameters information.
40
41 Functions:
42 blastall Execute blastall (OBSOLETE).
43 blastpgp Execute blastpgp (OBSOLETE).
44 rpsblast Execute rpsblast (OBSOLETE).
45
46 For calling the BLAST command line tools, we encourage you to use the
47 command line wrappers in Bio.Blast.Applications - the three functions
48 blastall, blastpgp and rpsblast are considered to be obsolete now, and
49 are likely to be deprecated and then removed in future releases.
50 """
51
52 import warnings
53 warnings.warn("The plain text parser in this module still works at the time of writing, but is considered obsolete and updating it to cope with the latest versions of BLAST is not a priority for us.", PendingDeprecationWarning)
54
55 import os
56 import re
57
58 from Bio import File
59 from Bio.ParserSupport import *
60 from Bio.Blast import Record
61 from Bio.Application import _escape_filename
62
64 """Error caused by running a low quality sequence through BLAST.
65
66 When low quality sequences (like GenBank entries containing only
67 stretches of a single nucleotide) are BLASTed, they will result in
68 BLAST generating an error and not being able to perform the BLAST.
69 search. This error should be raised for the BLAST reports produced
70 in this case.
71 """
72 pass
73
75 """Error caused by running a short query sequence through BLAST.
76
77 If the query sequence is too short, BLAST outputs warnings and errors:
78 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
79 [blastall] ERROR: [000.000] AT1G08320: Blast:
80 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
81 done
82
83 This exception is raised when that condition is detected.
84
85 """
86 pass
87
88
90 """Scan BLAST output from blastall or blastpgp.
91
92 Tested with blastall and blastpgp v2.0.10, v2.0.11
93
94 Methods:
95 feed Feed data into the scanner.
96
97 """
99 """S.feed(handle, consumer)
100
101 Feed in a BLAST report for scanning. handle is a file-like
102 object that contains the BLAST report. consumer is a Consumer
103 object that will receive events as the report is scanned.
104
105 """
106 if isinstance(handle, File.UndoHandle):
107 uhandle = handle
108 else:
109 uhandle = File.UndoHandle(handle)
110
111 # Try to fast-forward to the beginning of the blast report.
112 read_and_call_until(uhandle, consumer.noevent, contains='BLAST')
113 # Now scan the BLAST report.
114 self._scan_header(uhandle, consumer)
115 self._scan_rounds(uhandle, consumer)
116 self._scan_database_report(uhandle, consumer)
117 self._scan_parameters(uhandle, consumer)
118
120 # BLASTP 2.0.10 [Aug-26-1999]
121 #
122 #
123 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf
124 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997),
125 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea
126 # programs", Nucleic Acids Res. 25:3389-3402.
127 #
128 # Query= test
129 # (140 letters)
130 #
131 # Database: sdqib40-1.35.seg.fa
132 # 1323 sequences; 223,339 total letters
133 #
134 # ========================================================
135 # This next example is from the online version of Blast,
136 # note there are TWO references, an RID line, and also
137 # the database is BEFORE the query line.
138 # Note there possibleuse of non-ASCII in the author names.
139 # ========================================================
140 #
141 # BLASTP 2.2.15 [Oct-15-2006]
142 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer,
143 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman
144 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of
145 # protein database search programs", Nucleic Acids Res. 25:3389-3402.
146 #
147 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei
148 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and
149 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST
150 # protein database searches with composition-based statistics
151 # and other refinements", Nucleic Acids Res. 29:2994-3005.
152 #
153 # RID: 1166022616-19998-65316425856.BLASTQ1
154 #
155 #
156 # Database: All non-redundant GenBank CDS
157 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples
158 # 4,254,166 sequences; 1,462,033,012 total letters
159 # Query= gi:16127998
160 # Length=428
161 #
162
163 consumer.start_header()
164
165 read_and_call(uhandle, consumer.version, contains='BLAST')
166 read_and_call_while(uhandle, consumer.noevent, blank=1)
167
168 # There might be a <pre> line, for qblast output.
169 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
170
171 # Read the reference(s)
172 while attempt_read_and_call(uhandle,
173 consumer.reference, start='Reference'):
174 # References are normally multiline terminated by a blank line
175 # (or, based on the old code, the RID line)
176 while 1:
177 line = uhandle.readline()
178 if is_blank_line(line):
179 consumer.noevent(line)
180 break
181 elif line.startswith("RID"):
182 break
183 else:
184 #More of the reference
185 consumer.reference(line)
186
187 #Deal with the optional RID: ...
188 read_and_call_while(uhandle, consumer.noevent, blank=1)
189 attempt_read_and_call(uhandle, consumer.reference, start="RID:")
190 read_and_call_while(uhandle, consumer.noevent, blank=1)
191
192 # blastpgp may have a reference for compositional score matrix
193 # adjustment (see Bug 2502):
194 if attempt_read_and_call(
195 uhandle, consumer.reference, start="Reference"):
196 read_and_call_until(uhandle, consumer.reference, blank=1)
197 read_and_call_while(uhandle, consumer.noevent, blank=1)
198
199 # blastpgp has a Reference for composition-based statistics.
200 if attempt_read_and_call(
201 uhandle, consumer.reference, start="Reference"):
202 read_and_call_until(uhandle, consumer.reference, blank=1)
203 read_and_call_while(uhandle, consumer.noevent, blank=1)
204
205 line = uhandle.peekline()
206 assert line.strip() != ""
207 assert not line.startswith("RID:")
208 if line.startswith("Query="):
209 #This is an old style query then database...
210
211 # Read the Query lines and the following blank line.
212 read_and_call(uhandle, consumer.query_info, start='Query=')
213 read_and_call_until(uhandle, consumer.query_info, blank=1)
214 read_and_call_while(uhandle, consumer.noevent, blank=1)
215
216 # Read the database lines and the following blank line.
217 read_and_call_until(uhandle, consumer.database_info, end='total letters')
218 read_and_call(uhandle, consumer.database_info, contains='sequences')
219 read_and_call_while(uhandle, consumer.noevent, blank=1)
220 elif line.startswith("Database:"):
221 #This is a new style database then query...
222 read_and_call_until(uhandle, consumer.database_info, end='total letters')
223 read_and_call(uhandle, consumer.database_info, contains='sequences')
224 read_and_call_while(uhandle, consumer.noevent, blank=1)
225
226 # Read the Query lines and the following blank line.
227 # Or, on BLAST 2.2.22+ there is no blank link - need to spot
228 # the "... Score E" line instead.
229 read_and_call(uhandle, consumer.query_info, start='Query=')
230 #read_and_call_until(uhandle, consumer.query_info, blank=1)
231 while True:
232 line = uhandle.peekline()
233 if not line.strip() : break
234 if "Score E" in line : break
235 #It is more of the query (and its length)
236 read_and_call(uhandle, consumer.query_info)
237 read_and_call_while(uhandle, consumer.noevent, blank=1)
238 else:
239 raise ValueError("Invalid header?")
240
241 consumer.end_header()
242
244 # Scan a bunch of rounds.
245 # Each round begins with either a "Searching......" line
246 # or a 'Score E' line followed by descriptions and alignments.
247 # The email server doesn't give the "Searching....." line.
248 # If there is no 'Searching.....' line then you'll first see a
249 # 'Results from round' line
250
251 while not self._eof(uhandle):
252 line = safe_peekline(uhandle)
253 if (not line.startswith('Searching') and
254 not line.startswith('Results from round') and
255 re.search(r"Score +E", line) is None and
256 line.find('No hits found') == -1):
257 break
258 self._scan_descriptions(uhandle, consumer)
259 self._scan_alignments(uhandle, consumer)
260
262 # Searching..................................................done
263 # Results from round 2
264 #
265 #
266 # Sc
267 # Sequences producing significant alignments: (b
268 # Sequences used in model and found again:
269 #
270 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ...
271 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B...
272 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)]
273 #
274 # Sequences not found previously or not previously below threshold:
275 #
276 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia]
277 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb...
278 #
279
280 # If PSI-BLAST, may also have:
281 #
282 # CONVERGED!
283
284 consumer.start_descriptions()
285
286 # Read 'Searching'
287 # This line seems to be missing in BLASTN 2.1.2 (others?)
288 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
289
290 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here.
291 # If this happens, the handle will yield no more information.
292 if not uhandle.peekline():
293 raise ValueError("Unexpected end of blast report. " + \
294 "Looks suspiciously like a PSI-BLAST crash.")
295
296 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here:
297 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch
298 # [blastall] ERROR: [000.000] AT1G08320: Blast:
299 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas
300 # done
301 # Reported by David Weisman.
302 # Check for these error lines and ignore them for now. Let
303 # the BlastErrorParser deal with them.
304 line = uhandle.peekline()
305 if line.find("ERROR:") != -1 or line.startswith("done"):
306 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
307 read_and_call(uhandle, consumer.noevent, start="done")
308
309 # Check to see if this is PSI-BLAST.
310 # If it is, the 'Searching' line will be followed by:
311 # (version 2.0.10)
312 # Searching.............................
313 # Results from round 2
314 # or (version 2.0.11)
315 # Searching.............................
316 #
317 #
318 # Results from round 2
319
320 # Skip a bunch of blank lines.
321 read_and_call_while(uhandle, consumer.noevent, blank=1)
322 # Check for the results line if it's there.
323 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
324 read_and_call_while(uhandle, consumer.noevent, blank=1)
325
326 # Three things can happen here:
327 # 1. line contains 'Score E'
328 # 2. line contains "No hits found"
329 # 3. no descriptions
330 # The first one begins a bunch of descriptions. The last two
331 # indicates that no descriptions follow, and we should go straight
332 # to the alignments.
333 if not attempt_read_and_call(
334 uhandle, consumer.description_header,
335 has_re=re.compile(r'Score +E')):
336 # Either case 2 or 3. Look for "No hits found".
337 attempt_read_and_call(uhandle, consumer.no_hits,
338 contains='No hits found')
339 try:
340 read_and_call_while(uhandle, consumer.noevent, blank=1)
341 except ValueError, err:
342 if str(err) != "Unexpected end of stream." : raise err
343
344 consumer.end_descriptions()
345 # Stop processing.
346 return
347
348 # Read the score header lines
349 read_and_call(uhandle, consumer.description_header,
350 start='Sequences producing')
351
352 # If PSI-BLAST, read the 'Sequences used in model' line.
353 attempt_read_and_call(uhandle, consumer.model_sequences,
354 start='Sequences used in model')
355 read_and_call_while(uhandle, consumer.noevent, blank=1)
356
357 # In BLAT, rather than a "No hits found" line, we just
358 # get no descriptions (and no alignments). This can be
359 # spotted because the next line is the database block:
360 if safe_peekline(uhandle).startswith(" Database:"):
361 consumer.end_descriptions()
362 # Stop processing.
363 return
364
365 # Read the descriptions and the following blank lines, making
366 # sure that there are descriptions.
367 if not uhandle.peekline().startswith('Sequences not found'):
368 read_and_call_until(uhandle, consumer.description, blank=1)
369 read_and_call_while(uhandle, consumer.noevent, blank=1)
370
371 # If PSI-BLAST, read the 'Sequences not found' line followed
372 # by more descriptions. However, I need to watch out for the
373 # case where there were no sequences not found previously, in
374 # which case there will be no more descriptions.
375 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
376 start='Sequences not found'):
377 # Read the descriptions and the following blank lines.
378 read_and_call_while(uhandle, consumer.noevent, blank=1)
379 l = safe_peekline(uhandle)
380 # Brad -- added check for QUERY. On some PSI-BLAST outputs
381 # there will be a 'Sequences not found' line followed by no
382 # descriptions. Check for this case since the first thing you'll
383 # get is a blank line and then 'QUERY'
384 if not l.startswith('CONVERGED') and l[0] != '>' \
385 and not l.startswith('QUERY'):
386 read_and_call_until(uhandle, consumer.description, blank=1)
387 read_and_call_while(uhandle, consumer.noevent, blank=1)
388
389 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
390 read_and_call_while(uhandle, consumer.noevent, blank=1)
391
392 consumer.end_descriptions()
393
395 if self._eof(uhandle) : return
396
397 # qblast inserts a helpful line here.
398 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS")
399
400 # First, check to see if I'm at the database report.
401 line = safe_peekline(uhandle)
402 if not line:
403 #EOF
404 return
405 elif line.startswith(' Database') or line.startswith("Lambda"):
406 return
407 elif line[0] == '>':
408 # XXX make a better check here between pairwise and masterslave
409 self._scan_pairwise_alignments(uhandle, consumer)
410 else:
411 # XXX put in a check to make sure I'm in a masterslave alignment
412 self._scan_masterslave_alignment(uhandle, consumer)
413
415 while not self._eof(uhandle):
416 line = safe_peekline(uhandle)
417 if line[0] != '>':
418 break
419 self._scan_one_pairwise_alignment(uhandle, consumer)
420
422 if self._eof(uhandle) : return
423 consumer.start_alignment()
424
425 self._scan_alignment_header(uhandle, consumer)
426
427 # Scan a bunch of score/alignment pairs.
428 while 1:
429 if self._eof(uhandle):
430 #Shouldn't have issued that _scan_alignment_header event...
431 break
432 line = safe_peekline(uhandle)
433 if not line.startswith(' Score'):
434 break
435 self._scan_hsp(uhandle, consumer)
436 consumer.end_alignment()
437
439 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus
440 # stearothermophilus]
441 # Length = 81
442 #
443 # Or, more recently with different white space:
444 #
445 # >gi|15799684|ref|NP_285696.1| threonine synthase ...
446 # gi|15829258|ref|NP_308031.1| threonine synthase
447 # ...
448 # Length=428
449 read_and_call(uhandle, consumer.title, start='>')
450 while 1:
451 line = safe_readline(uhandle)
452 if line.lstrip().startswith('Length =') \
453 or line.lstrip().startswith('Length='):
454 consumer.length(line)
455 break
456 elif is_blank_line(line):
457 # Check to make sure I haven't missed the Length line
458 raise ValueError("I missed the Length in an alignment header")
459 consumer.title(line)
460
461 # Older versions of BLAST will have a line with some spaces.
462 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line.
463 if not attempt_read_and_call(uhandle, consumer.noevent,
464 start=' '):
465 read_and_call(uhandle, consumer.noevent, blank=1)
466
468 consumer.start_hsp()
469 self._scan_hsp_header(uhandle, consumer)
470 self._scan_hsp_alignment(uhandle, consumer)
471 consumer.end_hsp()
472
474 # Score = 22.7 bits (47), Expect = 2.5
475 # Identities = 10/36 (27%), Positives = 18/36 (49%)
476 # Strand = Plus / Plus
477 # Frame = +3
478 #
479
480 read_and_call(uhandle, consumer.score, start=' Score')
481 read_and_call(uhandle, consumer.identities, start=' Identities')
482 # BLASTN
483 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
484 # BLASTX, TBLASTN, TBLASTX
485 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
486 read_and_call(uhandle, consumer.noevent, blank=1)
487
489 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF
490 # GRGVS+ TC Y + + V GGG+ + EE L + I R+
491 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG
492 #
493 # Query: 64 AEKILIKR 71
494 # I +K
495 # Sbjct: 70 PNIIQLKD 77
496 #
497
498 while 1:
499 # Blastn adds an extra line filled with spaces before Query
500 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
501 read_and_call(uhandle, consumer.query, start='Query')
502 read_and_call(uhandle, consumer.align, start=' ')
503 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
504 try:
505 read_and_call_while(uhandle, consumer.noevent, blank=1)
506 except ValueError, err:
507 if str(err) != "Unexpected end of stream." : raise err
508 # End of File (well, it looks like it with recent versions
509 # of BLAST for multiple queries after the Iterator class
510 # has broken up the whole file into chunks).
511 break
512 line = safe_peekline(uhandle)
513 # Alignment continues if I see a 'Query' or the spaces for Blastn.
514 if not (line.startswith('Query') or line.startswith(' ')):
515 break
516
518 consumer.start_alignment()
519 while 1:
520 line = safe_readline(uhandle)
521 # Check to see whether I'm finished reading the alignment.
522 # This is indicated by 1) database section, 2) next psi-blast
523 # round, which can also be a 'Results from round' if no
524 # searching line is present
525 # patch by chapmanb
526 if line.startswith('Searching') or \
527 line.startswith('Results from round'):
528 uhandle.saveline(line)
529 break
530 elif line.startswith(' Database'):
531 uhandle.saveline(line)
532 break
533 elif is_blank_line(line):
534 consumer.noevent(line)
535 else:
536 consumer.multalign(line)
537 read_and_call_while(uhandle, consumer.noevent, blank=1)
538 consumer.end_alignment()
539
541 try:
542 line = safe_peekline(uhandle)
543 except ValueError, err:
544 if str(err) != "Unexpected end of stream." : raise err
545 line = ""
546 return not line
547
549 # Database: sdqib40-1.35.seg.fa
550 # Posted date: Nov 1, 1999 4:25 PM
551 # Number of letters in database: 223,339
552 # Number of sequences in database: 1323
553 #
554 # Lambda K H
555 # 0.322 0.133 0.369
556 #
557 # Gapped
558 # Lambda K H
559 # 0.270 0.0470 0.230
560 #
561 ##########################################
562 # Or, more recently Blast 2.2.15 gives less blank lines
563 ##########################################
564 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding
565 # environmental samples
566 # Posted date: Dec 12, 2006 5:51 PM
567 # Number of letters in database: 667,088,753
568 # Number of sequences in database: 2,094,974
569 # Lambda K H
570 # 0.319 0.136 0.395
571 # Gapped
572 # Lambda K H
573 # 0.267 0.0410 0.140
574
575 if self._eof(uhandle) : return
576
577 consumer.start_database_report()
578
579 # Subset of the database(s) listed below
580 # Number of letters searched: 562,618,960
581 # Number of sequences searched: 228,924
582 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
583 read_and_call(uhandle, consumer.noevent, contains="letters")
584 read_and_call(uhandle, consumer.noevent, contains="sequences")
585 read_and_call(uhandle, consumer.noevent, start=" ")
586
587 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that
588 # was missing the "Database" stanza completely.
589 while attempt_read_and_call(uhandle, consumer.database,
590 start=' Database'):
591 # BLAT output ends abruptly here, without any of the other
592 # information. Check to see if this is the case. If so,
593 # then end the database report here gracefully.
594 if not uhandle.peekline().strip() \
595 or uhandle.peekline().startswith("BLAST"):
596 consumer.end_database_report()
597 return
598
599 # Database can span multiple lines.
600 read_and_call_until(uhandle, consumer.database, start=' Posted')
601 read_and_call(uhandle, consumer.posted_date, start=' Posted')
602 read_and_call(uhandle, consumer.num_letters_in_database,
603 start=' Number of letters')
604 read_and_call(uhandle, consumer.num_sequences_in_database,
605 start=' Number of sequences')
606 #There may not be a line starting with spaces...
607 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
608
609 line = safe_readline(uhandle)
610 uhandle.saveline(line)
611 if line.find('Lambda') != -1:
612 break
613
614 read_and_call(uhandle, consumer.noevent, start='Lambda')
615 read_and_call(uhandle, consumer.ka_params)
616
617 #This blank line is optional:
618 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
619
620 # not BLASTP
621 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
622 # not TBLASTX
623 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
624 read_and_call(uhandle, consumer.ka_params_gap)
625
626 # Blast 2.2.4 can sometimes skip the whole parameter section.
627 # Thus, I need to be careful not to read past the end of the
628 # file.
629 try:
630 read_and_call_while(uhandle, consumer.noevent, blank=1)
631 except ValueError, x:
632 if str(x) != "Unexpected end of stream.":
633 raise
634 consumer.end_database_report()
635
637 # Matrix: BLOSUM62
638 # Gap Penalties: Existence: 11, Extension: 1
639 # Number of Hits to DB: 50604
640 # Number of Sequences: 1323
641 # Number of extensions: 1526
642 # Number of successful extensions: 6
643 # Number of sequences better than 10.0: 5
644 # Number of HSP's better than 10.0 without gapping: 5
645 # Number of HSP's successfully gapped in prelim test: 0
646 # Number of HSP's that attempted gapping in prelim test: 1
647 # Number of HSP's gapped (non-prelim): 5
648 # length of query: 140
649 # length of database: 223,339
650 # effective HSP length: 39
651 # effective length of query: 101
652 # effective length of database: 171,742
653 # effective search space: 17345942
654 # effective search space used: 17345942
655 # T: 11
656 # A: 40
657 # X1: 16 ( 7.4 bits)
658 # X2: 38 (14.8 bits)
659 # X3: 64 (24.9 bits)
660 # S1: 41 (21.9 bits)
661 # S2: 42 (20.8 bits)
662 ##########################################
663 # Or, more recently Blast(x) 2.2.15 gives
664 ##########################################
665 # Matrix: BLOSUM62
666 # Gap Penalties: Existence: 11, Extension: 1
667 # Number of Sequences: 4535438
668 # Number of Hits to DB: 2,588,844,100
669 # Number of extensions: 60427286
670 # Number of successful extensions: 126433
671 # Number of sequences better than 2.0: 30
672 # Number of HSP's gapped: 126387
673 # Number of HSP's successfully gapped: 35
674 # Length of query: 291
675 # Length of database: 1,573,298,872
676 # Length adjustment: 130
677 # Effective length of query: 161
678 # Effective length of database: 983,691,932
679 # Effective search space: 158374401052
680 # Effective search space used: 158374401052
681 # Neighboring words threshold: 12
682 # Window for multiple hits: 40
683 # X1: 16 ( 7.3 bits)
684 # X2: 38 (14.6 bits)
685 # X3: 64 (24.7 bits)
686 # S1: 41 (21.7 bits)
687 # S2: 32 (16.9 bits)
688
689
690 # Blast 2.2.4 can sometimes skip the whole parameter section.
691 # BLAT also skips the whole parameter section.
692 # Thus, check to make sure that the parameter section really
693 # exists.
694 if not uhandle.peekline().strip():
695 return
696
697 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and
698 # "Number of Sequences" lines.
699 consumer.start_parameters()
700
701 # Matrix line may be missing in BLASTN 2.2.9
702 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
703 # not TBLASTX
704 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
705
706 attempt_read_and_call(uhandle, consumer.num_sequences,
707 start='Number of Sequences')
708 attempt_read_and_call(uhandle, consumer.num_hits,
709 start='Number of Hits')
710 attempt_read_and_call(uhandle, consumer.num_sequences,
711 start='Number of Sequences')
712 attempt_read_and_call(uhandle, consumer.num_extends,
713 start='Number of extensions')
714 attempt_read_and_call(uhandle, consumer.num_good_extends,
715 start='Number of successful')
716
717 attempt_read_and_call(uhandle, consumer.num_seqs_better_e,
718 start='Number of sequences')
719
720 # not BLASTN, TBLASTX
721 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
722 start="Number of HSP's better"):
723 # BLASTN 2.2.9
724 if attempt_read_and_call(uhandle, consumer.noevent,
725 start="Number of HSP's gapped:"):
726 read_and_call(uhandle, consumer.noevent,
727 start="Number of HSP's successfully")
728 #This is ommitted in 2.2.15
729 attempt_read_and_call(uhandle, consumer.noevent,
730 start="Number of extra gapped extensions")
731 else:
732 read_and_call(uhandle, consumer.hsps_prelim_gapped,
733 start="Number of HSP's successfully")
734 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
735 start="Number of HSP's that")
736 read_and_call(uhandle, consumer.hsps_gapped,
737 start="Number of HSP's gapped")
738 #e.g. BLASTX 2.2.15 where the "better" line is missing
739 elif attempt_read_and_call(uhandle, consumer.noevent,
740 start="Number of HSP's gapped"):
741 read_and_call(uhandle, consumer.noevent,
742 start="Number of HSP's successfully")
743
744 # not in blastx 2.2.1
745 attempt_read_and_call(uhandle, consumer.query_length,
746 has_re=re.compile(r"[Ll]ength of query"))
747 # Not in BLASTX 2.2.22+
748 attempt_read_and_call(uhandle, consumer.database_length,
749 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
750
751 # BLASTN 2.2.9
752 attempt_read_and_call(uhandle, consumer.noevent,
753 start="Length adjustment")
754 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
755 start='effective HSP')
756 # Not in blastx 2.2.1
757 attempt_read_and_call(
758 uhandle, consumer.effective_query_length,
759 has_re=re.compile(r'[Ee]ffective length of query'))
760
761 # This is not in BLASTP 2.2.15
762 attempt_read_and_call(
763 uhandle, consumer.effective_database_length,
764 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
765 # Not in blastx 2.2.1, added a ':' to distinguish between
766 # this and the 'effective search space used' line
767 attempt_read_and_call(
768 uhandle, consumer.effective_search_space,
769 has_re=re.compile(r'[Ee]ffective search space:'))
770 # Does not appear in BLASTP 2.0.5
771 attempt_read_and_call(
772 uhandle, consumer.effective_search_space_used,
773 has_re=re.compile(r'[Ee]ffective search space used'))
774
775 # BLASTX, TBLASTN, TBLASTX
776 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
777
778 # not in BLASTN 2.2.9
779 attempt_read_and_call(uhandle, consumer.threshold, start='T')
780 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12"
781 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
782
783 # not in BLASTX 2.2.15
784 attempt_read_and_call(uhandle, consumer.window_size, start='A')
785 # get this instead: "Window for multiple hits: 40"
786 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
787
788 # not in BLASTX 2.2.22+
789 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
790 # not TBLASTN
791 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
792
793 # not BLASTN, TBLASTX
794 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
795 start='X3')
796
797 # not TBLASTN
798 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1')
799 # not in blastx 2.2.1
800 # first we make sure we have additional lines to work with, if
801 # not then the file is done and we don't have a final S2
802 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
803 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
804
805 consumer.end_parameters()
806
820
834
838
840 c = line.split()
841 self._header.application = c[0]
842 self._header.version = c[1]
843 if len(c) > 2:
844 #The date is missing in the new C++ output from blastx 2.2.22+
845 #Just get "BLASTX 2.2.22+\n" and that's all.
846 self._header.date = c[2][1:-1]
847
849 if line.startswith('Reference: '):
850 self._header.reference = line[11:]
851 else:
852 self._header.reference = self._header.reference + line
853
855 if line.startswith('Query= '):
856 self._header.query = line[7:].lstrip()
857 elif line.startswith('Length='):
858 #New style way to give the query length in BLAST 2.2.22+ (the C++ code)
859 self._header.query_letters = _safe_int(line[7:].strip())
860 elif not line.startswith(' '): # continuation of query_info
861 self._header.query = "%s%s" % (self._header.query, line)
862 else:
863 #Hope it is the old style way to give the query length:
864 letters, = _re_search(
865 r"([0-9,]+) letters", line,
866 "I could not find the number of letters in line\n%s" % line)
867 self._header.query_letters = _safe_int(letters)
868
870 line = line.rstrip()
871 if line.startswith('Database: '):
872 self._header.database = line[10:]
873 elif not line.endswith('total letters'):
874 if self._header.database:
875 #Need to include a space when merging multi line datase descr
876 self._header.database = self._header.database + " " + line.strip()
877 else:
878 self._header.database = line.strip()
879 else:
880 sequences, letters =_re_search(
881 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line,
882 "I could not find the sequences and letters in line\n%s" %line)
883 self._header.database_sequences = _safe_int(sequences)
884 self._header.database_letters = _safe_int(letters)
885
890
893 self._descriptions = []
894 self._model_sequences = []
895 self._nonmodel_sequences = []
896 self._converged = 0
897 self._type = None
898 self._roundnum = None
899
900 self.__has_n = 0 # Does the description line contain an N value?
901
903 if line.startswith('Sequences producing'):
904 cols = line.split()
905 if cols[-1] == 'N':
906 self.__has_n = 1
907
909 dh = self._parse(line)
910 if self._type == 'model':
911 self._model_sequences.append(dh)
912 elif self._type == 'nonmodel':
913 self._nonmodel_sequences.append(dh)
914 else:
915 self._descriptions.append(dh)
916
919
922
925
928
930 if not line.startswith('Results from round'):
931 raise ValueError("I didn't understand the round line\n%s" % line)
932 self._roundnum = _safe_int(line[18:].strip())
933
936
938 line = description_line # for convenience
939 dh = Record.Description()
940
941 # I need to separate the score and p-value from the title.
942 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77
943 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1
944 # special cases to handle:
945 # - title must be preserved exactly (including whitespaces)
946 # - score could be equal to e-value (not likely, but what if??)
947 # - sometimes there's an "N" score of '1'.
948 cols = line.split()
949 if len(cols) < 3:
950 raise ValueError( \
951 "Line does not appear to contain description:\n%s" % line)
952 if self.__has_n:
953 i = line.rfind(cols[-1]) # find start of N
954 i = line.rfind(cols[-2], 0, i) # find start of p-value
955 i = line.rfind(cols[-3], 0, i) # find start of score
956 else:
957 i = line.rfind(cols[-1]) # find start of p-value
958 i = line.rfind(cols[-2], 0, i) # find start of score
959 if self.__has_n:
960 dh.title, dh.score, dh.e, dh.num_alignments = \
961 line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
962 else:
963 dh.title, dh.score, dh.e, dh.num_alignments = \
964 line[:i].rstrip(), cols[-2], cols[-1], 1
965 dh.num_alignments = _safe_int(dh.num_alignments)
966 dh.score = _safe_int(dh.score)
967 dh.e = _safe_float(dh.e)
968 return dh
969
971 # This is a little bit tricky. An alignment can either be a
972 # pairwise alignment or a multiple alignment. Since it's difficult
973 # to know a-priori which one the blast record will contain, I'm going
974 # to make one class that can parse both of them.
978
980 if self._alignment.title:
981 self._alignment.title += " "
982 self._alignment.title += line.strip()
983
985 #e.g. "Length = 81" or more recently, "Length=428"
986 parts = line.replace(" ","").split("=")
987 assert len(parts)==2, "Unrecognised format length line"
988 self._alignment.length = parts[1]
989 self._alignment.length = _safe_int(self._alignment.length)
990
992 # Standalone version uses 'QUERY', while WWW version uses blast_tmp.
993 if line.startswith('QUERY') or line.startswith('blast_tmp'):
994 # If this is the first line of the multiple alignment,
995 # then I need to figure out how the line is formatted.
996
997 # Format of line is:
998 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60
999 try:
1000 name, start, seq, end = line.split()
1001 except ValueError:
1002 raise ValueError("I do not understand the line\n%s" % line)
1003 self._start_index = line.index(start, len(name))
1004 self._seq_index = line.index(seq,
1005 self._start_index+len(start))
1006 # subtract 1 for the space
1007 self._name_length = self._start_index - 1
1008 self._start_length = self._seq_index - self._start_index - 1
1009 self._seq_length = line.rfind(end) - self._seq_index - 1
1010
1011 #self._seq_index = line.index(seq)
1012 ## subtract 1 for the space
1013 #self._seq_length = line.rfind(end) - self._seq_index - 1
1014 #self._start_index = line.index(start)
1015 #self._start_length = self._seq_index - self._start_index - 1
1016 #self._name_length = self._start_index
1017
1018 # Extract the information from the line
1019 name = line[:self._name_length]
1020 name = name.rstrip()
1021 start = line[self._start_index:self._start_index+self._start_length]
1022 start = start.rstrip()
1023 if start:
1024 start = _safe_int(start)
1025 end = line[self._seq_index+self._seq_length:].rstrip()
1026 if end:
1027 end = _safe_int(end)
1028 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
1029 # right pad the sequence with spaces if necessary
1030 if len(seq) < self._seq_length:
1031 seq = seq + ' '*(self._seq_length-len(seq))
1032
1033 # I need to make sure the sequence is aligned correctly with the query.
1034 # First, I will find the length of the query. Then, if necessary,
1035 # I will pad my current sequence with spaces so that they will line
1036 # up correctly.
1037
1038 # Two possible things can happen:
1039 # QUERY
1040 # 504
1041 #
1042 # QUERY
1043 # 403
1044 #
1045 # Sequence 504 will need padding at the end. Since I won't know
1046 # this until the end of the alignment, this will be handled in
1047 # end_alignment.
1048 # Sequence 403 will need padding before being added to the alignment.
1049
1050 align = self._multiple_alignment.alignment # for convenience
1051 align.append((name, start, seq, end))
1052
1053 # This is old code that tried to line up all the sequences
1054 # in a multiple alignment by using the sequence title's as
1055 # identifiers. The problem with this is that BLAST assigns
1056 # different HSP's from the same sequence the same id. Thus,
1057 # in one alignment block, there may be multiple sequences with
1058 # the same id. I'm not sure how to handle this, so I'm not
1059 # going to.
1060
1061 # # If the sequence is the query, then just add it.
1062 # if name == 'QUERY':
1063 # if len(align) == 0:
1064 # align.append((name, start, seq))
1065 # else:
1066 # aname, astart, aseq = align[0]
1067 # if name != aname:
1068 # raise ValueError, "Query is not the first sequence"
1069 # aseq = aseq + seq
1070 # align[0] = aname, astart, aseq
1071 # else:
1072 # if len(align) == 0:
1073 # raise ValueError, "I could not find the query sequence"
1074 # qname, qstart, qseq = align[0]
1075 #
1076 # # Now find my sequence in the multiple alignment.
1077 # for i in range(1, len(align)):
1078 # aname, astart, aseq = align[i]
1079 # if name == aname:
1080 # index = i
1081 # break
1082 # else:
1083 # # If I couldn't find it, then add a new one.
1084 # align.append((None, None, None))
1085 # index = len(align)-1
1086 # # Make sure to left-pad it.
1087 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq))
1088 #
1089 # if len(qseq) != len(aseq) + len(seq):
1090 # # If my sequences are shorter than the query sequence,
1091 # # then I will need to pad some spaces to make them line up.
1092 # # Since I've already right padded seq, that means aseq
1093 # # must be too short.
1094 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq))
1095 # aseq = aseq + seq
1096 # if astart is None:
1097 # astart = start
1098 # align[index] = aname, astart, aseq
1099
1101 # Remove trailing newlines
1102 if self._alignment:
1103 self._alignment.title = self._alignment.title.rstrip()
1104
1105 # This code is also obsolete. See note above.
1106 # If there's a multiple alignment, I will need to make sure
1107 # all the sequences are aligned. That is, I may need to
1108 # right-pad the sequences.
1109 # if self._multiple_alignment is not None:
1110 # align = self._multiple_alignment.alignment
1111 # seqlen = None
1112 # for i in range(len(align)):
1113 # name, start, seq = align[i]
1114 # if seqlen is None:
1115 # seqlen = len(seq)
1116 # else:
1117 # if len(seq) < seqlen:
1118 # seq = seq + ' '*(seqlen - len(seq))
1119 # align[i] = name, start, seq
1120 # elif len(seq) > seqlen:
1121 # raise ValueError, \
1122 # "Sequence %s is longer than the query" % name
1123
1124 # Clean up some variables, if they exist.
1125 try:
1126 del self._seq_index
1127 del self._seq_length
1128 del self._start_index
1129 del self._start_length
1130 del self._name_length
1131 except AttributeError:
1132 pass
1133
1137
1139 self._hsp.bits, self._hsp.score = _re_search(
1140 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
1141 "I could not find the score in line\n%s" % line)
1142 self._hsp.score = _safe_float(self._hsp.score)
1143 self._hsp.bits = _safe_float(self._hsp.bits)
1144
1145 x, y = _re_search(
1146 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
1147 "I could not find the expect in line\n%s" % line)
1148 if x:
1149 self._hsp.num_alignments = _safe_int(x)
1150 else:
1151 self._hsp.num_alignments = 1
1152 self._hsp.expect = _safe_float(y)
1153
1155 x, y = _re_search(
1156 r"Identities = (\d+)\/(\d+)", line,
1157 "I could not find the identities in line\n%s" % line)
1158 self._hsp.identities = _safe_int(x), _safe_int(y)
1159 self._hsp.align_length = _safe_int(y)
1160
1161 if line.find('Positives') != -1:
1162 x, y = _re_search(
1163 r"Positives = (\d+)\/(\d+)", line,
1164 "I could not find the positives in line\n%s" % line)
1165 self._hsp.positives = _safe_int(x), _safe_int(y)
1166 assert self._hsp.align_length == _safe_int(y)
1167
1168 if line.find('Gaps') != -1:
1169 x, y = _re_search(
1170 r"Gaps = (\d+)\/(\d+)", line,
1171 "I could not find the gaps in line\n%s" % line)
1172 self._hsp.gaps = _safe_int(x), _safe_int(y)
1173 assert self._hsp.align_length == _safe_int(y)
1174
1175
1177 self._hsp.strand = _re_search(
1178 r"Strand = (\w+) / (\w+)", line,
1179 "I could not find the strand in line\n%s" % line)
1180
1182 # Frame can be in formats:
1183 # Frame = +1
1184 # Frame = +2 / +2
1185 if line.find('/') != -1:
1186 self._hsp.frame = _re_search(
1187 r"Frame = ([-+][123]) / ([-+][123])", line,
1188 "I could not find the frame in line\n%s" % line)
1189 else:
1190 self._hsp.frame = _re_search(
1191 r"Frame = ([-+][123])", line,
1192 "I could not find the frame in line\n%s" % line)
1193
1194 # Match a space, if one is available. Masahir Ishikawa found a
1195 # case where there's no space between the start and the sequence:
1196 # Query: 100tt 101
1197 # line below modified by Yair Benita, Sep 2004
1198 # Note that the colon is not always present. 2006
1199 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1201 m = self._query_re.search(line)
1202 if m is None:
1203 raise ValueError("I could not find the query in line\n%s" % line)
1204
1205 # line below modified by Yair Benita, Sep 2004.
1206 # added the end attribute for the query
1207 colon, start, seq, end = m.groups()
1208 self._hsp.query = self._hsp.query + seq
1209 if self._hsp.query_start is None:
1210 self._hsp.query_start = _safe_int(start)
1211
1212 # line below added by Yair Benita, Sep 2004.
1213 # added the end attribute for the query
1214 self._hsp.query_end = _safe_int(end)
1215
1216 #Get index for sequence start (regular expression element 3)
1217 self._query_start_index = m.start(3)
1218 self._query_len = len(seq)
1219
1221 seq = line[self._query_start_index:].rstrip()
1222 if len(seq) < self._query_len:
1223 # Make sure the alignment is the same length as the query
1224 seq = seq + ' ' * (self._query_len-len(seq))
1225 elif len(seq) < self._query_len:
1226 raise ValueError("Match is longer than the query in line\n%s" \
1227 % line)
1228 self._hsp.match = self._hsp.match + seq
1229
1230 # To match how we do the query, cache the regular expression.
1231 # Note that the colon is not always present.
1232 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1234 m = self._sbjct_re.search(line)
1235 if m is None:
1236 raise ValueError("I could not find the sbjct in line\n%s" % line)
1237 colon, start, seq, end = m.groups()
1238 #mikep 26/9/00
1239 #On occasion, there is a blast hit with no subject match
1240 #so far, it only occurs with 1-line short "matches"
1241 #I have decided to let these pass as they appear
1242 if not seq.strip():
1243 seq = ' ' * self._query_len
1244 self._hsp.sbjct = self._hsp.sbjct + seq
1245 if self._hsp.sbjct_start is None:
1246 self._hsp.sbjct_start = _safe_int(start)
1247
1248 self._hsp.sbjct_end = _safe_int(end)
1249 if len(seq) != self._query_len:
1250 raise ValueError( \
1251 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1252 % line)
1253
1254 del self._query_start_index # clean up unused variables
1255 del self._query_len
1256
1259
1261
1264
1266 m = re.search(r"Database: (.+)$", line)
1267 if m:
1268 self._dr.database_name.append(m.group(1))
1269 elif self._dr.database_name:
1270 # This must be a continuation of the previous name.
1271 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1],
1272 line.strip())
1273
1275 self._dr.posted_date.append(_re_search(
1276 r"Posted date:\s*(.+)$", line,
1277 "I could not find the posted date in line\n%s" % line))
1278
1280 letters, = _get_cols(
1281 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"})
1282 self._dr.num_letters_in_database.append(_safe_int(letters))
1283
1285 sequences, = _get_cols(
1286 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"})
1287 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1288
1292
1294 self._dr.gapped = 1
1295
1299
1302
1306
1309
1311 x = _get_cols(
1312 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"})
1313 self._params.gap_penalties = map(_safe_float, x)
1314
1316 if line.find('1st pass') != -1:
1317 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1318 self._params.num_hits = _safe_int(x)
1319 else:
1320 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1321 self._params.num_hits = _safe_int(x)
1322
1324 if line.find('1st pass') != -1:
1325 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1326 self._params.num_sequences = _safe_int(x)
1327 else:
1328 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1329 self._params.num_sequences = _safe_int(x)
1330