Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """This module provides code to work with the BLAST XML output 
  8  following the DTD available on the NCBI FTP 
  9  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 10   
 11  Classes: 
 12  BlastParser         Parses XML output from BLAST (direct use discouraged). 
 13                      This (now) returns a list of Blast records. 
 14                      Historically it returned a single Blast record. 
 15                      You are expected to use this via the parse or read functions. 
 16   
 17  _XMLParser          Generic SAX parser (private). 
 18   
 19  Functions: 
 20  parse               Incremental parser, this is an iterator that returns 
 21                      Blast records.  It uses the BlastParser internally. 
 22  read                Returns a single Blast record. Uses the BlastParser internally. 
 23  """ 
 24  from __future__ import print_function 
 25   
 26  from Bio.Blast import Record 
 27  import xml.sax 
 28  from xml.sax.handler import ContentHandler 
 29  from functools import reduce 
 30   
 31   
32 -class _XMLparser(ContentHandler):
33 """Generic SAX Parser 34 35 Just a very basic SAX parser. 36 37 Redefine the methods startElement, characters and endElement. 38 """
39 - def __init__(self, debug=0):
40 """Constructor 41 42 debug - integer, amount of debug information to print 43 """ 44 self._tag = [] 45 self._value = '' 46 self._debug = debug 47 self._debug_ignore_list = []
48
49 - def _secure_name(self, name):
50 """Removes 'dangerous' from tag names 51 52 name -- name to be 'secured' 53 """ 54 # Replace '-' with '_' in XML tag names 55 return name.replace('-', '_')
56
57 - def startElement(self, name, attr):
58 """Found XML start tag 59 60 No real need of attr, BLAST DTD doesn't use them 61 62 name -- name of the tag 63 64 attr -- tag attributes 65 """ 66 self._tag.append(name) 67 68 # Try to call a method (defined in subclasses) 69 method = self._secure_name('_start_' + name) 70 71 #Note could use try / except AttributeError 72 #BUT I found often triggered by nested errors... 73 if hasattr(self, method): 74 eval("self.%s()" % method) 75 if self._debug > 4: 76 print("NCBIXML: Parsed: " + method) 77 elif self._debug > 3: 78 # Doesn't exist (yet) and may want to warn about it 79 if method not in self._debug_ignore_list: 80 print("NCBIXML: Ignored: " + method) 81 self._debug_ignore_list.append(method) 82 83 #We don't care about white space in parent tags like Hsp, 84 #but that white space doesn't belong to child tags like Hsp_midline 85 if self._value.strip(): 86 raise ValueError("What should we do with %s before the %s tag?" 87 % (repr(self._value), name)) 88 self._value = ""
89
90 - def characters(self, ch):
91 """Found some text 92 93 ch -- characters read 94 """ 95 self._value += ch # You don't ever get the whole string
96
97 - def endElement(self, name):
98 """Found XML end tag 99 100 name -- tag name 101 """ 102 # DON'T strip any white space, we may need it e.g. the hsp-midline 103 104 # Try to call a method (defined in subclasses) 105 method = self._secure_name('_end_' + name) 106 #Note could use try / except AttributeError 107 #BUT I found often triggered by nested errors... 108 if hasattr(self, method): 109 eval("self.%s()" % method) 110 if self._debug > 2: 111 print("NCBIXML: Parsed: %s %s" % (method, self._value)) 112 elif self._debug > 1: 113 # Doesn't exist (yet) and may want to warn about it 114 if method not in self._debug_ignore_list: 115 print("NCBIXML: Ignored: %s %s" % (method, self._value)) 116 self._debug_ignore_list.append(method) 117 118 # Reset character buffer 119 self._value = ''
120 121
122 -class BlastParser(_XMLparser):
123 """Parse XML BLAST data into a Record.Blast object 124 125 All XML 'action' methods are private methods and may be: 126 _start_TAG called when the start tag is found 127 _end_TAG called when the end tag is found 128 """ 129
130 - def __init__(self, debug=0):
131 """Constructor 132 133 debug - integer, amount of debug information to print 134 """ 135 # Calling superclass method 136 _XMLparser.__init__(self, debug) 137 138 self._parser = xml.sax.make_parser() 139 self._parser.setContentHandler(self) 140 141 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 142 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 143 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 144 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 145 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 146 147 self.reset()
148
149 - def reset(self):
150 """Reset all the data allowing reuse of the BlastParser() object""" 151 self._records = [] 152 self._header = Record.Header() 153 self._parameters = Record.Parameters() 154 self._parameters.filter = None # Maybe I should update the class?
155
156 - def _start_Iteration(self):
157 self._blast = Record.Blast() 158 pass
159
160 - def _end_Iteration(self):
161 # We stored a lot of generic "top level" information 162 # in self._header (an object of type Record.Header) 163 self._blast.reference = self._header.reference 164 self._blast.date = self._header.date 165 self._blast.version = self._header.version 166 self._blast.database = self._header.database 167 self._blast.application = self._header.application 168 169 # These are required for "old" pre 2.2.14 files 170 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 171 # and <BlastOutput_query-len> were used. Now they 172 # are suplemented/replaced by <Iteration_query-ID>, 173 # <Iteration_query-def> and <Iteration_query-len> 174 if not hasattr(self._blast, "query") \ 175 or not self._blast.query: 176 self._blast.query = self._header.query 177 if not hasattr(self._blast, "query_id") \ 178 or not self._blast.query_id: 179 self._blast.query_id = self._header.query_id 180 if not hasattr(self._blast, "query_letters") \ 181 or not self._blast.query_letters: 182 self._blast.query_letters = self._header.query_letters 183 184 # Hack to record the query length as both the query_letters and 185 # query_length properties (as in the plain text parser, see 186 # Bug 2176 comment 12): 187 self._blast.query_length = self._blast.query_letters 188 # Perhaps in the long term we should deprecate one, but I would 189 # prefer to drop query_letters - so we need a transition period 190 # with both. 191 192 # Hack to record the claimed database size as database_length 193 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 194 self._blast.database_length = self._blast.num_letters_in_database 195 # TODO? Deprecate database_letters next? 196 197 # Hack to record the claimed database sequence count as database_sequences 198 self._blast.database_sequences = self._blast.num_sequences_in_database 199 200 # Apply the "top level" parameter information 201 self._blast.matrix = self._parameters.matrix 202 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 203 self._blast.gap_penalties = self._parameters.gap_penalties 204 self._blast.filter = self._parameters.filter 205 self._blast.expect = self._parameters.expect 206 self._blast.sc_match = self._parameters.sc_match 207 self._blast.sc_mismatch = self._parameters.sc_mismatch 208 209 #Add to the list 210 self._records.append(self._blast) 211 #Clear the object (a new empty one is create in _start_Iteration) 212 self._blast = None 213 214 if self._debug: 215 print("NCBIXML: Added Blast record to results")
216 217 # Header
218 - def _end_BlastOutput_program(self):
219 """BLAST program, e.g., blastp, blastn, etc. 220 221 Save this to put on each blast record object 222 """ 223 self._header.application = self._value.upper()
224
225 - def _end_BlastOutput_version(self):
226 """version number and date of the BLAST engine. 227 228 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 229 variants like "BLASTP 2.2.18+" without the date. 230 231 Save this to put on each blast record object 232 """ 233 parts = self._value.split() 234 #TODO - Check the first word starts with BLAST? 235 236 #The version is the second word (field one) 237 self._header.version = parts[1] 238 239 #Check there is a third word (the date) 240 if len(parts) >= 3: 241 if parts[2][0] == "[" and parts[2][-1] == "]": 242 self._header.date = parts[2][1:-1] 243 else: 244 #Assume this is still a date, but without the 245 #square brackets 246 self._header.date = parts[2]
247
249 """a reference to the article describing the algorithm 250 251 Save this to put on each blast record object 252 """ 253 self._header.reference = self._value
254
255 - def _end_BlastOutput_db(self):
256 """the database(s) searched 257 258 Save this to put on each blast record object 259 """ 260 self._header.database = self._value
261
263 """the identifier of the query 264 265 Important in old pre 2.2.14 BLAST, for recent versions 266 <Iteration_query-ID> is enough 267 """ 268 self._header.query_id = self._value
269
271 """the definition line of the query 272 273 Important in old pre 2.2.14 BLAST, for recent versions 274 <Iteration_query-def> is enough 275 """ 276 self._header.query = self._value
277
279 """the length of the query 280 281 Important in old pre 2.2.14 BLAST, for recent versions 282 <Iteration_query-len> is enough 283 """ 284 self._header.query_letters = int(self._value)
285
286 - def _end_Iteration_query_ID(self):
287 """the identifier of the query 288 """ 289 self._blast.query_id = self._value
290
291 - def _end_Iteration_query_def(self):
292 """the definition line of the query 293 """ 294 self._blast.query = self._value
295
296 - def _end_Iteration_query_len(self):
297 """the length of the query 298 """ 299 self._blast.query_letters = int(self._value)
300 301 ## def _end_BlastOutput_query_seq(self): 302 ## """the query sequence 303 ## """ 304 ## pass # XXX Missing in Record.Blast ? 305 306 ## def _end_BlastOutput_iter_num(self): 307 ## """the psi-blast iteration number 308 ## """ 309 ## pass # XXX TODO PSI 310
311 - def _end_BlastOutput_hits(self):
312 """hits to the database sequences, one for every sequence 313 """ 314 self._blast.num_hits = int(self._value)
315 316 ## def _end_BlastOutput_message(self): 317 ## """error messages 318 ## """ 319 ## pass # XXX What to do ? 320 321 # Parameters
322 - def _end_Parameters_matrix(self):
323 """matrix used (-M) 324 """ 325 self._parameters.matrix = self._value
326
327 - def _end_Parameters_expect(self):
328 """expect values cutoff (-e) 329 """ 330 # NOTE: In old text output there was a line: 331 # Number of sequences better than 1.0e-004: 1 332 # As far as I can see, parameters.num_seqs_better_e 333 # would take the value of 1, and the expectation 334 # value was not recorded. 335 # 336 # Anyway we should NOT record this against num_seqs_better_e 337 self._parameters.expect = self._value
338 339 ## def _end_Parameters_include(self): 340 ## """inclusion threshold for a psi-blast iteration (-h) 341 ## """ 342 ## pass # XXX TODO PSI 343
344 - def _end_Parameters_sc_match(self):
345 """match score for nucleotide-nucleotide comparaison (-r) 346 """ 347 self._parameters.sc_match = int(self._value)
348
350 """mismatch penalty for nucleotide-nucleotide comparaison (-r) 351 """ 352 self._parameters.sc_mismatch = int(self._value)
353
354 - def _end_Parameters_gap_open(self):
355 """gap existence cost (-G) 356 """ 357 self._parameters.gap_penalties = int(self._value)
358
360 """gap extension cose (-E) 361 """ 362 self._parameters.gap_penalties = (self._parameters.gap_penalties, 363 int(self._value))
364
365 - def _end_Parameters_filter(self):
366 """filtering options (-F) 367 """ 368 self._parameters.filter = self._value
369 370 ## def _end_Parameters_pattern(self): 371 ## """pattern used for phi-blast search 372 ## """ 373 ## pass # XXX TODO PSI 374 375 ## def _end_Parameters_entrez_query(self): 376 ## """entrez query used to limit search 377 ## """ 378 ## pass # XXX TODO PSI 379 380 # Hits
381 - def _start_Hit(self):
382 self._blast.alignments.append(Record.Alignment()) 383 self._blast.descriptions.append(Record.Description()) 384 self._blast.multiple_alignment = [] 385 self._hit = self._blast.alignments[-1] 386 self._descr = self._blast.descriptions[-1] 387 self._descr.num_alignments = 0
388
389 - def _end_Hit(self):
390 #Cleanup 391 self._blast.multiple_alignment = None 392 self._hit = None 393 self._descr = None
394
395 - def _end_Hit_id(self):
396 """identifier of the database sequence 397 """ 398 self._hit.hit_id = self._value 399 self._hit.title = self._value + ' '
400
401 - def _end_Hit_def(self):
402 """definition line of the database sequence 403 """ 404 self._hit.hit_def = self._value 405 self._hit.title += self._value 406 self._descr.title = self._hit.title
407
408 - def _end_Hit_accession(self):
409 """accession of the database sequence 410 """ 411 self._hit.accession = self._value 412 self._descr.accession = self._value
413
414 - def _end_Hit_len(self):
415 self._hit.length = int(self._value)
416 417 # HSPs
418 - def _start_Hsp(self):
419 #Note that self._start_Hit() should have been called 420 #to setup things like self._blast.multiple_alignment 421 self._hit.hsps.append(Record.HSP()) 422 self._hsp = self._hit.hsps[-1] 423 self._descr.num_alignments += 1 424 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 425 self._mult_al = self._blast.multiple_alignment[-1]
426 427 # Hsp_num is useless
428 - def _end_Hsp_score(self):
429 """raw score of HSP 430 """ 431 self._hsp.score = float(self._value) 432 if self._descr.score is None: 433 self._descr.score = float(self._value)
434
435 - def _end_Hsp_bit_score(self):
436 """bit score of HSP 437 """ 438 self._hsp.bits = float(self._value) 439 if self._descr.bits is None: 440 self._descr.bits = float(self._value)
441
442 - def _end_Hsp_evalue(self):
443 """expect value of the HSP 444 """ 445 self._hsp.expect = float(self._value) 446 if self._descr.e is None: 447 self._descr.e = float(self._value)
448
449 - def _end_Hsp_query_from(self):
450 """offset of query at the start of the alignment (one-offset) 451 """ 452 self._hsp.query_start = int(self._value)
453
454 - def _end_Hsp_query_to(self):
455 """offset of query at the end of the alignment (one-offset) 456 """ 457 self._hsp.query_end = int(self._value)
458
459 - def _end_Hsp_hit_from(self):
460 """offset of the database at the start of the alignment (one-offset) 461 """ 462 self._hsp.sbjct_start = int(self._value)
463
464 - def _end_Hsp_hit_to(self):
465 """offset of the database at the end of the alignment (one-offset) 466 """ 467 self._hsp.sbjct_end = int(self._value)
468 469 ## def _end_Hsp_pattern_from(self): 470 ## """start of phi-blast pattern on the query (one-offset) 471 ## """ 472 ## pass # XXX TODO PSI 473 474 ## def _end_Hsp_pattern_to(self): 475 ## """end of phi-blast pattern on the query (one-offset) 476 ## """ 477 ## pass # XXX TODO PSI 478
479 - def _end_Hsp_query_frame(self):
480 """frame of the query if applicable 481 """ 482 self._hsp.frame = (int(self._value),)
483
484 - def _end_Hsp_hit_frame(self):
485 """frame of the database sequence if applicable 486 """ 487 self._hsp.frame += (int(self._value),)
488
489 - def _end_Hsp_identity(self):
490 """number of identities in the alignment 491 """ 492 self._hsp.identities = int(self._value)
493
494 - def _end_Hsp_positive(self):
495 """number of positive (conservative) substitutions in the alignment 496 """ 497 self._hsp.positives = int(self._value)
498
499 - def _end_Hsp_gaps(self):
500 """number of gaps in the alignment 501 """ 502 self._hsp.gaps = int(self._value)
503
504 - def _end_Hsp_align_len(self):
505 """length of the alignment 506 """ 507 self._hsp.align_length = int(self._value)
508 509 ## def _en_Hsp_density(self): 510 ## """score density 511 ## """ 512 ## pass # XXX ??? 513
514 - def _end_Hsp_qseq(self):
515 """alignment string for the query 516 """ 517 self._hsp.query = self._value
518
519 - def _end_Hsp_hseq(self):
520 """alignment string for the database 521 """ 522 self._hsp.sbjct = self._value
523
524 - def _end_Hsp_midline(self):
525 """Formatting middle line as normally seen in BLAST report 526 """ 527 self._hsp.match = self._value # do NOT strip spaces! 528 assert len(self._hsp.match)==len(self._hsp.query) 529 assert len(self._hsp.match)==len(self._hsp.sbjct)
530 531 # Statistics
532 - def _end_Statistics_db_num(self):
533 """number of sequences in the database 534 """ 535 self._blast.num_sequences_in_database = int(self._value)
536
537 - def _end_Statistics_db_len(self):
538 """number of letters in the database 539 """ 540 self._blast.num_letters_in_database = int(self._value)
541
542 - def _end_Statistics_hsp_len(self):
543 """the effective HSP length 544 """ 545 self._blast.effective_hsp_length = int(self._value)
546
548 """the effective search space 549 """ 550 self._blast.effective_search_space = float(self._value)
551
552 - def _end_Statistics_kappa(self):
553 """Karlin-Altschul parameter K 554 """ 555 self._blast.ka_params = float(self._value)
556
557 - def _end_Statistics_lambda(self):
558 """Karlin-Altschul parameter Lambda 559 """ 560 self._blast.ka_params = (float(self._value), 561 self._blast.ka_params)
562
563 - def _end_Statistics_entropy(self):
564 """Karlin-Altschul parameter H 565 """ 566 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
567 568
569 -def read(handle, debug=0):
570 """Returns a single Blast record (assumes just one query). 571 572 This function is for use when there is one and only one BLAST 573 result in your XML file. 574 575 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 576 one BLAST record (i.e. if you have more than one query sequence). 577 578 """ 579 iterator = parse(handle, debug) 580 try: 581 first = next(iterator) 582 except StopIteration: 583 first = None 584 if first is None: 585 raise ValueError("No records found in handle") 586 try: 587 second = next(iterator) 588 except StopIteration: 589 second = None 590 if second is not None: 591 raise ValueError("More than one record found in handle") 592 return first
593 594
595 -def parse(handle, debug=0):
596 """Returns an iterator a Blast record for each query. 597 598 handle - file handle to and XML file to parse 599 debug - integer, amount of debug information to print 600 601 This is a generator function that returns multiple Blast records 602 objects - one for each query sequence given to blast. The file 603 is read incrementally, returning complete records as they are read 604 in. 605 606 Should cope with new BLAST 2.2.14+ which gives a single XML file 607 for multiple query records. 608 609 Should also cope with XML output from older versions BLAST which 610 gave multiple XML files concatenated together (giving a single file 611 which strictly speaking wasn't valid XML).""" 612 from xml.parsers import expat 613 BLOCK = 1024 614 MARGIN = 10 # must be at least length of newline + XML start 615 XML_START = "<?xml" 616 617 text = handle.read(BLOCK) 618 pending = "" 619 620 if not text: 621 #NO DATA FOUND! 622 raise ValueError("Your XML file was empty") 623 624 while text: 625 #We are now starting a new XML file 626 if not text.startswith(XML_START): 627 raise ValueError("Your XML file did not start with %s... " 628 "but instead %s" 629 % (XML_START, repr(text[:20]))) 630 631 expat_parser = expat.ParserCreate() 632 blast_parser = BlastParser(debug) 633 expat_parser.StartElementHandler = blast_parser.startElement 634 expat_parser.EndElementHandler = blast_parser.endElement 635 expat_parser.CharacterDataHandler = blast_parser.characters 636 637 expat_parser.Parse(text, False) 638 while blast_parser._records: 639 record = blast_parser._records[0] 640 blast_parser._records = blast_parser._records[1:] 641 yield record 642 643 while True: 644 #Read in another block of the file... 645 text, pending = pending + handle.read(BLOCK), "" 646 if not text: 647 #End of the file! 648 expat_parser.Parse("", True) # End of XML record 649 break 650 651 #Now read a little bit more so we can check for the 652 #start of another XML file... 653 pending = handle.read(MARGIN) 654 655 if ("\n" + XML_START) not in (text + pending): 656 # Good - still dealing with the same XML file 657 expat_parser.Parse(text, False) 658 while blast_parser._records: 659 yield blast_parser._records.pop(0) 660 else: 661 # This is output from pre 2.2.14 BLAST, 662 # one XML file for each query! 663 664 # Finish the old file: 665 text, pending = (text+pending).split("\n" + XML_START, 1) 666 pending = XML_START + pending 667 668 expat_parser.Parse(text, True) # End of XML record 669 while blast_parser._records: 670 yield blast_parser._records.pop(0) 671 672 #Now we are going to re-loop, reset the 673 #parsers and start reading the next XML file 674 text, pending = pending, "" 675 break 676 677 #this was added because it seems that the Jython expat parser 678 #was adding records later then the Python one 679 while blast_parser._records: 680 yield blast_parser._records.pop(0) 681 682 #At this point we have finished the first XML record. 683 #If the file is from an old version of blast, it may 684 #contain more XML records (check if text==""). 685 assert pending=="" 686 assert len(blast_parser._records) == 0 687 688 #We should have finished the file! 689 assert text=="" 690 assert pending=="" 691 assert len(blast_parser._records) == 0
692 693 if __name__ == '__main__': 694 import sys 695 with open(sys.argv[1]) as handle: 696 r_list = parse(handle) 697 698 for r in r_list: 699 # Small test 700 print('Blast of %s' % r.query) 701 print('Found %s alignments with a total of %s HSPs' % (len(r.alignments), 702 reduce(lambda a, b: a+b, 703 [len(a.hsps) for a in r.alignments]))) 704 705 for al in r.alignments: 706 print("%s %i bp %i HSPs" % (al.title[:50], al.length, len(al.hsps))) 707 708 # Cookbook example 709 E_VALUE_THRESH = 0.04 710 for alignment in r.alignments: 711 for hsp in alignment.hsps: 712 if hsp.expect < E_VALUE_THRESH: 713 print('*****') 714 print('sequence %s' % alignment.title) 715 print('length %i' % alignment.length) 716 print('e value %f' % hsp.expect) 717 print(hsp.query[:75] + '...') 718 print(hsp.match[:75] + '...') 719 print(hsp.sbjct[:75] + '...') 720