Package Bio :: Package SearchIO :: Package BlastIO :: Module blast_xml
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.BlastIO.blast_xml

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for BLAST+ XML output formats.""" 
  7  # for more info: http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd 
  8   
  9  import sys 
 10  import re 
 11  import warnings 
 12  from itertools import chain 
 13  from xml.sax.saxutils import XMLGenerator, escape 
 14   
 15  from Bio import BiopythonParserWarning 
 16   
 17   
 18  #For speed try to use cElementTree rather than ElementTree 
 19  try: 
 20      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 21          # Workaround for bug in python 3.0 and 3.1, 
 22          # see http://bugs.python.org/issue9257 
 23          from xml.etree import ElementTree as ElementTree 
 24      else: 
 25          from xml.etree import cElementTree as ElementTree 
 26  except ImportError: 
 27      from xml.etree import ElementTree as ElementTree 
 28   
 29   
 30  from Bio._py3k import _as_bytes, _bytes_to_string, unicode 
 31  _empty_bytes_string = _as_bytes("") 
 32   
 33  from Bio.Alphabet import generic_dna, generic_protein 
 34  from Bio.SearchIO._index import SearchIndexer 
 35  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 36   
 37   
 38  __all__ = ['BlastXmlParser', 'BlastXmlIndexer', 'BlastXmlWriter'] 
 39   
 40   
 41  # element - optional qresult attribute name mapping 
 42  _ELEM_QRESULT_OPT = { 
 43      'Statistics_db-num': ('stat_db_num', int), 
 44      'Statistics_db-len': ('stat_db_len', int), 
 45      'Statistics_eff-space': ('stat_eff_space', float), 
 46      'Statistics_hsp-len': ('stat_hsp_len', int), 
 47      'Statistics_kappa': ('stat_kappa', float), 
 48      'Statistics_lambda': ('stat_lambda', float), 
 49      'Statistics_entropy': ('stat_entropy', float), 
 50  } 
 51  # element - hit attribute name mapping 
 52  _ELEM_HIT = { 
 53      #'Hit_def': ('description', str),   # not set by this dict 
 54      'Hit_accession': ('accession', str), 
 55      'Hit_len': ('seq_len', int), 
 56  } 
 57  # element - hsp attribute name mapping 
 58  _ELEM_HSP = { 
 59      'Hsp_bit-score': ('bitscore', float), 
 60      'Hsp_score': ('bitscore_raw', int), 
 61      'Hsp_evalue': ('evalue', float), 
 62      'Hsp_identity': ('ident_num', int), 
 63      'Hsp_positive': ('pos_num', int), 
 64      'Hsp_gaps': ('gap_num', int), 
 65      'Hsp_density': ('density', float), 
 66  } 
 67  # element - fragment attribute name mapping 
 68  _ELEM_FRAG = { 
 69      'Hsp_query-from': ('query_start', int), 
 70      'Hsp_query-to': ('query_end', int), 
 71      'Hsp_hit-from': ('hit_start', int), 
 72      'Hsp_hit-to': ('hit_end', int), 
 73      'Hsp_query-frame': ('query_frame', int), 
 74      'Hsp_hit-frame': ('hit_frame', int), 
 75      'Hsp_align-len': ('aln_span', int), 
 76      'Hsp_pattern-from': ('pattern_start', int), 
 77      'Hsp_pattern-to': ('pattern_end', int), 
 78      'Hsp_hseq': ('hit', str), 
 79      'Hsp_qseq': ('query', str), 
 80  } 
 81  # dictionary for mapping tag name and meta key name 
 82  _ELEM_META = { 
 83      'BlastOutput_db': ('target', str), 
 84      'BlastOutput_program': ('program', str), 
 85      'BlastOutput_version': ('version', str), 
 86      'BlastOutput_reference': ('reference', str), 
 87      'Parameters_expect': ('param_evalue_threshold', float), 
 88      'Parameters_entrez-query': ('param_entrez_query', str), 
 89      'Parameters_filter': ('param_filter', str), 
 90      'Parameters_gap-extend': ('param_gap_extend', int), 
 91      'Parameters_gap-open': ('param_gap_open', int), 
 92      'Parameters_include': ('param_include', str), 
 93      'Parameters_matrix': ('param_matrix', str), 
 94      'Parameters_pattern': ('param_pattern', str), 
 95      'Parameters_sc-match': ('param_score_match', int), 
 96      'Parameters_sc-mismatch': ('param_score_mismatch', int), 
 97  } 
 98  # these are fallback tags that store information on the first query 
 99  # outside the <Iteration> tag 
100  # only used if query_{ID,def,len} is not found in <Iteration> 
101  # (seen in legacy Blast <2.2.14) 
102  _ELEM_QRESULT_FALLBACK = { 
103      'BlastOutput_query-ID': ('id', str), 
104      'BlastOutput_query-def': ('description', str), 
105      'BlastOutput_query-len': ('len', str), 
106  } 
107  # element-attribute maps, for writing 
108  _WRITE_MAPS = { 
109      'preamble': ( 
110          ('program', 'program'), 
111          ('version', 'version'), 
112          ('reference', 'reference'), 
113          ('db', 'target'), 
114          ('query-ID', 'id'), 
115          ('query-def', 'description'), 
116          ('query-len', 'seq_len'), 
117          ('param', None), 
118      ), 
119      'param': ( 
120          ('matrix', 'param_matrix'), 
121          ('expect', 'param_evalue_threshold'), 
122          ('sc-match', 'param_score_match'), 
123          ('sc-mismatch', 'param_score_mismatch'), 
124          ('gap-open', 'param_gap_open'), 
125          ('gap-extend', 'param_gap_extend'), 
126          ('filter', 'param_filter'), 
127          ('pattern', 'param_pattern'), 
128          ('entrez-query', 'param_entrez_query'), 
129      ), 
130      'qresult': ( 
131          ('query-ID', 'id'), 
132          ('query-def', 'description'), 
133          ('query-len', 'seq_len'), 
134      ), 
135      'stat': ( 
136          ('db-num', 'stat_db_num'), 
137          ('db-len', 'stat_db_len'), 
138          ('hsp-len', 'stat_hsp_len'), 
139          ('eff-space', 'stat_eff_space'), 
140          ('kappa', 'stat_kappa'), 
141          ('lambda', 'stat_lambda'), 
142          ('entropy', 'stat_entropy'), 
143      ), 
144      'hit': ( 
145          ('id', 'id'), 
146          ('def', 'description'), 
147          ('accession', 'accession'), 
148          ('len', 'seq_len'), 
149      ), 
150      'hsp': ( 
151          ('bit-score', 'bitscore'), 
152          ('score', 'bitscore_raw'), 
153          ('evalue', 'evalue'), 
154          ('query-from', 'query_start'), 
155          ('query-to', 'query_end'), 
156          ('hit-from', 'hit_start'), 
157          ('hit-to', 'hit_end'), 
158          ('pattern-from', 'pattern_start'), 
159          ('pattern-to', 'pattern_end'), 
160          ('query-frame', 'query_frame'), 
161          ('hit-frame', 'hit_frame'), 
162          ('identity', 'ident_num'), 
163          ('positive', 'pos_num'), 
164          ('gaps', 'gap_num'), 
165          ('align-len', 'aln_span'), 
166          ('density', 'density'), 
167          ('qseq', 'query'), 
168          ('hseq', 'hit'), 
169          ('midline', None), 
170      ), 
171  } 
172  # optional elements, based on the DTD 
173  _DTD_OPT = ( 
174      'BlastOutput_query-seq', 'BlastOutput_mbstat', 'Iteration_query-def', 
175      'Iteration_query-len', 'Iteration-hits', 'Iteration_stat', 
176      'Iteration_message', 'Parameters_matrix', 'Parameters_include', 
177      'Parameters_sc-match', 'Parameters_sc-mismatch', 'Parameters_filter', 
178      'Parameters_pattern', 'Parameters_entrez-query', 'Hit_hsps', 
179      'Hsp_pattern-from', 'Hsp_pattern-to', 'Hsp_query-frame', 'Hsp_hit-frame', 
180      'Hsp_identity', 'Hsp_positive', 'Hsp_gaps', 'Hsp_align-len', 'Hsp_density', 
181      'Hsp_midline', 
182  ) 
183   
184  # compile RE patterns 
185  _RE_VERSION = re.compile(r'\d+\.\d+\.\d+\+?') 
186   
187   
188 -class BlastXmlParser(object):
189 """Parser for the BLAST XML format""" 190
191 - def __init__(self, handle):
192 self.xml_iter = iter(ElementTree.iterparse(handle, events=('start', 'end'))) 193 self._meta, self._fallback = self._parse_preamble()
194
195 - def __iter__(self):
196 for qresult in self._parse_qresult(): 197 yield qresult
198
199 - def _parse_preamble(self):
200 """Parses all tag data prior to the first query result.""" 201 # dictionary for containing all information prior to the first query 202 meta = {} 203 # dictionary for fallback information 204 fallback = {} 205 206 # parse the preamble part (anything prior to the first result) 207 for event, elem in self.xml_iter: 208 # get the tag values, cast appropriately, store into meta 209 if event == 'end' and elem.tag in _ELEM_META: 210 attr_name, caster = _ELEM_META[elem.tag] 211 212 if caster is not str: 213 meta[attr_name] = caster(elem.text) 214 else: 215 meta[attr_name] = elem.text 216 217 # delete element after we finish parsing it 218 elem.clear() 219 continue 220 # capture fallback values 221 # these are used only if the first <Iteration> does not have any 222 # ID, ref, or len. 223 elif event == 'end' and elem.tag in _ELEM_QRESULT_FALLBACK: 224 attr_name, caster = _ELEM_QRESULT_FALLBACK[elem.tag] 225 226 if caster is not str: 227 fallback[attr_name] = caster(elem.text) 228 else: 229 fallback[attr_name] = elem.text 230 231 elem.clear() 232 continue 233 234 if event == 'start' and elem.tag == 'Iteration': 235 break 236 237 # we only want the version number, sans the program name or date 238 if meta.get('version') is not None: 239 meta['version'] = re.search(_RE_VERSION, 240 meta['version']).group(0) 241 242 return meta, fallback
243
244 - def _parse_qresult(self):
245 """Parses query results.""" 246 # parse the queries 247 for event, qresult_elem in self.xml_iter: 248 # </Iteration> marks the end of a single query 249 # which means we can process it 250 if event == 'end' and qresult_elem.tag == 'Iteration': 251 252 # we'll use the following schema 253 # <!ELEMENT Iteration ( 254 # Iteration_iter-num, 255 # Iteration_query-ID?, 256 # Iteration_query-def?, 257 # Iteration_query-len?, 258 # Iteration_hits?, 259 # Iteration_stat?, 260 # Iteration_message?)> 261 262 # assign query attributes with fallbacks 263 query_id = qresult_elem.findtext('Iteration_query-ID') 264 if query_id is None: 265 query_id = self._fallback['id'] 266 267 query_desc = qresult_elem.findtext('Iteration_query-def') 268 if query_desc is None: 269 query_desc = self._fallback['description'] 270 271 query_len = qresult_elem.findtext('Iteration_query-len') 272 if query_len is None: 273 query_len = self._fallback['len'] 274 275 # handle blast searches against databases with Blast's IDs 276 # 'Query_' marks the beginning of a BLAST+-generated ID, 277 # 'lcl|' marks the beginning of a BLAST legacy-generated ID 278 if query_id.startswith('Query_') or query_id.startswith('lcl|'): 279 # store the Blast-generated query ID 280 blast_query_id = query_id 281 id_desc = query_desc.split(' ', 1) 282 query_id = id_desc[0] 283 try: 284 query_desc = id_desc[1] 285 except IndexError: 286 query_desc = '' 287 else: 288 blast_query_id = '' 289 290 hit_list, key_list = [], [] 291 for hit in self._parse_hit(qresult_elem.find('Iteration_hits'), 292 query_id): 293 if hit: 294 # need to keep track of hit IDs, since there could be duplicates, 295 if hit.id in key_list: 296 warnings.warn("Adding hit with BLAST-generated ID " 297 "%r since hit ID %r is already present " 298 "in query %r. Your BLAST database may contain " 299 "duplicate entries." % 300 (hit._blast_id, hit.id, query_id), BiopythonParserWarning) 301 # fallback to Blast-generated IDs, if the ID is already present 302 # and restore the desc, too 303 hit.description = '%s %s' % (hit.id, hit.description) 304 hit.id = hit._blast_id 305 # and change the hit_id of the HSPs contained 306 for hsp in hit: 307 hsp.hit_id = hit._blast_id 308 else: 309 key_list.append(hit.id) 310 311 hit_list.append(hit) 312 313 # create qresult and assign its attributes 314 qresult = QueryResult(hit_list, query_id) 315 qresult.description = query_desc 316 qresult.seq_len = int(query_len) 317 qresult._blast_id = blast_query_id 318 for key, value in self._meta.items(): 319 setattr(qresult, key, value) 320 321 # statistics are stored in Iteration_stat's 'grandchildren' with the 322 # following DTD 323 # <!ELEMENT Statistics ( 324 # Statistics_db-num, 325 # Statistics_db-len, 326 # Statistics_hsp-len, 327 # Statistics_eff-space, 328 # Statistics_kappa, 329 # Statistics_lambda, 330 # Statistics_entropy)> 331 332 stat_iter_elem = qresult_elem.find('Iteration_stat') 333 if stat_iter_elem is not None: 334 stat_elem = stat_iter_elem.find('Statistics') 335 336 for key, val_info in _ELEM_QRESULT_OPT.items(): 337 value = stat_elem.findtext(key) 338 if value is not None: 339 caster = val_info[1] 340 # recast only if value is not intended to be str 341 if value is not None and caster is not str: 342 value = caster(value) 343 setattr(qresult, val_info[0], value) 344 345 # delete element after we finish parsing it 346 qresult_elem.clear() 347 yield qresult
348
349 - def _parse_hit(self, root_hit_elem, query_id):
350 """Generator that transforms Iteration_hits XML elements into Hit objects. 351 352 Arguments: 353 root_hit_elem -- Element object of the Iteration_hits tag. 354 query_id -- String of QueryResult ID of this Hit 355 356 """ 357 # Hit level processing 358 # Hits are stored in the Iteration_hits tag, with the following 359 # DTD 360 # <!ELEMENT Hit ( 361 # Hit_num, 362 # Hit_id, 363 # Hit_def, 364 # Hit_accession, 365 # Hit_len, 366 # Hit_hsps?)> 367 368 # feed the loop below an empty list so iteration still works 369 if root_hit_elem is None: 370 root_hit_elem = [] 371 372 for hit_elem in root_hit_elem: 373 374 # create empty hit object 375 hit_id = hit_elem.findtext('Hit_id') 376 hit_desc = hit_elem.findtext('Hit_def') 377 # handle blast searches against databases with Blast's IDs 378 if hit_id.startswith('gnl|BL_ORD_ID|'): 379 blast_hit_id = hit_id 380 id_desc = hit_desc.split(' ', 1) 381 hit_id = id_desc[0] 382 try: 383 hit_desc = id_desc[1] 384 except IndexError: 385 hit_desc = '' 386 else: 387 blast_hit_id = '' 388 389 hsps = [hsp for hsp in 390 self._parse_hsp(hit_elem.find('Hit_hsps'), 391 query_id, hit_id)] 392 393 hit = Hit(hsps) 394 hit.description = hit_desc 395 # blast_hit_id is only set if the hit ID is Blast-generated 396 hit._blast_id = blast_hit_id 397 398 for key, val_info in _ELEM_HIT.items(): 399 value = hit_elem.findtext(key) 400 if value is not None: 401 caster = val_info[1] 402 # recast only if value is not intended to be str 403 if value is not None and caster is not str: 404 value = caster(value) 405 setattr(hit, val_info[0], value) 406 407 # delete element after we finish parsing it 408 hit_elem.clear() 409 yield hit
410
411 - def _parse_hsp(self, root_hsp_frag_elem, query_id, hit_id):
412 """Iterator that transforms Hit_hsps XML elements into HSP objects. 413 414 Arguments: 415 root_hsp_frag_elem -- Element object of the Hit_hsps tag. 416 query_id -- Query ID string. 417 hit_id -- Hit ID string. 418 419 """ 420 # Hit_hsps DTD: 421 # <!ELEMENT Hsp ( 422 # Hsp_num, 423 # Hsp_bit-score, 424 # Hsp_score, 425 # Hsp_evalue, 426 # Hsp_query-from, 427 # Hsp_query-to, 428 # Hsp_hit-from, 429 # Hsp_hit-to, 430 # Hsp_pattern-from?, 431 # Hsp_pattern-to?, 432 # Hsp_query-frame?, 433 # Hsp_hit-frame?, 434 # Hsp_identity?, 435 # Hsp_positive?, 436 # Hsp_gaps?, 437 # Hsp_align-len?, 438 # Hsp_density?, 439 # Hsp_qseq, 440 # Hsp_hseq, 441 # Hsp_midline?)> 442 443 # if value is None, feed the loop below an empty list 444 if root_hsp_frag_elem is None: 445 root_hsp_frag_elem = [] 446 447 for hsp_frag_elem in root_hsp_frag_elem: 448 coords = {} # temporary container for coordinates 449 frag = HSPFragment(hit_id, query_id) 450 for key, val_info in _ELEM_FRAG.items(): 451 value = hsp_frag_elem.findtext(key) 452 caster = val_info[1] 453 454 # adjust 'from' and 'to' coordinates to 0-based ones 455 if value is not None: 456 if key.endswith('-from') or key.endswith('-to'): 457 # store coordinates for further processing 458 coords[val_info[0]] = caster(value) 459 continue 460 # recast only if value is not intended to be str 461 elif caster is not str: 462 value = caster(value) 463 setattr(frag, val_info[0], value) 464 465 # set the similarity characters into aln_annotation dict 466 frag.aln_annotation['similarity'] = \ 467 hsp_frag_elem.findtext('Hsp_midline') 468 469 # process coordinates 470 # since 'x-from' could be bigger than 'x-to', we need to figure 471 # out which one is smaller/bigger since 'x_start' is always smaller 472 # than 'x_end' 473 for coord_type in ('query', 'hit', 'pattern'): 474 start_type = coord_type + '_start' 475 end_type = coord_type + '_end' 476 try: 477 start = coords[start_type] 478 end = coords[end_type] 479 except KeyError: 480 continue 481 else: 482 # convert to python range and setattr 483 setattr(frag, start_type, min(start, end) - 1) 484 setattr(frag, end_type, max(start, end)) 485 486 # set alphabet, based on program 487 prog = self._meta.get('program') 488 if prog == 'blastn': 489 frag.alphabet = generic_dna 490 elif prog in ['blastp', 'blastx', 'tblastn', 'tblastx']: 491 frag.alphabet = generic_protein 492 493 hsp = HSP([frag]) 494 for key, val_info in _ELEM_HSP.items(): 495 value = hsp_frag_elem.findtext(key) 496 caster = val_info[1] 497 if value is not None: 498 if caster is not str: 499 value = caster(value) 500 setattr(hsp, val_info[0], value) 501 # delete element after we finish parsing it 502 hsp_frag_elem.clear() 503 yield hsp
504 505
506 -class BlastXmlIndexer(SearchIndexer):
507 """Indexer class for BLAST XML output.""" 508 509 _parser = BlastXmlParser 510 qstart_mark = _as_bytes('<Iteration>') 511 qend_mark = _as_bytes('</Iteration>') 512 block_size = 16384 513
514 - def __init__(self, filename):
515 SearchIndexer.__init__(self, filename) 516 # TODO: better way to do this? 517 iter_obj = self._parser(self._handle) 518 self._meta, self._fallback = iter_obj._meta, iter_obj._fallback
519
520 - def __iter__(self):
521 qstart_mark = self.qstart_mark 522 qend_mark = self.qend_mark 523 blast_id_mark = _as_bytes('Query_') 524 block_size = self.block_size 525 handle = self._handle 526 handle.seek(0) 527 re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)' 528 '</Iteration_query-ID>\s+?<Iteration_query-def>' 529 '(.*?)</Iteration_query-def>')) 530 re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>')) 531 counter = 0 532 533 while True: 534 start_offset = handle.tell() 535 line = handle.readline() 536 if not line: 537 break 538 if qstart_mark not in line: 539 continue 540 # The following requirements are to make supporting BGZF compressed 541 # BLAST XML files simpler (avoids complex offset manipulations): 542 assert line.count(qstart_mark) == 1, "XML without line breaks?" 543 assert line.lstrip().startswith(qstart_mark), line 544 if qend_mark in line: 545 # Should cope with <Iteration>...</Iteration> on one long line 546 block = line 547 else: 548 # Load the rest of this block up to and including </Iteration> 549 block = [line] 550 while line and qend_mark not in line: 551 line = handle.readline() 552 assert qstart_mark not in line, line 553 block.append(line) 554 assert line.rstrip().endswith(qend_mark), line 555 block = _empty_bytes_string.join(block) 556 assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block 557 assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block 558 #Now we have a full <Iteration>...</Iteration> block, find the ID 559 regx = re.search(re_desc, block) 560 try: 561 qstart_desc = regx.group(2) 562 qstart_id = regx.group(1) 563 except AttributeError: 564 # use the fallback values 565 assert re.search(re_desc_end, block) 566 qstart_desc = _as_bytes(self._fallback['description']) 567 qstart_id = _as_bytes(self._fallback['id']) 568 if qstart_id.startswith(blast_id_mark): 569 qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0] 570 yield _bytes_to_string(qstart_id), start_offset, len(block) 571 counter += 1
572
573 - def _parse(self, handle):
574 # overwrites SearchIndexer._parse, since we need to set the meta and 575 # fallback dictionaries to the parser 576 generator = self._parser(handle, **self._kwargs) 577 generator._meta = self._meta 578 generator._fallback = self._fallback 579 return next(iter(generator))
580
581 - def get_raw(self, offset):
582 qend_mark = self.qend_mark 583 handle = self._handle 584 handle.seek(offset) 585 586 qresult_raw = handle.readline() 587 assert qresult_raw.lstrip().startswith(self.qstart_mark) 588 while qend_mark not in qresult_raw: 589 qresult_raw += handle.readline() 590 assert qresult_raw.rstrip().endswith(qend_mark) 591 assert qresult_raw.count(qend_mark) == 1 592 # Note this will include any leading and trailing whitespace, in 593 # general expecting " <Iteration>\n...\n </Iteration>\n" 594 return qresult_raw
595 596
597 -class _BlastXmlGenerator(XMLGenerator):
598 """Event-based XML Generator.""" 599
600 - def __init__(self, out, encoding='utf-8', indent=" ", increment=2):
601 XMLGenerator.__init__(self, out, encoding) 602 # the indentation character 603 self._indent = indent 604 # nest level 605 self._level = 0 606 # how many indentation character should we increment per level 607 self._increment = increment 608 # container for names of tags with children 609 self._parent_stack = [] 610 # determine writer method 611 try: 612 # this should work for all platforms except Jython 613 self.write = self._write 614 except AttributeError: 615 # Jython uses self._out.write 616 self.write = self._out.write
617
618 - def startDocument(self):
619 """Starts the XML document.""" 620 self.write(u'<?xml version="1.0"?>\n' 621 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' 622 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n')
623
624 - def startElement(self, name, attrs={}, children=False):
625 """Starts an XML element. 626 627 Arguments: 628 name -- String of element name. 629 attrs -- Dictionary of element attributes. 630 children -- Boolean, whether the element has children or not. 631 632 """ 633 self.ignorableWhitespace(self._indent * self._level) 634 XMLGenerator.startElement(self, name, attrs)
635
636 - def endElement(self, name):
637 """Ends and XML element of the given name.""" 638 XMLGenerator.endElement(self, name) 639 self.write(u'\n')
640
641 - def startParent(self, name, attrs={}):
642 """Starts an XML element which has children. 643 644 Arguments: 645 name -- String of element name. 646 attrs -- Dictionary of element attributes. 647 648 """ 649 self.startElement(name, attrs, children=True) 650 self._level += self._increment 651 self.write(u'\n') 652 # append the element name, so we can end it later 653 self._parent_stack.append(name)
654
655 - def endParent(self):
656 """Ends an XML element with children.""" 657 # the element to end is the one on top of the stack 658 name = self._parent_stack.pop() 659 self._level -= self._increment 660 self.ignorableWhitespace(self._indent * self._level) 661 self.endElement(name)
662
663 - def startParents(self, *names):
664 """Starts XML elements without children.""" 665 for name in names: 666 self.startParent(name)
667
668 - def endParents(self, num):
669 """Ends XML elements, according to the given number.""" 670 for i in range(num): 671 self.endParent()
672
673 - def simpleElement(self, name, content=None):
674 """Creates an XML element without children with the given content.""" 675 self.startElement(name, attrs={}) 676 if content: 677 self.characters(content) 678 self.endElement(name)
679
680 - def characters(self, content):
681 content = escape(unicode(content)) 682 for a, b in ((u'"', u'&quot;'), (u"'", u'&apos;')): 683 content = content.replace(a, b) 684 self.write(content)
685 686
687 -class BlastXmlWriter(object):
688 """Stream-based BLAST+ XML Writer.""" 689
690 - def __init__(self, handle):
691 self.xml = _BlastXmlGenerator(handle, 'utf-8')
692
693 - def write_file(self, qresults):
694 """Writes the XML contents to the output handle.""" 695 xml = self.xml 696 self.qresult_counter, self.hit_counter, self.hsp_counter, \ 697 self.frag_counter = 0, 0, 0, 0 698 699 # get the first qresult, since the preamble requires its attr values 700 first_qresult = next(qresults) 701 # start the XML document, set the root element, and create the preamble 702 xml.startDocument() 703 xml.startParent('BlastOutput') 704 self._write_preamble(first_qresult) 705 # and write the qresults 706 xml.startParent('BlastOutput_iterations') 707 self._write_qresults(chain([first_qresult], qresults)) 708 xml.endParents(2) 709 xml.endDocument() 710 711 return self.qresult_counter, self.hit_counter, self.hsp_counter, \ 712 self.frag_counter
713
714 - def _write_elem_block(self, block_name, map_name, obj, opt_dict={}):
715 """Writes sibling XML elements. 716 717 Arguments: 718 block_name -- String of common element name prefix. 719 map_name -- Dictionary name to for mapping element and attribute names. 720 obj -- Object whose attribute values will be used. 721 opt_dict -- Dictionary for custom element-attribute mapping. 722 723 """ 724 for elem, attr in _WRITE_MAPS[map_name]: 725 elem = block_name + elem 726 try: 727 content = str(getattr(obj, attr)) 728 except AttributeError: 729 # ensure attrs that is not present is optional 730 assert elem in _DTD_OPT, "Element %r (attribute %r) not " \ 731 "found" % (elem, attr) 732 else: 733 # custom element-attribute mapping, for fallback values 734 if elem in opt_dict: 735 content = opt_dict[elem] 736 self.xml.simpleElement(elem, content)
737
738 - def _write_preamble(self, qresult):
739 """Writes the XML file preamble.""" 740 xml = self.xml 741 742 for elem, attr in _WRITE_MAPS['preamble']: 743 elem = 'BlastOutput_' + elem 744 if elem == 'BlastOutput_param': 745 xml.startParent(elem) 746 self._write_param(qresult) 747 xml.endParent() 748 continue 749 try: 750 content = str(getattr(qresult, attr)) 751 except AttributeError: 752 assert elem in _DTD_OPT, "Element %s (attribute %s) not " \ 753 "found" % (elem, attr) 754 else: 755 if elem == 'BlastOutput_version': 756 content = '%s %s' % (qresult.program.upper(), 757 qresult.version) 758 elif qresult._blast_id: 759 if elem == 'BlastOutput_query-ID': 760 content = qresult._blast_id 761 elif elem == 'BlastOutput_query-def': 762 content = ' '.join([qresult.id, 763 qresult.description]).strip() 764 xml.simpleElement(elem, content)
765
766 - def _write_param(self, qresult):
767 """Writes the parameter block of the preamble.""" 768 xml = self.xml 769 xml.startParent('Parameters') 770 self._write_elem_block('Parameters_', 'param', qresult) 771 xml.endParent()
772
773 - def _write_qresults(self, qresults):
774 """Writes QueryResult objects into iteration elements.""" 775 xml = self.xml 776 777 for num, qresult in enumerate(qresults): 778 xml.startParent('Iteration') 779 xml.simpleElement('Iteration_iter-num', str(num+1)) 780 opt_dict = {} 781 # use custom Iteration_query-ID and Iteration_query-def mapping 782 # if the query has a BLAST-generated ID 783 if qresult._blast_id: 784 opt_dict = { 785 'Iteration_query-ID': qresult._blast_id, 786 'Iteration_query-def': ' '.join([qresult.id, 787 qresult.description]).strip(), 788 } 789 self._write_elem_block('Iteration_', 'qresult', qresult, opt_dict) 790 # the Iteration_hits tag only has children if there are hits 791 if qresult: 792 xml.startParent('Iteration_hits') 793 self._write_hits(qresult.hits) 794 xml.endParent() 795 # otherwise it's a simple element without any contents 796 else: 797 xml.simpleElement('Iteration_hits', '') 798 799 xml.startParents('Iteration_stat', 'Statistics') 800 self._write_elem_block('Statistics_', 'stat', qresult) 801 xml.endParents(2) 802 # there's a message if no hits is present 803 if not qresult: 804 xml.simpleElement('Iteration_message', 'No hits found') 805 self.qresult_counter += 1 806 xml.endParent()
807
808 - def _write_hits(self, hits):
809 """Writes Hit objects.""" 810 xml = self.xml 811 812 for num, hit in enumerate(hits): 813 xml.startParent('Hit') 814 xml.simpleElement('Hit_num', str(num+1)) 815 # use custom hit_id and hit_def mapping if the hit has a 816 # BLAST-generated ID 817 opt_dict = {} 818 if hit._blast_id: 819 opt_dict = { 820 'Hit_id': hit._blast_id, 821 'Hit_def': ' '.join([hit.id, hit.description]).strip(), 822 } 823 self._write_elem_block('Hit_', 'hit', hit, opt_dict) 824 xml.startParent('Hit_hsps') 825 self._write_hsps(hit.hsps) 826 self.hit_counter += 1 827 xml.endParents(2)
828
829 - def _write_hsps(self, hsps):
830 """Writes HSP objects.""" 831 xml = self.xml 832 for num, hsp in enumerate(hsps): 833 xml.startParent('Hsp') 834 xml.simpleElement('Hsp_num', str(num+1)) 835 for elem, attr in _WRITE_MAPS['hsp']: 836 elem = 'Hsp_' + elem 837 try: 838 content = self._adjust_output(hsp, elem, attr) 839 # make sure any elements that is not present is optional 840 # in the DTD 841 except AttributeError: 842 assert elem in _DTD_OPT, "Element %s (attribute %s) not found" \ 843 % (elem, attr) 844 else: 845 xml.simpleElement(elem, str(content)) 846 self.hsp_counter += 1 847 self.frag_counter += len(hsp.fragments) 848 xml.endParent()
849
850 - def _adjust_output(self, hsp, elem, attr):
851 """Adjusts output to mimic native BLAST+ XML as much as possible.""" 852 853 # adjust coordinates 854 if attr in ('query_start', 'query_end', 'hit_start', 'hit_end', 855 'pattern_start', 'pattern_end'): 856 content = getattr(hsp, attr) + 1 857 if '_start' in attr: 858 content = getattr(hsp, attr) + 1 859 else: 860 content = getattr(hsp, attr) 861 862 # adjust for 'from' <--> 'to' flip if it's not a translated search 863 # and frames are different 864 # adapted from /src/algo/blast/format/blastxml_format.cpp#L216 865 if hsp.query_frame != 0 and hsp.hit_frame < 0: 866 if attr == 'hit_start': 867 content = getattr(hsp, 'hit_end') 868 elif attr == 'hit_end': 869 content = getattr(hsp, 'hit_start') + 1 870 871 # for seqrecord objects, we only need the sequence string 872 elif elem in ('Hsp_hseq', 'Hsp_qseq'): 873 content = str(getattr(hsp, attr).seq) 874 elif elem == 'Hsp_midline': 875 content = hsp.aln_annotation['similarity'] 876 elif elem in ('Hsp_evalue', 'Hsp_bit-score'): 877 # adapted from src/algo/blast/format/blastxml_format.cpp#L138-140 878 content = '%.*g' % (6, getattr(hsp, attr)) 879 else: 880 content = getattr(hsp, attr) 881 882 return content
883 884 885 # if not used as a module, run the doctest 886 if __name__ == "__main__": 887 from Bio._utils import run_doctest 888 run_doctest() 889