1
2
3
4
5
6 """Bio.SearchIO parser for BLAST+ XML output formats."""
7
8
9 import sys
10 import re
11 from itertools import chain
12 from xml.sax.saxutils import XMLGenerator, escape
13
14
15
16 try:
17 if (3, 0) <= sys.version_info[:2] <= (3, 1):
18
19
20 from xml.etree import ElementTree as ElementTree
21 else:
22 from xml.etree import cElementTree as ElementTree
23 except ImportError:
24 from xml.etree import ElementTree as ElementTree
25
26
27 from Bio._py3k import _as_bytes, _bytes_to_string
28 _empty_bytes_string = _as_bytes("")
29
30 from Bio.Alphabet import generic_dna, generic_protein
31 from Bio.SearchIO._index import SearchIndexer
32 from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
33
34
35 __all__ = ['BlastXmlParser', 'BlastXmlIndexer', 'BlastXmlWriter']
36
37
38
39 _ELEM_QRESULT_OPT = {
40 'Statistics_db-num': ('stat_db_num', int),
41 'Statistics_db-len': ('stat_db_len', int),
42 'Statistics_eff-space': ('stat_eff_space', float),
43 'Statistics_hsp-len': ('stat_hsp_len', int),
44 'Statistics_kappa': ('stat_kappa', float),
45 'Statistics_lambda': ('stat_lambda', float),
46 'Statistics_entropy': ('stat_entropy', float),
47 }
48
49 _ELEM_HIT = {
50
51 'Hit_accession': ('accession', str),
52 'Hit_len': ('seq_len', int),
53 }
54
55 _ELEM_HSP = {
56 'Hsp_bit-score': ('bitscore', float),
57 'Hsp_score': ('bitscore_raw', int),
58 'Hsp_evalue': ('evalue', float),
59 'Hsp_identity': ('ident_num', int),
60 'Hsp_positive': ('pos_num', int),
61 'Hsp_gaps': ('gap_num', int),
62 'Hsp_density': ('density', float),
63 }
64
65 _ELEM_FRAG = {
66 'Hsp_query-from': ('query_start', int),
67 'Hsp_query-to': ('query_end', int),
68 'Hsp_hit-from': ('hit_start', int),
69 'Hsp_hit-to': ('hit_end', int),
70 'Hsp_query-frame': ('query_frame', int),
71 'Hsp_hit-frame': ('hit_frame', int),
72 'Hsp_align-len': ('aln_span', int),
73 'Hsp_pattern-from': ('pattern_start', int),
74 'Hsp_pattern-to': ('pattern_end', int),
75 'Hsp_hseq': ('hit', str),
76 'Hsp_qseq': ('query', str),
77 }
78
79 _ELEM_META = {
80 'BlastOutput_db': ('target', str),
81 'BlastOutput_program': ('program', str),
82 'BlastOutput_version': ('version', str),
83 'BlastOutput_reference': ('reference', str),
84 'Parameters_expect': ('param_evalue_threshold', float),
85 'Parameters_entrez-query': ('param_entrez_query', str),
86 'Parameters_filter': ('param_filter', str),
87 'Parameters_gap-extend': ('param_gap_extend', int),
88 'Parameters_gap-open': ('param_gap_open', int),
89 'Parameters_include': ('param_include', str),
90 'Parameters_matrix': ('param_matrix', str),
91 'Parameters_pattern': ('param_pattern', str),
92 'Parameters_sc-match': ('param_score_match', int),
93 'Parameters_sc-mismatch': ('param_score_mismatch', int),
94 }
95
96
97
98
99 _ELEM_QRESULT_FALLBACK = {
100 'BlastOutput_query-ID': ('id', str),
101 'BlastOutput_query-def': ('description', str),
102 'BlastOutput_query-len': ('len', str),
103 }
104
105 _WRITE_MAPS = {
106 'preamble': (
107 ('program', 'program'),
108 ('version', 'version'),
109 ('reference', 'reference'),
110 ('db', 'target'),
111 ('query-ID', 'id'),
112 ('query-def', 'description'),
113 ('query-len', 'seq_len'),
114 ('param', None),
115 ),
116 'param': (
117 ('matrix', 'param_matrix'),
118 ('expect', 'param_evalue_threshold'),
119 ('sc-match', 'param_score_match'),
120 ('sc-mismatch', 'param_score_mismatch'),
121 ('gap-open', 'param_gap_open'),
122 ('gap-extend', 'param_gap_extend'),
123 ('filter', 'param_filter'),
124 ('pattern', 'param_pattern'),
125 ('entrez-query', 'param_entrez_query'),
126 ),
127 'qresult': (
128 ('query-ID', 'id'),
129 ('query-def', 'description'),
130 ('query-len', 'seq_len'),
131 ),
132 'stat': (
133 ('db-num', 'stat_db_num'),
134 ('db-len', 'stat_db_len'),
135 ('hsp-len', 'stat_hsp_len'),
136 ('eff-space', 'stat_eff_space'),
137 ('kappa', 'stat_kappa'),
138 ('lambda', 'stat_lambda'),
139 ('entropy', 'stat_entropy'),
140 ),
141 'hit': (
142 ('id', 'id'),
143 ('def', 'description'),
144 ('accession', 'accession'),
145 ('len', 'seq_len'),
146 ),
147 'hsp': (
148 ('bit-score', 'bitscore'),
149 ('score', 'bitscore_raw'),
150 ('evalue', 'evalue'),
151 ('query-from', 'query_start'),
152 ('query-to', 'query_end'),
153 ('hit-from', 'hit_start'),
154 ('hit-to', 'hit_end'),
155 ('pattern-from', 'pattern_start'),
156 ('pattern-to', 'pattern_end'),
157 ('query-frame', 'query_frame'),
158 ('hit-frame', 'hit_frame'),
159 ('identity', 'ident_num'),
160 ('positive', 'pos_num'),
161 ('gaps', 'gap_num'),
162 ('align-len', 'aln_span'),
163 ('density', 'density'),
164 ('qseq', 'query'),
165 ('hseq', 'hit'),
166 ('midline', None),
167 ),
168 }
169
170 _DTD_OPT = (
171 'BlastOutput_query-seq', 'BlastOutput_mbstat', 'Iteration_query-def',
172 'Iteration_query-len', 'Iteration-hits', 'Iteration_stat',
173 'Iteration_message', 'Parameters_matrix', 'Parameters_include',
174 'Parameters_sc-match', 'Parameters_sc-mismatch', 'Parameters_filter',
175 'Parameters_pattern', 'Parameters_entrez-query', 'Hit_hsps',
176 'Hsp_pattern-from', 'Hsp_pattern-to', 'Hsp_query-frame', 'Hsp_hit-frame',
177 'Hsp_identity', 'Hsp_positive', 'Hsp_gaps', 'Hsp_align-len', 'Hsp_density',
178 'Hsp_midline',
179 )
180
181
182 _RE_VERSION = re.compile(r'\d+\.\d+\.\d+\+?')
183
184
186 """Parser for the BLAST XML format"""
187
189 self.xml_iter = iter(ElementTree.iterparse(handle, events=('start', 'end')))
190 self._meta, self._fallback = self._parse_preamble()
191
195
197 """Parses all tag data prior to the first query result."""
198
199 meta = {}
200
201 fallback = {}
202
203
204 for event, elem in self.xml_iter:
205
206 if event == 'end' and elem.tag in _ELEM_META:
207 attr_name, caster = _ELEM_META[elem.tag]
208
209 if caster is not str:
210 meta[attr_name] = caster(elem.text)
211 else:
212 meta[attr_name] = elem.text
213
214
215 elem.clear()
216 continue
217
218
219
220 elif event == 'end' and elem.tag in _ELEM_QRESULT_FALLBACK:
221 attr_name, caster = _ELEM_QRESULT_FALLBACK[elem.tag]
222
223 if caster is not str:
224 fallback[attr_name] = caster(elem.text)
225 else:
226 fallback[attr_name] = elem.text
227
228 elem.clear()
229 continue
230
231 if event == 'start' and elem.tag == 'Iteration':
232 break
233
234
235 if meta.get('version') is not None:
236 meta['version'] = re.search(_RE_VERSION,
237 meta['version']).group(0)
238
239 return meta, fallback
240
242 """Parses query results."""
243
244 for event, qresult_elem in self.xml_iter:
245
246
247 if event == 'end' and qresult_elem.tag == 'Iteration':
248
249
250
251
252
253
254
255
256
257
258
259
260 query_id = qresult_elem.findtext('Iteration_query-ID')
261 if query_id is None:
262 query_id = self._fallback['id']
263
264 query_desc = qresult_elem.findtext('Iteration_query-def')
265 if query_desc is None:
266 query_desc = self._fallback['description']
267
268 query_len = qresult_elem.findtext('Iteration_query-len')
269 if query_len is None:
270 query_len = self._fallback['len']
271
272
273
274
275 if query_id.startswith('Query_') or query_id.startswith('lcl|'):
276
277 blast_query_id = query_id
278 id_desc = query_desc.split(' ', 1)
279 query_id = id_desc[0]
280 try:
281 query_desc = id_desc[1]
282 except IndexError:
283 query_desc = ''
284 else:
285 blast_query_id = ''
286
287 hit_list, key_list = [], []
288 for hit in self._parse_hit(qresult_elem.find('Iteration_hits'),
289 query_id):
290 if hit:
291
292 if hit.id in key_list:
293
294
295 hit.description = '%s %s' % (hit.id, hit.description)
296 hit.id = hit._blast_id
297
298 for hsp in hit:
299 hsp.hit_id = hit._blast_id
300 else:
301 key_list.append(hit.id)
302
303 hit_list.append(hit)
304
305
306 qresult = QueryResult(query_id, hits=hit_list)
307 qresult.description = query_desc
308 qresult.seq_len = int(query_len)
309 qresult._blast_id = blast_query_id
310 for key, value in self._meta.items():
311 setattr(qresult, key, value)
312
313
314
315
316
317
318
319
320
321
322
323
324 stat_iter_elem = qresult_elem.find('Iteration_stat')
325 if stat_iter_elem is not None:
326 stat_elem = stat_iter_elem.find('Statistics')
327
328 for key, val_info in _ELEM_QRESULT_OPT.items():
329 value = stat_elem.findtext(key)
330 if value is not None:
331 caster = val_info[1]
332
333 if value is not None and caster is not str:
334 value = caster(value)
335 setattr(qresult, val_info[0], value)
336
337
338 qresult_elem.clear()
339 yield qresult
340
342 """Generator that transforms Iteration_hits XML elements into Hit objects.
343
344 Arguments:
345 root_hit_elem -- Element object of the Iteration_hits tag.
346 query_id -- String of QueryResult ID of this Hit
347
348 """
349
350
351
352
353
354
355
356
357
358
359
360
361 if root_hit_elem is None:
362 root_hit_elem = []
363
364 for hit_elem in root_hit_elem:
365
366
367 hit_id = hit_elem.findtext('Hit_id')
368 hit_desc = hit_elem.findtext('Hit_def')
369
370 if hit_id.startswith('gnl|BL_ORD_ID|'):
371 blast_hit_id = hit_id
372 id_desc = hit_desc.split(' ', 1)
373 hit_id = id_desc[0]
374 try:
375 hit_desc = id_desc[1]
376 except IndexError:
377 hit_desc = ''
378 else:
379 blast_hit_id = ''
380
381 hsps = [hsp for hsp in
382 self._parse_hsp(hit_elem.find('Hit_hsps'),
383 query_id, hit_id)]
384
385 hit = Hit(hsps)
386 hit.description = hit_desc
387
388 hit._blast_id = blast_hit_id
389
390 for key, val_info in _ELEM_HIT.items():
391 value = hit_elem.findtext(key)
392 if value is not None:
393 caster = val_info[1]
394
395 if value is not None and caster is not str:
396 value = caster(value)
397 setattr(hit, val_info[0], value)
398
399
400 hit_elem.clear()
401 yield hit
402
403 - def _parse_hsp(self, root_hsp_frag_elem, query_id, hit_id):
404 """Iterator that transforms Hit_hsps XML elements into HSP objects.
405
406 Arguments:
407 root_hsp_frag_elem -- Element object of the Hit_hsps tag.
408 query_id -- Query ID string.
409 hit_id -- Hit ID string.
410
411 """
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436 if root_hsp_frag_elem is None:
437 root_hsp_frag_elem = []
438
439 for hsp_frag_elem in root_hsp_frag_elem:
440 coords = {}
441 frag = HSPFragment(hit_id, query_id)
442 for key, val_info in _ELEM_FRAG.items():
443 value = hsp_frag_elem.findtext(key)
444 caster = val_info[1]
445
446
447 if value is not None:
448 if key.endswith('-from') or key.endswith('-to'):
449
450 coords[val_info[0]] = caster(value)
451 continue
452
453 elif caster is not str:
454 value = caster(value)
455 setattr(frag, val_info[0], value)
456
457
458 frag.aln_annotation['homology'] = \
459 hsp_frag_elem.findtext('Hsp_midline')
460
461
462
463
464
465 for coord_type in ('query', 'hit', 'pattern'):
466 start_type = coord_type + '_start'
467 end_type = coord_type + '_end'
468 try:
469 start = coords[start_type]
470 end = coords[end_type]
471 except KeyError:
472 continue
473 else:
474
475 setattr(frag, start_type, min(start, end) - 1)
476 setattr(frag, end_type, max(start, end))
477
478
479 prog = self._meta.get('program')
480 if prog == 'blastn':
481 frag.alphabet = generic_dna
482 elif prog in ['blastp', 'blastx', 'tblastn', 'tblastx']:
483 frag.alphabet = generic_protein
484
485 hsp = HSP([frag])
486 for key, val_info in _ELEM_HSP.items():
487 value = hsp_frag_elem.findtext(key)
488 caster = val_info[1]
489 if value is not None:
490 if caster is not str:
491 value = caster(value)
492 setattr(hsp, val_info[0], value)
493
494 hsp_frag_elem.clear()
495 yield hsp
496
497
499 """Indexer class for BLAST XML output."""
500
501 _parser = BlastXmlParser
502 qstart_mark = _as_bytes('<Iteration>')
503 qend_mark = _as_bytes('</Iteration>')
504 block_size = 16384
505
507 SearchIndexer.__init__(self, filename)
508
509 iter_obj = self._parser(self._handle)
510 self._meta, self._fallback = iter_obj._meta, iter_obj._fallback
511
513 qstart_mark = self.qstart_mark
514 qend_mark = self.qend_mark
515 blast_id_mark = _as_bytes('Query_')
516 block_size = self.block_size
517 handle = self._handle
518 handle.seek(0)
519 re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)'
520 '</Iteration_query-ID>\s+?<Iteration_query-def>'
521 '(.*?)</Iteration_query-def>'))
522 re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
523 counter = 0
524
525 while True:
526 start_offset = handle.tell()
527 line = handle.readline()
528 if not line:
529 break
530 if qstart_mark not in line:
531 continue
532
533
534 assert line.count(qstart_mark) == 1, "XML without line breaks?"
535 assert line.lstrip().startswith(qstart_mark), line
536 if qend_mark in line:
537
538 block = line
539 else:
540
541 block = [line]
542 while line and qend_mark not in line:
543 line = handle.readline()
544 assert qstart_mark not in line, line
545 block.append(line)
546 assert line.rstrip().endswith(qend_mark), line
547 block = _empty_bytes_string.join(block)
548 assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
549 assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
550
551 regx = re.search(re_desc, block)
552 try:
553 qstart_desc = regx.group(2)
554 qstart_id = regx.group(1)
555 except AttributeError:
556
557 assert re.search(re_desc_end, block)
558 qstart_desc = _as_bytes(self._fallback['description'])
559 qstart_id = _as_bytes(self._fallback['id'])
560 if qstart_id.startswith(blast_id_mark):
561 qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
562 yield _bytes_to_string(qstart_id), start_offset, len(block)
563 counter += 1
564
566
567
568 generator = self._parser(handle, **self._kwargs)
569 generator._meta = self._meta
570 generator._fallback = self._fallback
571 return iter(generator).next()
572
587
588
590 """Event-based XML Generator."""
591
592 - def __init__(self, out, encoding='utf-8', indent=" ", increment=2):
593 XMLGenerator.__init__(self, out, encoding)
594
595 self._indent = indent
596
597 self._level = 0
598
599 self._increment = increment
600
601 self._parent_stack = []
602
603 try:
604
605 self.write = self._write
606 except AttributeError:
607
608 self.write = self._out.write
609
611 """Starts the XML document."""
612 self.write('<?xml version="1.0"?>\n'
613 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" '
614 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n')
615
617 """Starts an XML element.
618
619 Arguments:
620 name -- String of element name.
621 attrs -- Dictionary of element attributes.
622 children -- Boolean, whether the element has children or not.
623
624 """
625 self.ignorableWhitespace(self._indent * self._level)
626 XMLGenerator.startElement(self, name, attrs)
627
629 """Ends and XML element of the given name."""
630 XMLGenerator.endElement(self, name)
631 self.write('\n')
632
634 """Starts an XML element which has children.
635
636 Arguments:
637 name -- String of element name.
638 attrs -- Dictionary of element attributes.
639
640 """
641 self.startElement(name, attrs, children=True)
642 self._level += self._increment
643 self.write('\n')
644
645 self._parent_stack.append(name)
646
648 """Ends an XML element with children."""
649
650 name = self._parent_stack.pop()
651 self._level -= self._increment
652 self.ignorableWhitespace(self._indent * self._level)
653 self.endElement(name)
654
656 """Starts XML elements without children."""
657 for name in names:
658 self.startParent(name)
659
661 """Ends XML elements, according to the given number."""
662 for i in range(num):
663 self.endParent()
664
671
673
674 content = escape(content)
675 for a, b in (('"', '"'), ("'", ''')):
676 content = content.replace(a, b)
677 self.write(content)
678
679
681 """Stream-based BLAST+ XML Writer."""
682
685
687 """Writes the XML contents to the output handle."""
688 xml = self.xml
689 self.qresult_counter, self.hit_counter, self.hsp_counter, \
690 self.frag_counter = 0, 0, 0, 0
691
692
693 first_qresult = qresults.next()
694
695 xml.startDocument()
696 xml.startParent('BlastOutput')
697 self._write_preamble(first_qresult)
698
699 xml.startParent('BlastOutput_iterations')
700 self._write_qresults(chain([first_qresult], qresults))
701 xml.endParents(2)
702 xml.endDocument()
703
704 return self.qresult_counter, self.hit_counter, self.hsp_counter, \
705 self.frag_counter
706
708 """Writes sibling XML elements.
709
710 Arguments:
711 block_name -- String of common element name prefix.
712 map_name -- Dictionary name to for mapping element and attribute names.
713 obj -- Object whose attribute values will be used.
714 opt_dict -- Dictionary for custom element-attribute mapping.
715
716 """
717 for elem, attr in _WRITE_MAPS[map_name]:
718 elem = block_name + elem
719 try:
720 content = str(getattr(obj, attr))
721 except AttributeError:
722
723 assert elem in _DTD_OPT, "Element %r (attribute %r) not " \
724 "found" % (elem, attr)
725 else:
726
727 if elem in opt_dict:
728 content = opt_dict[elem]
729 self.xml.simpleElement(elem, content)
730
732 """Writes the XML file preamble."""
733 xml = self.xml
734
735 for elem, attr in _WRITE_MAPS['preamble']:
736 elem = 'BlastOutput_' + elem
737 if elem == 'BlastOutput_param':
738 xml.startParent(elem)
739 self._write_param(qresult)
740 xml.endParent()
741 continue
742 try:
743 content = str(getattr(qresult, attr))
744 except AttributeError:
745 assert elem in _DTD_OPT, "Element %s (attribute %s) not " \
746 "found" % (elem, attr)
747 else:
748 if elem == 'BlastOutput_version':
749 content = '%s %s' % (qresult.program.upper(),
750 qresult.version)
751 elif qresult._blast_id:
752 if elem == 'BlastOutput_query-ID':
753 content = qresult._blast_id
754 elif elem == 'BlastOutput_query-def':
755 content = ' '.join([qresult.id,
756 qresult.description]).strip()
757 xml.simpleElement(elem, content)
758
760 """Writes the parameter block of the preamble."""
761 xml = self.xml
762 xml.startParent('Parameters')
763 self._write_elem_block('Parameters_', 'param', qresult)
764 xml.endParent()
765
767 """Writes QueryResult objects into iteration elements."""
768 xml = self.xml
769
770 for num, qresult in enumerate(qresults):
771 xml.startParent('Iteration')
772 xml.simpleElement('Iteration_iter-num', str(num+1))
773 opt_dict = {}
774
775
776 if qresult._blast_id:
777 opt_dict = {
778 'Iteration_query-ID': qresult._blast_id,
779 'Iteration_query-def': ' '.join([qresult.id,
780 qresult.description]).strip(),
781 }
782 self._write_elem_block('Iteration_', 'qresult', qresult, opt_dict)
783
784 if qresult:
785 xml.startParent('Iteration_hits')
786 self._write_hits(qresult.hits)
787 xml.endParent()
788
789 else:
790 xml.simpleElement('Iteration_hits', '')
791
792 xml.startParents('Iteration_stat', 'Statistics')
793 self._write_elem_block('Statistics_', 'stat', qresult)
794 xml.endParents(2)
795
796 if not qresult:
797 xml.simpleElement('Iteration_message', 'No hits found')
798 self.qresult_counter += 1
799 xml.endParent()
800
821
823 """Writes HSP objects."""
824 xml = self.xml
825 for num, hsp in enumerate(hsps):
826 xml.startParent('Hsp')
827 xml.simpleElement('Hsp_num', str(num+1))
828 for elem, attr in _WRITE_MAPS['hsp']:
829 elem = 'Hsp_' + elem
830 try:
831 content = self._adjust_output(hsp, elem, attr)
832
833
834 except AttributeError:
835 assert elem in _DTD_OPT, "Element %s (attribute %s) not found" \
836 % (elem, attr)
837 else:
838 xml.simpleElement(elem, str(content))
839 self.hsp_counter += 1
840 self.frag_counter += len(hsp.fragments)
841 xml.endParent()
842
844 """Adjusts output to mimic native BLAST+ XML as much as possible."""
845
846
847 if attr in ('query_start' ,'query_end' ,'hit_start', 'hit_end',
848 'pattern_start', 'pattern_end'):
849 content = getattr(hsp, attr) + 1
850 if '_start' in attr:
851 content = getattr(hsp, attr) + 1
852 else:
853 content = getattr(hsp, attr)
854
855
856
857
858 if hsp.query_frame != 0 and hsp.hit_frame < 0:
859 if attr == 'hit_start':
860 content = getattr(hsp, 'hit_end')
861 elif attr == 'hit_end':
862 content = getattr(hsp, 'hit_start') + 1
863
864
865 elif elem in ('Hsp_hseq', 'Hsp_qseq'):
866 content = str(getattr(hsp, attr).seq)
867 elif elem == 'Hsp_midline':
868 content = hsp.aln_annotation['homology']
869 elif elem in ('Hsp_evalue', 'Hsp_bit-score'):
870
871 content = '%.*g' % (6, getattr(hsp, attr))
872 else:
873 content = getattr(hsp, attr)
874
875 return content
876
877
878
879 if __name__ == "__main__":
880 from Bio._utils import run_doctest
881 run_doctest()
882