1
2
3
4
5
6 """Bio.SearchIO parser for BLAST+ plain text output formats.
7
8 At the moment this is a wrapper around Biopython's NCBIStandalone text
9 parser.
10
11 """
12
13 from Bio.Alphabet import generic_dna, generic_protein
14 from Bio.Blast import NCBIStandalone
15 from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
16
17
18 __all__ = ['BlastTextParser']
19
20
21 -class BlastTextParser(object):
22
23 """Parser for the BLAST text format."""
24
25 - def __init__(self, handle):
26 self.handle = handle
27 blast_parser = NCBIStandalone.BlastParser()
28 self.blast_iter = NCBIStandalone.Iterator(handle, blast_parser)
29
31 for rec in self.blast_iter:
32
33
34 if rec.query.startswith('>'):
35 rec.query = rec.query[1:]
36 try:
37 qid, qdesc = rec.query.split(' ', 1)
38 except ValueError:
39 qid, qdesc = rec.query, ''
40 qdesc = qdesc.replace('\n', '').replace('\r', '')
41
42 qresult = QueryResult(qid)
43 qresult.program = rec.application.lower()
44 qresult.target = rec.database
45 qresult.seq_len = rec.query_letters
46 qresult.version = rec.version
47
48
49 if qresult.program == 'blastn':
50 alphabet = generic_dna
51 elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
52 alphabet = generic_protein
53
54
55 for idx, aln in enumerate(rec.alignments):
56
57 if aln.title.startswith('> '):
58 aln.title = aln.title[2:]
59 elif aln.title.startswith('>'):
60 aln.title = aln.title[1:]
61 try:
62 hid, hdesc = aln.title.split(' ', 1)
63 except ValueError:
64 hid, hdesc = aln.title, ''
65 hdesc = hdesc.replace('\n', '').replace('\r', '')
66
67
68 hsp_list = []
69 for bhsp in aln.hsps:
70 frag = HSPFragment(hid, qid)
71 frag.alphabet = alphabet
72
73 frag.aln_span = bhsp.identities[1]
74
75 try:
76 frag.query_frame = int(bhsp.frame[0])
77 except IndexError:
78 if qresult.program in ('blastp', 'tblastn'):
79 frag.query_frame = 0
80 else:
81 frag.query_frame = 1
82 try:
83 frag.hit_frame = int(bhsp.frame[1])
84 except IndexError:
85 if qresult.program in ('blastp', 'tblastn'):
86 frag.hit_frame = 0
87 else:
88 frag.hit_frame = 1
89
90 frag.query_start = min(bhsp.query_start,
91 bhsp.query_end) - 1
92 frag.query_end = max(bhsp.query_start, bhsp.query_end)
93
94 frag.hit_start = min(bhsp.sbjct_start,
95 bhsp.sbjct_end) - 1
96 frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
97
98 qseq = ''
99 hseq = ''
100 midline = ''
101 for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
102 qchar, hchar, mchar = seqtrio
103 if qchar == ' ' or hchar == ' ':
104 assert all([' ' == x for x in seqtrio])
105 else:
106 qseq += qchar
107 hseq += hchar
108 midline += mchar
109 frag.query, frag.hit = qseq, hseq
110 frag.aln_annotation['homology'] = midline
111
112
113 hsp = HSP([frag])
114 hsp.evalue = bhsp.expect
115 hsp.bitscore = bhsp.bits
116 hsp.bitscore_raw = bhsp.score
117
118 try:
119 hsp.gap_num = bhsp.gaps[0]
120 except IndexError:
121 hsp.gap_num = 0
122
123 hsp.ident_num = bhsp.identities[0]
124 hsp.pos_num = bhsp.positives[0]
125 if hsp.pos_num is None:
126 hsp.pos_num = hsp[0].aln_span
127
128 hsp_list.append(hsp)
129
130 hit = Hit(hsp_list)
131 hit.seq_len = aln.length
132 hit.description = hdesc
133 qresult.append(hit)
134
135 qresult.description = qdesc
136 yield qresult
137
138
139
140 if __name__ == "__main__":
141 from Bio._utils import run_doctest
142 run_doctest()
143