1
2
3
4
5
6 """Bio.SearchIO abstract base parser for Exonerate standard output format."""
7
8 import re
9
10 from Bio.SearchIO._index import SearchIndexer
11 from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
12 from Bio.SeqUtils import seq1
13
14
15
16 _STRAND_MAP = {'+': 1, '-': -1, '.': 0}
17
18 _RE_SHIFTS = re.compile(r'(#+)')
19
20 _RE_TRANS = re.compile(r'[53ISCF]')
21
22
27
28
30 """Splits a string into a list containing triplets of the original
31 string."""
32 return [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)]
33
34
36 """Transforms three-letter amino acid codes into one-letters in the
37 given HSPFragments."""
38 hsp_hstart = fraglist[0].hit_start
39 hsp_qstart = fraglist[0].query_start
40 for frag in fraglist:
41 assert frag.query_strand == 0
42
43 assert len(frag) % 3 == 0
44
45 hstep = 1 if frag.hit_strand >= 0 else -1
46
47
48 custom_map = {'***': '*', '<->': '-'}
49
50 hseq1 = seq1(str(frag.hit.seq), custom_map=custom_map)
51 hstart = hsp_hstart
52 hend = hstart + len(hseq1.replace('-', '')) * hstep
53
54 qseq1 = seq1(str(frag.query.seq), custom_map=custom_map)
55 qstart = hsp_qstart
56 qend = qstart + len(qseq1.replace('-', ''))
57
58
59 frag.hit = None
60 frag.query = None
61 frag.hit = hseq1
62 frag.query = qseq1
63
64
65
66
67 frag.query_start, frag.query_end = qstart, qend
68
69
70
71 for annot, annotseq in frag.aln_annotation.items():
72 frag.aln_annotation[annot] = _make_triplets(annotseq)
73
74
75 hsp_hstart, hsp_qstart = hend, qend
76
77 return fraglist
78
79
81 """Splits one HSPFragment containing frame-shifted alignment into two."""
82
83
84
85 homol = frag.aln_annotation['homology']
86
87 assert homol.count('#') > 0
88
89 split_frags = []
90 qstep = 1 if frag.query_strand >= 0 else -1
91 hstep = 1 if frag.hit_strand >= 0 else -1
92 qpos = min(frag.query_range) if qstep >= 0 else max(frag.query_range)
93 hpos = min(frag.hit_range) if qstep >= 0 else max(frag.hit_range)
94 abs_pos = 0
95
96 while homol:
97
98 try:
99 shifts = re.search(_RE_SHIFTS, homol).group(1)
100 s_start = homol.find(shifts)
101 s_stop = s_start + len(shifts)
102 split = frag[abs_pos:abs_pos+s_start]
103 except AttributeError:
104 shifts = ''
105 s_start = 0
106 s_stop = len(homol)
107 split = frag[abs_pos:]
108
109
110 qstart, hstart = qpos, hpos
111 qpos += (len(split) - sum(str(split.query.seq).count(x)
112 for x in ('-', '<', '>'))) * qstep
113 hpos += (len(split) - sum(str(split.hit.seq).count(x)
114 for x in ('-', '<', '>'))) * hstep
115
116 split.hit_start = min(hstart, hpos)
117 split.query_start = min(qstart, qpos)
118 split.hit_end = max(hstart, hpos)
119 split.query_end = max(qstart, qpos)
120
121
122 abs_slice = slice(abs_pos+s_start, abs_pos+s_stop)
123 if len(frag.aln_annotation) == 2:
124 seqs = (str(frag[abs_slice].query.seq),
125 str(frag[abs_slice].hit.seq))
126 elif len(frag.aln_annotation) == 3:
127 seqs = (frag[abs_slice].aln_annotation['query_annotation'],
128 frag[abs_slice].aln_annotation['hit_annotation'],)
129 if '#' in seqs[0]:
130 qpos += len(shifts) * qstep
131 elif '#' in seqs[1]:
132 hpos += len(shifts) * hstep
133
134
135 _set_frame(split)
136 split_frags.append(split)
137
138 homol = homol[s_stop:]
139 abs_pos += s_stop
140
141 return split_frags
142
143
145 """Returns a list of HSP objects from the given parsed HSP values."""
146 frags = []
147
148 for idx, qcoords in enumerate(hspd['query_ranges']):
149
150 hseqlist = hspd.get('hit')
151 hseq = '' if hseqlist is None else hseqlist[idx]
152 qseqlist = hspd.get('query')
153 qseq = '' if qseqlist is None else qseqlist[idx]
154 frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
155
156 frag.query_start = qcoords[0]
157 frag.query_end = qcoords[1]
158 frag.hit_start = hspd['hit_ranges'][idx][0]
159 frag.hit_end = hspd['hit_ranges'][idx][1]
160
161 try:
162 aln_annot = hspd.get('aln_annotation', {})
163 for key, value in aln_annot.items():
164 frag.aln_annotation[key] = value[idx]
165 except IndexError:
166 pass
167
168 frag.query_strand = hspd['query_strand']
169 frag.hit_strand = hspd['hit_strand']
170
171 if frag.aln_annotation.get('homology') is not None:
172 if '#' in frag.aln_annotation['homology']:
173 frags.extend(_split_fragment(frag))
174 continue
175
176 if len(frag.aln_annotation) > 1 or \
177 frag.query_strand == 0 or \
178 ('vulgar_comp' in hspd and re.search(_RE_TRANS, hspd['vulgar_comp'])):
179 _set_frame(frag)
180
181 frags.append(frag)
182
183
184
185
186 if len(frags[0].aln_annotation) == 2:
187 frags = _adjust_aa_seq(frags)
188
189 hsp = HSP(frags)
190
191 for attr in ('score', 'hit_split_codons', 'query_split_codons',
192 'model', 'vulgar_comp', 'cigar_comp', 'alphabet'):
193 if attr in hspd:
194 setattr(hsp, attr, hspd[attr])
195
196 return hsp
197
198
200 """Parse the 'Query:' line of exonerate alignment outputs."""
201 try:
202 mark, id, desc = line.split(' ', 2)
203 except ValueError:
204 mark, id = line.split(' ', 1)
205 desc = ''
206
207 return id, desc
208
209
211
212 """Abstract iterator for exonerate format."""
213
214 _ALN_MARK = None
215
219
242
244 """Reads the file handle until the given bool function returns True."""
245 while True:
246 if not self.line or bool_func(self.line):
247 return
248 else:
249 self.line = self.handle.readline()
250
252 raise NotImplementedError("Subclass must implement this")
253
255
256 aln_header = []
257 while not self.line == '\n':
258 aln_header.append(self.line.strip())
259 self.line = self.handle.readline()
260
261 qresult, hit, hsp = {}, {}, {}
262 for line in aln_header:
263
264 if line.startswith('Query:'):
265 qresult['id'], qresult['description'] = \
266 _parse_hit_or_query_line(line)
267
268 elif line.startswith('Target:'):
269 hit['id'], hit['description'] = \
270 _parse_hit_or_query_line(line)
271
272 elif line.startswith('Model:'):
273 qresult['model'] = line.split(' ', 1)[1]
274
275 elif line.startswith('Raw score:'):
276 hsp['score'] = line.split(' ', 2)[2]
277
278 elif line.startswith('Query range:'):
279
280
281 hsp['query_start'], hsp['query_end'] = line.split(' ', 4)[2:5:2]
282
283 elif line.startswith('Target range:'):
284
285 hsp['hit_start'], hsp['hit_end'] = line.split(' ', 4)[2:5:2]
286
287
288 if qresult['description'].endswith(':[revcomp]'):
289 hsp['query_strand'] = '-'
290 qresult['description'] = qresult['description'].replace(':[revcomp]', '')
291 elif 'protein2' in qresult['model']:
292 hsp['query_strand'] = '.'
293 else:
294 hsp['query_strand'] = '+'
295 if hit['description'].endswith(':[revcomp]'):
296 hsp['hit_strand'] = '-'
297 hit['description'] = hit['description'].replace(':[revcomp]', '')
298 else:
299 hsp['hit_strand'] = '+'
300
301
302
303
304
305 return {'qresult': qresult, 'hit': hit, 'hsp': hsp}
306
308
309 state_EOF = 0
310 state_QRES_NEW = 1
311 state_QRES_SAME = 3
312 state_HIT_NEW = 2
313 state_HIT_SAME = 4
314
315 qres_state, hit_state = None, None
316 file_state = None
317 prev_qid, prev_hid = None, None
318 cur, prev = None, None
319 hit_list, hsp_list = [], []
320
321 if self.has_c4_alignment:
322 self._ALN_MARK = 'C4 Alignment:'
323
324 while True:
325 self.read_until(lambda line: line.startswith(self._ALN_MARK))
326 if cur is not None:
327 prev = cur
328 prev_qid = cur_qid
329 prev_hid = cur_hid
330
331 if self.line:
332 assert self.line.startswith(self._ALN_MARK), self.line
333
334 header = {'qresult': {}, 'hit': {}, 'hsp': {}}
335
336 if self.has_c4_alignment:
337 self.read_until(lambda line:
338 line.strip().startswith('Query:'))
339 header = self._parse_alignment_header()
340
341 cur = self.parse_alignment_block(header)
342 cur_qid = cur['qresult']['id']
343 cur_hid = cur['hit']['id']
344 elif not self.line or self.line.startswith('-- completed '):
345 file_state = state_EOF
346 cur_qid, cur_hid = None, None
347
348
349 if prev_qid != cur_qid:
350 qres_state = state_QRES_NEW
351 else:
352 qres_state = state_QRES_SAME
353
354 if prev_hid != cur_hid or qres_state == state_QRES_NEW:
355 hit_state = state_HIT_NEW
356 else:
357 hit_state = state_HIT_SAME
358
359 if prev is not None:
360 hsp = _create_hsp(prev_hid, prev_qid, prev['hsp'])
361 hsp_list.append(hsp)
362
363 if hit_state == state_HIT_NEW:
364 hit = Hit(hsp_list)
365 for attr, value in prev['hit'].items():
366 setattr(hit, attr, value)
367 hit_list.append(hit)
368 hsp_list = []
369
370 if qres_state == state_QRES_NEW or file_state == state_EOF:
371 qresult = QueryResult(prev_qid)
372 for hit in hit_list:
373
374
375 qresult.absorb(hit)
376 for attr, value in prev['qresult'].items():
377 setattr(qresult, attr, value)
378 yield qresult
379 if file_state == state_EOF:
380 break
381 hit_list = []
382
383
384
385
386 if not self.has_c4_alignment:
387 self.line = self.handle.readline()
388
389
391
392 """Indexer class for Exonerate plain text."""
393
394 _parser = None
395 _query_mark = None
396
398 raise NotImplementedError("Should be defined by subclass")
399
401 """Iterates over the file handle; yields key, start offset, and length."""
402 handle = self._handle
403 handle.seek(0)
404 qresult_key = None
405
406 while True:
407 start_offset = handle.tell()
408 line = handle.readline()
409 if line.startswith(self._query_mark):
410 if qresult_key is None:
411 qresult_key = self.get_qresult_id(start_offset)
412 qresult_offset = start_offset
413 else:
414 curr_key = self.get_qresult_id(start_offset)
415 if curr_key != qresult_key:
416 yield qresult_key, qresult_offset, \
417 start_offset - qresult_offset
418 qresult_key = curr_key
419 qresult_offset = start_offset
420 handle.seek(qresult_offset)
421 elif not line:
422 yield qresult_key, qresult_offset, \
423 start_offset - qresult_offset
424 break
425
426
427
428 if __name__ == "__main__":
429 from Bio._utils import run_doctest
430 run_doctest()
431