1
2
3
4
5
6
7
8 """Bio.SearchIO support for Bill Pearson's FASTA tools.
9
10 This module adds support for parsing FASTA outputs. FASTA is a suite of
11 programs that finds regions of local or global similarity between protein
12 or nucleotide sequences, either by searching databases or identifying
13 local duplications.
14
15 Bio.SearchIO.FastaIO was tested on the following FASTA flavors and versions:
16
17 - flavors: fasta, ssearch, tfastx
18 - versions: 35, 36
19
20 Other flavors and/or versions may introduce some bugs. Please file a bug report
21 if you see such problems to Biopython's bug tracker.
22
23 More information on FASTA are available through these links:
24 - Website: http://fasta.bioch.virginia.edu/fasta_www2/fasta_list2.shtml
25 - User guide: http://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf
26
27
28 Supported Formats
29 =================
30
31 Bio.SearchIO.FastaIO supports parsing and indexing FASTA outputs triggered by
32 the -m 10 flag. Other formats that mimic other programs (e.g. the BLAST tabular
33 format using the -m 8 flag) may be parseable but using SearchIO's other parsers
34 (in this case, using the 'blast-tab' parser).
35
36
37 fasta-m10
38 =========
39
40 Note that in FASTA -m 10 outputs, HSPs from different strands are considered to
41 be from different hits. They are listed as two separate entries in the hit
42 table. FastaIO recognizes this and will group HSPs with the same hit ID into a
43 single Hit object, regardless of strand.
44
45 FASTA also sometimes output extra sequences adjacent to the HSP match. These
46 extra sequences are discarded by FastaIO. Only regions containing the actual
47 sequence match are extracted.
48
49 The following object attributes are provided:
50
51 +-----------------+-------------------------+----------------------------------+
52 | Object | Attribute | Value |
53 +=================+=========================+==================================+
54 | QueryResult | description | query sequence description |
55 | +-------------------------+----------------------------------+
56 | | id | query sequence ID |
57 | +-------------------------+----------------------------------+
58 | | program | FASTA flavor |
59 | +-------------------------+----------------------------------+
60 | | seq_len | full length of query sequence |
61 | +-------------------------+----------------------------------+
62 | | target | target search database |
63 | +-------------------------+----------------------------------+
64 | | version | FASTA version |
65 +-----------------+-------------------------+----------------------------------+
66 | Hit | seq_len | full length of the hit sequence |
67 +-----------------+-------------------------+----------------------------------+
68 | HSP | bitscore | *_bits line |
69 | +-------------------------+----------------------------------+
70 | | evalue | *_expect line |
71 | +-------------------------+----------------------------------+
72 | | ident_pct | *_ident line |
73 | +-------------------------+----------------------------------+
74 | | init1_score | *_init1 line |
75 | +-------------------------+----------------------------------+
76 | | initn_score | *_initn line |
77 | +-------------------------+----------------------------------+
78 | | opt_score | *_opt line, *_s-w opt line |
79 | +-------------------------+----------------------------------+
80 | | pos_pct | *_sim line |
81 | +-------------------------+----------------------------------+
82 | | sw_score | *_score line |
83 | +-------------------------+----------------------------------+
84 | | z_score | *_z-score line |
85 +-----------------+-------------------------+----------------------------------+
86 | HSPFragment | aln_annotation | al_cons block, if present |
87 | (also via HSP) +-------------------------+----------------------------------+
88 | | hit | hit sequence |
89 | +-------------------------+----------------------------------+
90 | | hit_end | hit sequence end coordinate |
91 | +-------------------------+----------------------------------+
92 | | hit_start | hit sequence start coordinate |
93 | +-------------------------+----------------------------------+
94 | | hit_strand | hit sequence strand |
95 | +-------------------------+----------------------------------+
96 | | query | query sequence |
97 | +-------------------------+----------------------------------+
98 | | query_end | query sequence end coordinate |
99 | +-------------------------+----------------------------------+
100 | | query_start | query sequence start coordinate |
101 | +-------------------------+----------------------------------+
102 | | query_strand | query sequence strand |
103 +-----------------+-------------------------+----------------------------------+
104
105 """
106
107 import re
108
109 from Bio._py3k import _as_bytes, _bytes_to_string
110 from Bio.Alphabet import generic_dna, generic_protein
111 from Bio.File import UndoHandle
112 from Bio.SearchIO._index import SearchIndexer
113 from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
114
115
116 __all__ = ['FastaM10Parser', 'FastaM10Indexer']
117
118
119
120
121 _RE_FLAVS = re.compile(r't?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search')
122
123 _PTR_ID_DESC_SEQLEN = r'>>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)$'
124 _RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN)
125 _RE_ID_DESC_SEQLEN_IDX = re.compile(_as_bytes(_PTR_ID_DESC_SEQLEN))
126
127 _RE_ATTR = re.compile(r'^; [a-z]+(_[ \w-]+):\s+(.*)$')
128
129
130 _HSP_ATTR_MAP = {
131 '_initn': ('initn_score', int),
132 '_init1': ('init1_score', int),
133 '_opt': ('opt_score', int),
134 '_s-w opt': ('opt_score', int),
135 '_z-score': ('z_score', float),
136 '_bits': ('bitscore', float),
137 '_expect': ('evalue', float),
138 '_score': ('sw_score', int),
139 '_ident': ('ident_pct', float),
140 '_sim': ('pos_pct', float),
141 }
142
143
144 _STATE_NONE = 0
145 _STATE_QUERY_BLOCK = 1
146 _STATE_HIT_BLOCK = 2
147 _STATE_CONS_BLOCK = 3
148
149
151 """Helper function for appending Hits without alignments into QueryResults."""
152 for hit_row in hit_rows:
153 hit_id, remainder = hit_row.split(' ', 1)
154
155
156
157
158
159
160 if hit_id not in qresult:
161 frag = HSPFragment(hit_id, qresult.id)
162 hsp = HSP([frag])
163 hit = Hit([hsp])
164 qresult.append(hit)
165
166 return qresult
167
168
170 """Helper function for the main parsing code.
171
172 Arguments:
173 hsp -- HSP object whose properties are to be set.
174 parsed -- Dictionary containing parsed values for HSP attributes.
175 program -- String of program name.
176
177 """
178
179 for seq_type in ('hit', 'query'):
180 if 'tfast' not in program:
181 parsed[seq_type]['seq'] = _extract_alignment(parsed[seq_type])
182 assert len(parsed['query']['seq']) == len(parsed['hit']['seq']), parsed
183
184
185 assert parsed['query']['_type'] == parsed['hit']['_type']
186 type_val = parsed['query']['_type']
187 alphabet = generic_dna if type_val == 'D' else generic_protein
188 setattr(hsp.fragment, 'alphabet', alphabet)
189
190 for seq_type in ('hit', 'query'):
191
192 start = int(parsed[seq_type]['_start'])
193 end = int(parsed[seq_type]['_stop'])
194
195 setattr(hsp.fragment, seq_type + '_start', min(start, end) - 1)
196 setattr(hsp.fragment, seq_type + '_end', max(start, end))
197
198 setattr(hsp.fragment, seq_type, parsed[seq_type]['seq'])
199
200 if alphabet is not generic_protein:
201
202
203 if start <= end:
204 setattr(hsp.fragment, seq_type + '_strand', 1)
205 else:
206 setattr(hsp.fragment, seq_type + '_strand', -1)
207 else:
208 setattr(hsp.fragment, seq_type + '_strand', 0)
209
210
212 """Helper function for the main parsing code.
213
214 To get the actual pairwise alignment sequences, we must first
215 translate the un-gapped sequence based coordinates into positions
216 in the gapped sequence (which may have a flanking region shown
217 using leading - characters). To date, I have never seen any
218 trailing flanking region shown in the m10 file, but the
219 following code should also cope with that.
220
221 Note that this code seems to work fine even when the "sq_offset"
222 entries are prsent as a result of using the -X command line option.
223 """
224 seq = parsed_hsp['seq']
225 seq_stripped = seq.strip('-')
226 disp_start = int(parsed_hsp['_display_start'])
227 start = int(parsed_hsp['_start'])
228 stop = int(parsed_hsp['_stop'])
229
230 if start <= stop:
231 start = start - disp_start
232 stop = stop - disp_start + 1
233 else:
234 start = disp_start - start
235 stop = disp_start - stop + 1
236 stop += seq_stripped.count('-')
237 assert 0 <= start and start < stop and stop <= len(seq_stripped), \
238 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \
239 % (seq, start, stop, parsed_hsp)
240 return seq_stripped[start:stop]
241
242
244 """Parser for Bill Pearson's FASTA suite's -m 10 output."""
245
246 - def __init__(self, handle, __parse_hit_table=False):
249
255
257 """Parses the Fasta preamble for Fasta flavor and version."""
258 preamble = {}
259 while True:
260 self.line = self.handle.readline()
261
262 if self.line.startswith('Query'):
263 break
264
265 elif self.line.startswith(' version'):
266 preamble['version'] = self.line.split(' ')[2]
267 else:
268
269 flav_match = re.match(_RE_FLAVS, self.line.lower())
270 if flav_match:
271 preamble['program'] = flav_match.group(0)
272
273 return preamble
274
276 """Parses hit table rows."""
277
278 self.line = self.handle.readline()
279
280 hit_rows = []
281 while self.line and not self.line.strip():
282 hit_rows.append(self.line.strip())
283 self.line = self.handle.readline()
284 return hit_rows
285
287
288 qresult = None
289 hit_rows = []
290
291 state_QRES_NEW = 1
292 state_QRES_HITTAB = 3
293 state_QRES_CONTENT = 5
294 state_QRES_END = 7
295
296 while True:
297
298
299 if self.line.startswith('The best scores are:'):
300 qres_state = state_QRES_HITTAB
301
302 elif self.line.strip() == '>>>///' or not self.line:
303 qres_state = state_QRES_END
304
305 elif not self.line.startswith('>>>') and '>>>' in self.line:
306 qres_state = state_QRES_NEW
307
308 elif self.line.startswith('>>>') and not \
309 self.line.strip() == '>>><<<':
310 qres_state = state_QRES_CONTENT
311
312 else:
313 qres_state = None
314
315 if qres_state is not None:
316 if qres_state == state_QRES_HITTAB:
317
318 hit_rows = self.__parse_hit_table()
319
320 elif qres_state == state_QRES_END:
321 yield _set_qresult_hits(qresult, hit_rows)
322 break
323
324 elif qres_state == state_QRES_NEW:
325
326 if qresult is not None:
327 yield _set_qresult_hits(qresult, hit_rows)
328 regx = re.search(_RE_ID_DESC_SEQLEN, self.line)
329 query_id = regx.group(1)
330 seq_len = regx.group(3)
331 desc = regx.group(2)
332 qresult = QueryResult(query_id)
333 qresult.seq_len = int(seq_len)
334
335 self.line = self.handle.readline()
336 qresult.target = list(filter(None,
337 self.line.split(' ')))[1].strip()
338 if desc is not None:
339 qresult.description = desc
340
341 for key, value in self._preamble.items():
342 setattr(qresult, key, value)
343
344 elif qres_state == state_QRES_CONTENT:
345 assert self.line[3:].startswith(qresult.id), self.line
346 for hit, strand in self._parse_hit(query_id):
347
348 hit.description = hit.description
349
350 try:
351 qresult.append(hit)
352
353 except ValueError:
354
355
356 for hsp in hit.hsps:
357 assert strand != hsp.query_strand
358 qresult[hit.id].append(hsp)
359
360 self.line = self.handle.readline()
361
363 while True:
364 self.line = self.handle.readline()
365 if self.line.startswith('>>'):
366 break
367
368 strand = None
369 hsp_list = []
370 while True:
371 peekline = self.handle.peekline()
372
373
374 if peekline.strip() in [">>><<<", ">>>///"] or \
375 (not peekline.startswith('>>>') and '>>>' in peekline):
376
377 if state == _STATE_HIT_BLOCK:
378 parsed_hsp['hit']['seq'] += self.line.strip()
379 elif state == _STATE_CONS_BLOCK:
380 hsp.aln_annotation['homology'] += \
381 self.line.strip('\n')
382
383 _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
384 hit = Hit(hsp_list)
385 hit.description = hit_desc
386 hit.seq_len = seq_len
387 yield hit, strand
388 hsp_list = []
389 break
390
391 elif self.line.startswith('>>'):
392
393 if hsp_list:
394 _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
395 hit = Hit(hsp_list)
396 hit.description = hit_desc
397 hit.seq_len = seq_len
398 yield hit, strand
399 hsp_list = []
400
401 try:
402 hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
403 except ValueError:
404 hit_id = self.line[2:].strip().split(' ', 1)[0]
405 hit_desc = ''
406
407 frag = HSPFragment(hit_id, query_id)
408 hsp = HSP([frag])
409 hsp_list.append(hsp)
410
411 state = _STATE_NONE
412 parsed_hsp = {'query':{}, 'hit': {}}
413
414 elif self.line.startswith('>--'):
415
416 _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
417
418 frag = HSPFragment(hit_id, query_id)
419 hsp = HSP([frag])
420 hsp_list.append(hsp)
421
422 state = _STATE_NONE
423 parsed_hsp = {'query':{}, 'hit': {}}
424
425 elif self.line.startswith('>'):
426 if state == _STATE_NONE:
427
428 assert query_id.startswith(self.line[1:].split(' ')[0]), \
429 "%r vs %r" % (query_id, self.line)
430 state = _STATE_QUERY_BLOCK
431 parsed_hsp['query']['seq'] = ''
432 elif state == _STATE_QUERY_BLOCK:
433
434 assert hit_id.startswith(self.line[1:].split(' ')[0])
435 state = _STATE_HIT_BLOCK
436 parsed_hsp['hit']['seq'] = ''
437
438 elif self.line.startswith('; al_cons'):
439 state = _STATE_CONS_BLOCK
440 hsp.fragment.aln_annotation['homology'] = ''
441 elif self.line.startswith(';'):
442
443
444
445 regx = re.search(_RE_ATTR, self.line.strip())
446 name = regx.group(1)
447 value = regx.group(2)
448
449
450 if state == _STATE_NONE:
451 if name in _HSP_ATTR_MAP:
452 attr_name, caster = _HSP_ATTR_MAP[name]
453 if caster is not str:
454 value = caster(value)
455 if name in ['_ident', '_sim']:
456 value *= 100
457 setattr(hsp, attr_name, value)
458
459 elif state == _STATE_QUERY_BLOCK:
460 parsed_hsp['query'][name] = value
461 elif state == _STATE_HIT_BLOCK:
462 if name == '_len':
463 seq_len = int(value)
464 else:
465 parsed_hsp['hit'][name] = value
466
467 else:
468 raise ValueError("Unexpected line: %r" % self.line)
469
470 else:
471 assert '>' not in self.line
472
473 if state == _STATE_HIT_BLOCK:
474 parsed_hsp['hit']['seq'] += self.line.strip()
475 elif state == _STATE_QUERY_BLOCK:
476 parsed_hsp['query']['seq'] += self.line.strip()
477 elif state == _STATE_CONS_BLOCK:
478 hsp.fragment.aln_annotation['homology'] += \
479 self.line.strip('\n')
480
481 else:
482 raise ValueError("Unexpected line: %r" % self.line)
483
484 self.line = self.handle.readline()
485
486
488 """Indexer class for Bill Pearson's FASTA suite's -m 10 output."""
489
490 _parser = FastaM10Parser
491
495
520
550
551
552
553 if __name__ == "__main__":
554 from Bio._utils import run_doctest
555 run_doctest()
556