Package Bio :: Package SearchIO :: Package ExonerateIO :: Module _base
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.ExonerateIO._base

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO abstract base parser for Exonerate standard output format.""" 
  7   
  8  import re 
  9   
 10  from Bio.SearchIO._index import SearchIndexer 
 11  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 12  from Bio.SeqUtils import seq1 
 13   
 14   
 15  # strand char-value mapping 
 16  _STRAND_MAP = {'+': 1, '-': -1, '.': 0} 
 17   
 18  _RE_SHIFTS = re.compile(r'(#+)') 
 19  # regex for checking whether a vulgar line has protein/translated components 
 20  _RE_TRANS = re.compile(r'[53ISCF]') 
 21   
 22   
23 -def _set_frame(frag):
24 """Sets the HSPFragment frames.""" 25 frag.hit_frame = (frag.hit_start % 3 + 1) * frag.hit_strand 26 frag.query_frame = (frag.query_start % 3 + 1) * frag.query_strand
27 28
29 -def _make_triplets(seq):
30 """Splits a string into a list containing triplets of the original 31 string.""" 32 return [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)]
33 34
35 -def _adjust_aa_seq(fraglist):
36 """Transforms three-letter amino acid codes into one-letters in the 37 given HSPFragments.""" 38 hsp_hstart = fraglist[0].hit_start 39 hsp_qstart = fraglist[0].query_start 40 for frag in fraglist: 41 assert frag.query_strand == 0 42 # fragment should have a length that is a multiple of 3 43 assert len(frag) % 3 == 0 44 # hit step may be -1 as we're aligning to DNA 45 hstep = 1 if frag.hit_strand >= 0 else -1 46 # get one letter codes 47 # and replace gap codon markers and termination characters 48 custom_map = {'***': '*', '<->': '-'} 49 50 hseq1 = seq1(str(frag.hit.seq), custom_map=custom_map) 51 hstart = hsp_hstart 52 hend = hstart + len(hseq1.replace('-', '')) * hstep 53 54 qseq1 = seq1(str(frag.query.seq), custom_map=custom_map) 55 qstart = hsp_qstart 56 qend = qstart + len(qseq1.replace('-', '')) 57 58 # replace the old frag sequences with the new ones 59 frag.hit = None 60 frag.query = None 61 frag.hit = hseq1 62 frag.query = qseq1 63 64 # set coordinates 65 # no need to set the hit coordinates since the hit sequence 66 # is not a protein sequence 67 frag.query_start, frag.query_end = qstart, qend 68 69 # update alignment annotation 70 # by turning them into list of triplets 71 for annot, annotseq in frag.aln_annotation.items(): 72 frag.aln_annotation[annot] = _make_triplets(annotseq) 73 74 # update values for next iteration 75 hsp_hstart, hsp_qstart = hend, qend 76 77 return fraglist
78 79
80 -def _split_fragment(frag):
81 """Splits one HSPFragment containing frame-shifted alignment into two.""" 82 # given an HSPFragment object with frameshift(s), this method splits it 83 # into fragments without frameshifts by sequentially chopping it off 84 # starting from the beginning 85 homol = frag.aln_annotation['homology'] 86 # we should have at least 1 frame shift for splitting 87 assert homol.count('#') > 0 88 89 split_frags = [] 90 qstep = 1 if frag.query_strand >= 0 else -1 91 hstep = 1 if frag.hit_strand >= 0 else -1 92 qpos = min(frag.query_range) if qstep >= 0 else max(frag.query_range) 93 hpos = min(frag.hit_range) if qstep >= 0 else max(frag.hit_range) 94 abs_pos = 0 95 # split according to hit, then query 96 while homol: 97 98 try: 99 shifts = re.search(_RE_SHIFTS, homol).group(1) 100 s_start = homol.find(shifts) 101 s_stop = s_start + len(shifts) 102 split = frag[abs_pos:abs_pos+s_start] 103 except AttributeError: # no '#' in homol, i.e. last frag 104 shifts = '' 105 s_start = 0 106 s_stop = len(homol) 107 split = frag[abs_pos:] 108 109 # coordinates for the split strand 110 qstart, hstart = qpos, hpos 111 qpos += (len(split) - sum(str(split.query.seq).count(x) 112 for x in ('-', '<', '>'))) * qstep 113 hpos += (len(split) - sum(str(split.hit.seq).count(x) 114 for x in ('-', '<', '>'))) * hstep 115 116 split.hit_start = min(hstart, hpos) 117 split.query_start = min(qstart, qpos) 118 split.hit_end = max(hstart, hpos) 119 split.query_end = max(qstart, qpos) 120 121 # account for frameshift length 122 abs_slice = slice(abs_pos+s_start, abs_pos+s_stop) 123 if len(frag.aln_annotation) == 2: 124 seqs = (str(frag[abs_slice].query.seq), 125 str(frag[abs_slice].hit.seq)) 126 elif len(frag.aln_annotation) == 3: 127 seqs = (frag[abs_slice].aln_annotation['query_annotation'], 128 frag[abs_slice].aln_annotation['hit_annotation'],) 129 if '#' in seqs[0]: 130 qpos += len(shifts) * qstep 131 elif '#' in seqs[1]: 132 hpos += len(shifts) * hstep 133 134 # set frame 135 _set_frame(split) 136 split_frags.append(split) 137 # set homology string and absolute position for the next loop 138 homol = homol[s_stop:] 139 abs_pos += s_stop 140 141 return split_frags
142 143
144 -def _create_hsp(hid, qid, hspd):
145 """Returns a list of HSP objects from the given parsed HSP values.""" 146 frags = [] 147 # we are iterating over query_ranges, but hit_ranges works just as well 148 for idx, qcoords in enumerate(hspd['query_ranges']): 149 # get sequences, create object 150 hseqlist = hspd.get('hit') 151 hseq = '' if hseqlist is None else hseqlist[idx] 152 qseqlist = hspd.get('query') 153 qseq = '' if qseqlist is None else qseqlist[idx] 154 frag = HSPFragment(hid, qid, hit=hseq, query=qseq) 155 # coordinates 156 frag.query_start = qcoords[0] 157 frag.query_end = qcoords[1] 158 frag.hit_start = hspd['hit_ranges'][idx][0] 159 frag.hit_end = hspd['hit_ranges'][idx][1] 160 # alignment annotation 161 try: 162 aln_annot = hspd.get('aln_annotation', {}) 163 for key, value in aln_annot.items(): 164 frag.aln_annotation[key] = value[idx] 165 except IndexError: 166 pass 167 # strands 168 frag.query_strand = hspd['query_strand'] 169 frag.hit_strand = hspd['hit_strand'] 170 # and append the hsp object to the list 171 if frag.aln_annotation.get('homology') is not None: 172 if '#' in frag.aln_annotation['homology']: 173 frags.extend(_split_fragment(frag)) 174 continue 175 # try to set frame if there are translation in the alignment 176 if len(frag.aln_annotation) > 1 or \ 177 frag.query_strand == 0 or \ 178 ('vulgar_comp' in hspd and re.search(_RE_TRANS, hspd['vulgar_comp'])): 179 _set_frame(frag) 180 181 frags.append(frag) 182 183 # if the query is protein, we need to change the hit and query sequences 184 # from three-letter amino acid codes to one letter, and adjust their 185 # coordinates accordingly 186 if len(frags[0].aln_annotation) == 2: # 2 annotations == protein query 187 frags = _adjust_aa_seq(frags) 188 189 hsp = HSP(frags) 190 # set hsp-specific attributes 191 for attr in ('score', 'hit_split_codons', 'query_split_codons', 192 'model', 'vulgar_comp', 'cigar_comp', 'alphabet'): 193 if attr in hspd: 194 setattr(hsp, attr, hspd[attr]) 195 196 return hsp
197 198
199 -def _parse_hit_or_query_line(line):
200 """Parse the 'Query:' line of exonerate alignment outputs.""" 201 try: 202 mark, id, desc = line.split(' ', 2) 203 except ValueError: # no desc 204 mark, id = line.split(' ', 1) 205 desc = '' 206 207 return id, desc
208 209
210 -class _BaseExonerateParser(object):
211 212 """Abstract iterator for exonerate format.""" 213 214 _ALN_MARK = None 215
216 - def __init__(self, handle):
217 self.handle = handle 218 self.has_c4_alignment = False
219
220 - def __iter__(self):
221 # read line until the first alignment block or cigar/vulgar lines 222 while True: 223 self.line = self.handle.readline() 224 # flag for human-readable alignment block 225 if self.line.startswith('C4 Alignment:') and not \ 226 self.has_c4_alignment: 227 self.has_c4_alignment = True 228 if self.line.startswith('C4 Alignment:') or \ 229 self.line.startswith('vulgar:') or \ 230 self.line.startswith('cigar:'): 231 break 232 elif not self.line or self.line.startswith('-- completed '): 233 raise StopIteration 234 235 for qresult in self._parse_qresult(): 236 qresult.program = 'exonerate' 237 # HACK: so that all descriptions are set 238 qresult.description = qresult.description 239 for hit in qresult: 240 hit.description = hit.description 241 yield qresult
242
243 - def read_until(self, bool_func):
244 """Reads the file handle until the given bool function returns True.""" 245 while True: 246 if not self.line or bool_func(self.line): 247 return 248 else: 249 self.line = self.handle.readline()
250
251 - def parse_alignment_block(self, header):
252 raise NotImplementedError("Subclass must implement this")
253
254 - def _parse_alignment_header(self):
255 # read all header lines and store them 256 aln_header = [] 257 # header is everything before the first empty line 258 while self.line.strip(): 259 aln_header.append(self.line.strip()) 260 self.line = self.handle.readline() 261 # then parse them 262 qresult, hit, hsp = {}, {}, {} 263 for line in aln_header: 264 # query line 265 if line.startswith('Query:'): 266 qresult['id'], qresult['description'] = \ 267 _parse_hit_or_query_line(line) 268 # target line 269 elif line.startswith('Target:'): 270 hit['id'], hit['description'] = \ 271 _parse_hit_or_query_line(line) 272 # model line 273 elif line.startswith('Model:'): 274 qresult['model'] = line.split(' ', 1)[1] 275 # score line 276 elif line.startswith('Raw score:'): 277 hsp['score'] = line.split(' ', 2)[2] 278 # query range line 279 elif line.startswith('Query range:'): 280 # line is always 'Query range: \d+ -> \d+', so we can pluck 281 # the numbers directly 282 hsp['query_start'], hsp['query_end'] = line.split(' ', 4)[2:5:2] 283 # hit range line 284 elif line.startswith('Target range:'): 285 # same logic with query range 286 hsp['hit_start'], hsp['hit_end'] = line.split(' ', 4)[2:5:2] 287 288 # determine strand 289 if qresult['description'].endswith(':[revcomp]'): 290 hsp['query_strand'] = '-' 291 qresult['description'] = qresult['description'].replace(':[revcomp]', '') 292 elif 'protein2' in qresult['model']: 293 hsp['query_strand'] = '.' 294 else: 295 hsp['query_strand'] = '+' 296 if hit['description'].endswith(':[revcomp]'): 297 hsp['hit_strand'] = '-' 298 hit['description'] = hit['description'].replace(':[revcomp]', '') 299 else: 300 hsp['hit_strand'] = '+' 301 302 # NOTE: we haven't processed the coordinates types 303 # and the strands are not yet Biopython's standard (1 / -1 / 0) 304 # since it's easier if we do the conversion later 305 306 return {'qresult': qresult, 'hit': hit, 'hsp': hsp}
307
308 - def _parse_qresult(self):
309 # state values 310 state_EOF = 0 311 state_QRES_NEW = 1 312 state_QRES_SAME = 3 313 state_HIT_NEW = 2 314 state_HIT_SAME = 4 315 # initial dummies 316 qres_state, hit_state = None, None 317 file_state = None 318 prev_qid, prev_hid = None, None 319 cur, prev = None, None 320 hit_list, hsp_list = [], [] 321 # if the file has c4 alignments, use that as the alignment mark 322 if self.has_c4_alignment: 323 self._ALN_MARK = 'C4 Alignment:' 324 325 while True: 326 self.read_until(lambda line: line.startswith(self._ALN_MARK)) 327 if cur is not None: 328 prev = cur 329 prev_qid = cur_qid 330 prev_hid = cur_hid 331 # only parse the result row if it's not EOF 332 if self.line: 333 assert self.line.startswith(self._ALN_MARK), self.line 334 # create temp dicts for storing parsed values 335 header = {'qresult': {}, 'hit': {}, 'hsp': {}} 336 # if the file has c4 alignments, try to parse the header 337 if self.has_c4_alignment: 338 self.read_until(lambda line: 339 line.strip().startswith('Query:')) 340 header = self._parse_alignment_header() 341 # parse the block contents 342 cur = self.parse_alignment_block(header) 343 cur_qid = cur['qresult']['id'] 344 cur_hid = cur['hit']['id'] 345 elif not self.line or self.line.startswith('-- completed '): 346 file_state = state_EOF 347 cur_qid, cur_hid = None, None 348 349 # get the state of hit and qresult 350 if prev_qid != cur_qid: 351 qres_state = state_QRES_NEW 352 else: 353 qres_state = state_QRES_SAME 354 # new hits are hits with different ids or hits in a new query 355 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 356 hit_state = state_HIT_NEW 357 else: 358 hit_state = state_HIT_SAME 359 360 if prev is not None: 361 hsp = _create_hsp(prev_hid, prev_qid, prev['hsp']) 362 hsp_list.append(hsp) 363 364 if hit_state == state_HIT_NEW: 365 hit = Hit(hsp_list) 366 for attr, value in prev['hit'].items(): 367 setattr(hit, attr, value) 368 hit_list.append(hit) 369 hsp_list = [] 370 371 if qres_state == state_QRES_NEW or file_state == state_EOF: 372 qresult = QueryResult(id=prev_qid) 373 for hit in hit_list: 374 # not using append since Exonerate may separate the 375 # same hit if it has different strands 376 qresult.absorb(hit) 377 for attr, value in prev['qresult'].items(): 378 setattr(qresult, attr, value) 379 yield qresult 380 if file_state == state_EOF: 381 break 382 hit_list = [] 383 384 # only readline() here if we're not parsing C4 alignments 385 # C4 alignments readline() is handled by its parse_alignment_block 386 # function 387 if not self.has_c4_alignment: 388 self.line = self.handle.readline()
389 390
391 -class _BaseExonerateIndexer(SearchIndexer):
392 393 """Indexer class for Exonerate plain text.""" 394 395 _parser = None # should be defined by subclass 396 _query_mark = None # this one too 397
398 - def get_qresult_id(self, pos):
399 raise NotImplementedError("Should be defined by subclass")
400
401 - def __iter__(self):
402 """Iterates over the file handle; yields key, start offset, and length.""" 403 handle = self._handle 404 handle.seek(0) 405 qresult_key = None 406 407 while True: 408 start_offset = handle.tell() 409 line = handle.readline() 410 if line.startswith(self._query_mark): 411 if qresult_key is None: 412 qresult_key = self.get_qresult_id(start_offset) 413 qresult_offset = start_offset 414 else: 415 curr_key = self.get_qresult_id(start_offset) 416 if curr_key != qresult_key: 417 yield qresult_key, qresult_offset, \ 418 start_offset - qresult_offset 419 qresult_key = curr_key 420 qresult_offset = start_offset 421 handle.seek(qresult_offset) 422 elif not line: 423 yield qresult_key, qresult_offset, \ 424 start_offset - qresult_offset 425 break
426 427 428 # if not used as a module, run the doctest 429 if __name__ == "__main__": 430 from Bio._utils import run_doctest 431 run_doctest() 432