Package Bio :: Package SearchIO :: Package HmmerIO :: Module hmmer3_domtab
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.HmmerIO.hmmer3_domtab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for HMMER domain table output format.""" 
  7   
  8  from itertools import chain 
  9   
 10  from Bio.Alphabet import generic_protein 
 11  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 12   
 13  from .hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer 
 14   
 15   
 16  __docformat__ = "restructuredtext en" 
 17   
 18   
19 -class Hmmer3DomtabParser(Hmmer3TabParser):
20 21 """Base hmmer3-domtab iterator.""" 22
23 - def _parse_row(self):
24 """Returns a dictionary of parsed row values.""" 25 assert self.line 26 cols = [x for x in self.line.strip().split(' ') if x] 27 # if len(cols) > 23, we have extra description columns 28 # combine them all into one string in the 19th column 29 if len(cols) > 23: 30 cols[22] = ' '.join(cols[22:]) 31 elif len(cols) < 23: 32 cols.append('') 33 assert len(cols) == 23 34 35 # assign parsed column data into qresult, hit, and hsp dicts 36 qresult = {} 37 qresult['id'] = cols[3] # query name 38 qresult['accession'] = cols[4] # query accession 39 qresult['seq_len'] = int(cols[5]) # qlen 40 hit = {} 41 hit['id'] = cols[0] # target name 42 hit['accession'] = cols[1] # target accession 43 hit['seq_len'] = int(cols[2]) # tlen 44 hit['evalue'] = float(cols[6]) # evalue 45 hit['bitscore'] = float(cols[7]) # score 46 hit['bias'] = float(cols[8]) # bias 47 hit['description'] = cols[22] # description of target 48 hsp = {} 49 hsp['domain_index'] = int(cols[9]) # # (domain number) 50 # not parsing cols[10] since it's basically len(hit) 51 hsp['evalue_cond'] = float(cols[11]) # c-evalue 52 hsp['evalue'] = float(cols[12]) # i-evalue 53 hsp['bitscore'] = float(cols[13]) # score 54 hsp['bias'] = float(cols[14]) # bias 55 hsp['env_start'] = int(cols[19]) - 1 # env from 56 hsp['env_end'] = int(cols[20]) # env to 57 hsp['acc_avg'] = float(cols[21]) # acc 58 frag = {} 59 # strand is always 0, since HMMER now only handles protein 60 frag['hit_strand'] = frag['query_strand'] = 0 61 frag['hit_start'] = int(cols[15]) - 1 # hmm from 62 frag['hit_end'] = int(cols[16]) # hmm to 63 frag['query_start'] = int(cols[17]) - 1 # ali from 64 frag['query_end'] = int(cols[18]) # ali to 65 # HMMER alphabets are always protein 66 frag['alphabet'] = generic_protein 67 68 # switch hmm<-->ali coordinates if hmm is not hit 69 if not self.hmm_as_hit: 70 frag['hit_end'], frag['query_end'] = \ 71 frag['query_end'], frag['hit_end'] 72 frag['hit_start'], frag['query_start'] = \ 73 frag['query_start'], frag['hit_start'] 74 75 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
76
77 - def _parse_qresult(self):
78 """Generator function that returns QueryResult objects.""" 79 # state values, determines what to do for each line 80 state_EOF = 0 81 state_QRES_NEW = 1 82 state_QRES_SAME = 3 83 state_HIT_NEW = 2 84 state_HIT_SAME = 4 85 # dummies for initial states 86 qres_state = None 87 hit_state = None 88 file_state = None 89 # dummies for initial id caches 90 prev_qid = None 91 prev_hid = None 92 # dummies for initial parsed value containers 93 cur, prev = None, None 94 hit_list, hsp_list = [], [] 95 96 while True: 97 # store previous line's parsed values, for every line after the 1st 98 if cur is not None: 99 prev = cur 100 prev_qid = cur_qid 101 prev_hid = cur_hid 102 # only parse the line if it's not EOF 103 if self.line and not self.line.startswith('#'): 104 cur = self._parse_row() 105 cur_qid = cur['qresult']['id'] 106 cur_hid = cur['hit']['id'] 107 else: 108 file_state = state_EOF 109 # mock ID values since the line is empty 110 cur_qid, cur_hid = None, None 111 112 # get the state of hit and qresult 113 if prev_qid != cur_qid: 114 qres_state = state_QRES_NEW 115 else: 116 qres_state = state_QRES_SAME 117 # new hits are hits with different ids or hits in a new qresult 118 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 119 hit_state = state_HIT_NEW 120 else: 121 hit_state = state_HIT_SAME 122 123 # start creating objects after the first line (i.e. prev is filled) 124 if prev is not None: 125 # each line is basically an HSP with one HSPFragment 126 frag = HSPFragment(prev_hid, prev_qid) 127 for attr, value in prev['frag'].items(): 128 setattr(frag, attr, value) 129 hsp = HSP([frag]) 130 for attr, value in prev['hsp'].items(): 131 setattr(hsp, attr, value) 132 hsp_list.append(hsp) 133 134 # create hit object when we've finished parsing all its hsps 135 # i.e. when hit state is state_HIT_NEW 136 if hit_state == state_HIT_NEW: 137 hit = Hit(hsp_list) 138 for attr, value in prev['hit'].items(): 139 setattr(hit, attr, value) 140 hit_list.append(hit) 141 hsp_list = [] 142 143 # create qresult and yield if we're at a new qresult or EOF 144 if qres_state == state_QRES_NEW or file_state == state_EOF: 145 qresult = QueryResult(hit_list, prev_qid) 146 for attr, value in prev['qresult'].items(): 147 setattr(qresult, attr, value) 148 yield qresult 149 # if current line is EOF, break 150 if file_state == state_EOF: 151 break 152 hit_list = [] 153 154 self.line = self.handle.readline()
155 156
157 -class Hmmer3DomtabHmmhitParser(Hmmer3DomtabParser):
158 159 """Parser for the HMMER domain table format that assumes HMM profile 160 coordinates are hit coordinates.""" 161 162 hmm_as_hit = True
163 164
165 -class Hmmer3DomtabHmmqueryParser(Hmmer3DomtabParser):
166 167 """Parser for the HMMER domain table format that assumes HMM profile 168 coordinates are query coordinates.""" 169 170 hmm_as_hit = False
171 172
173 -class Hmmer3DomtabHmmhitIndexer(Hmmer3TabIndexer):
174 175 """Indexer class for HMMER domain table output that assumes HMM profile 176 coordinates are hit coordinates.""" 177 178 _parser = Hmmer3DomtabHmmhitParser 179 _query_id_idx = 3
180 181
182 -class Hmmer3DomtabHmmqueryIndexer(Hmmer3TabIndexer):
183 184 """Indexer class for HMMER domain table output that assumes HMM profile 185 coordinates are query coordinates.""" 186 187 _parser = Hmmer3DomtabHmmqueryParser 188 _query_id_idx = 3
189 190
191 -class Hmmer3DomtabHmmhitWriter(object):
192 193 """Writer for hmmer3-domtab output format which writes hit coordinates 194 as HMM profile coordinates.""" 195 196 hmm_as_hit = True 197
198 - def __init__(self, handle):
199 self.handle = handle
200
201 - def write_file(self, qresults):
202 """Writes to the handle. 203 204 Returns a tuple of how many QueryResult, Hit, and HSP objects were written. 205 206 """ 207 handle = self.handle 208 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 209 210 try: 211 first_qresult = next(qresults) 212 except StopIteration: 213 handle.write(self._build_header()) 214 else: 215 # write header 216 handle.write(self._build_header(first_qresult)) 217 # and then the qresults 218 for qresult in chain([first_qresult], qresults): 219 if qresult: 220 handle.write(self._build_row(qresult)) 221 qresult_counter += 1 222 hit_counter += len(qresult) 223 hsp_counter += sum(len(hit) for hit in qresult) 224 frag_counter += sum(len(hit.fragments) for hit in qresult) 225 226 return qresult_counter, hit_counter, hsp_counter, frag_counter
227
228 - def _build_header(self, first_qresult=None):
229 """Returns the header string of a domain HMMER table output.""" 230 231 # calculate whitespace required 232 # adapted from HMMER's source: src/p7_tophits.c#L1157 233 if first_qresult: 234 # qnamew = max(20, len(first_qresult.id)) 235 qnamew = 20 236 tnamew = max(20, len(first_qresult[0].id)) 237 try: 238 qaccw = max(10, len(first_qresult.acc)) 239 taccw = max(10, len(first_qresult[0].acc)) 240 except AttributeError: 241 qaccw, taccw = 10, 10 242 else: 243 qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10 244 245 header = "#%*s %22s %40s %11s %11s %11s\n" % \ 246 (tnamew+qnamew-1+15+taccw+qaccw, "", "--- full sequence ---", 247 "-------------- this domain -------------", "hmm coord", 248 "ali coord", "env coord") 249 header += "#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s " \ 250 "%9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew-1, 251 " target name", taccw, "accession", "tlen", qnamew, 252 "query name", qaccw, "accession", "qlen", "E-value", "score", 253 "bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias", 254 "from", "to", "from", "to", "from", "to", "acc", 255 "description of target") 256 header += "#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s " \ 257 "%6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew-1, 258 "-------------------", taccw, "----------", "-----", 259 qnamew, "--------------------", qaccw, "----------", 260 "-----", "---------", "------", "-----", "---", "---", 261 "---------", "---------", "------", "-----", "-----", "-----", 262 "-----", "-----", "-----", "-----", "----", 263 "---------------------") 264 265 return header
266
267 - def _build_row(self, qresult):
268 """Returns a string or one row or more of the QueryResult object.""" 269 rows = '' 270 271 # calculate whitespace required 272 # adapted from HMMER's source: src/p7_tophits.c#L1083 273 qnamew = max(20, len(qresult.id)) 274 tnamew = max(20, len(qresult[0].id)) 275 try: 276 qaccw = max(10, len(qresult.accession)) 277 taccw = max(10, len(qresult[0].accession)) 278 qresult_acc = qresult.accession 279 except AttributeError: 280 qaccw, taccw = 10, 10 281 qresult_acc = '-' 282 283 for hit in qresult: 284 285 # try to get hit accession 286 try: 287 hit_acc = hit.accession 288 except AttributeError: 289 hit_acc = '-' 290 291 for hsp in hit.hsps: 292 if self.hmm_as_hit: 293 hmm_to = hsp.hit_end 294 hmm_from = hsp.hit_start + 1 295 ali_to = hsp.query_end 296 ali_from = hsp.query_start + 1 297 else: 298 hmm_to = hsp.query_end 299 hmm_from = hsp.query_start + 1 300 ali_to = hsp.hit_end 301 ali_from = hsp.hit_start + 1 302 303 rows += "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d %3d" \ 304 " %9.2g %9.2g %6.1f %5.1f %5d %5d %5ld %5ld %5d %5d %4.2f %s\n" % \ 305 (tnamew, hit.id, taccw, hit_acc, hit.seq_len, qnamew, qresult.id, 306 qaccw, qresult_acc, qresult.seq_len, hit.evalue, hit.bitscore, 307 hit.bias, hsp.domain_index, len(hit.hsps), hsp.evalue_cond, hsp.evalue, 308 hsp.bitscore, hsp.bias, hmm_from, hmm_to, ali_from, ali_to, 309 hsp.env_start + 1, hsp.env_end, hsp.acc_avg, hit.description) 310 311 return rows
312 313
314 -class Hmmer3DomtabHmmqueryWriter(Hmmer3DomtabHmmhitWriter):
315 316 """Writer for hmmer3-domtab output format which writes query coordinates 317 as HMM profile coordinates.""" 318 319 hmm_as_hit = False
320 321 322 # if not used as a module, run the doctest 323 if __name__ == "__main__": 324 from Bio._utils import run_doctest 325 run_doctest() 326