Package Bio :: Package SearchIO :: Package HmmerIO :: Module hmmer3_tab
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.HmmerIO.hmmer3_tab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for HMMER table output format.""" 
  7   
  8  from itertools import chain 
  9   
 10  from Bio._py3k import _as_bytes, _bytes_to_string 
 11  from Bio.Alphabet import generic_protein 
 12  from Bio.SearchIO._index import SearchIndexer 
 13  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 14   
 15   
 16  __all__ = ['Hmmer3TabParser', 'Hmmer3TabIndexer', 'Hmmer3TabWriter'] 
 17   
 18   
19 -class Hmmer3TabParser(object):
20 21 """Parser for the HMMER table format.""" 22
23 - def __init__(self, handle):
24 self.handle = handle 25 self.line = self.handle.readline()
26
27 - def __iter__(self):
28 header_mark = '#' 29 # read through the header if it exists 30 while self.line.startswith(header_mark): 31 self.line = self.handle.readline() 32 # if we have result rows, parse it 33 if self.line: 34 for qresult in self._parse_qresult(): 35 yield qresult
36
37 - def _parse_row(self):
38 """Returns a dictionary of parsed row values.""" 39 cols = [x for x in self.line.strip().split(' ') if x] 40 # if len(cols) > 19, we have extra description columns 41 # combine them all into one string in the 19th column 42 if len(cols) > 19: 43 cols[18] = ' '.join(cols[18:]) 44 # if it's < 19, we have no description columns, so use an empty string 45 # instead 46 elif len(cols) < 19: 47 cols.append('') 48 assert len(cols) == 19 49 50 # assign parsed column data into qresult, hit, and hsp dicts 51 qresult = {} 52 qresult['id'] = cols[2] # query name 53 qresult['acc'] = cols[3] # query accession 54 hit = {} 55 hit['id'] = cols[0] # target name 56 hit['acc'] = cols[1] # target accession 57 hit['evalue'] = float(cols[4]) # evalue (full sequence) 58 hit['bitscore'] = float(cols[5]) # score (full sequence) 59 hit['bias'] = float(cols[6]) # bias (full sequence) 60 hit['domain_exp_num'] = float(cols[10]) # exp 61 hit['region_num'] = int(cols[11]) # reg 62 hit['cluster_num'] = int(cols[12]) # clu 63 hit['overlap_num'] = int(cols[13]) # ov 64 hit['env_num'] = int(cols[14]) # env 65 hit['domain_obs_num'] = int(cols[15]) # dom 66 hit['domain_reported_num'] = int(cols[16]) # rep 67 hit['domain_included_num'] = int(cols[17]) # inc 68 hit['description'] = cols[18] # description of target 69 hsp = {} 70 hsp['evalue'] = float(cols[7]) # evalue (best 1 domain) 71 hsp['bitscore'] = float(cols[8]) # score (best 1 domain) 72 hsp['bias'] = float(cols[9]) # bias (best 1 domain) 73 # strand is always 0, since HMMER now only handles protein 74 frag = {} 75 frag['hit_strand'] = frag['query_strand'] = 0 76 frag['alphabet'] = generic_protein 77 78 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
79
80 - def _parse_qresult(self):
81 """Generator function that returns QueryResult objects.""" 82 # state values, determines what to do for each line 83 state_EOF = 0 84 state_QRES_NEW = 1 85 state_QRES_SAME = 3 86 # initial value dummies 87 qres_state = None 88 file_state = None 89 prev_qid = None 90 cur, prev = None, None 91 # container for Hit objects, used to create QueryResult 92 hit_list = [] 93 94 while True: 95 # store previous line's parsed values for all lines after the first 96 if cur is not None: 97 prev = cur 98 prev_qid = cur_qid 99 # only parse the result row if it's not EOF 100 if self.line: 101 cur = self._parse_row() 102 cur_qid = cur['qresult']['id'] 103 else: 104 file_state = state_EOF 105 # mock value for cur_qid, since we have nothing to parse 106 cur_qid = None 107 108 if prev_qid != cur_qid: 109 qres_state = state_QRES_NEW 110 else: 111 qres_state = state_QRES_SAME 112 113 if prev is not None: 114 # since domain tab formats only have 1 Hit per line 115 # we always create HSPFragment, HSP, and Hit per line 116 prev_hid = prev['hit']['id'] 117 118 # create fragment and HSP and set their attributes 119 frag = HSPFragment(prev_hid, prev_qid) 120 for attr, value in prev['frag'].items(): 121 setattr(frag, attr, value) 122 hsp = HSP([frag]) 123 for attr, value in prev['hsp'].items(): 124 setattr(hsp, attr, value) 125 126 # create Hit and set its attributes 127 hit = Hit([hsp]) 128 for attr, value in prev['hit'].items(): 129 setattr(hit, attr, value) 130 hit_list.append(hit) 131 132 # create qresult and yield if we're at a new qresult or at EOF 133 if qres_state == state_QRES_NEW or file_state == state_EOF: 134 qresult = QueryResult(hit_list, prev_qid) 135 for attr, value in prev['qresult'].items(): 136 setattr(qresult, attr, value) 137 yield qresult 138 # if we're at EOF, break 139 if file_state == state_EOF: 140 break 141 hit_list = [] 142 143 self.line = self.handle.readline()
144 145
146 -class Hmmer3TabIndexer(SearchIndexer):
147 148 """Indexer class for HMMER table output.""" 149 150 _parser = Hmmer3TabParser 151 # denotes column location for query identifier 152 _query_id_idx = 2 153
154 - def __iter__(self):
155 """Iterates over the file handle; yields key, start offset, and length.""" 156 handle = self._handle 157 handle.seek(0) 158 query_id_idx = self._query_id_idx 159 qresult_key = None 160 header_mark = _as_bytes('#') 161 split_mark = _as_bytes(' ') 162 # set line with initial mock value, to emulate header 163 line = header_mark 164 165 # read through header 166 while line.startswith(header_mark): 167 start_offset = handle.tell() 168 line = handle.readline() 169 170 # and index the qresults 171 while True: 172 end_offset = handle.tell() 173 174 if not line: 175 break 176 177 cols = [x for x in line.strip().split(split_mark) if x] 178 if qresult_key is None: 179 qresult_key = cols[query_id_idx] 180 else: 181 curr_key = cols[query_id_idx] 182 183 if curr_key != qresult_key: 184 adj_end = end_offset - len(line) 185 yield _bytes_to_string(qresult_key), start_offset, \ 186 adj_end - start_offset 187 qresult_key = curr_key 188 start_offset = adj_end 189 190 line = handle.readline() 191 if not line: 192 yield _bytes_to_string(qresult_key), start_offset, \ 193 end_offset - start_offset 194 break
195
196 - def get_raw(self, offset):
197 """Returns the raw string of a QueryResult object from the given offset.""" 198 handle = self._handle 199 handle.seek(offset) 200 query_id_idx = self._query_id_idx 201 qresult_key = None 202 qresult_raw = _as_bytes('') 203 split_mark = _as_bytes(' ') 204 205 while True: 206 line = handle.readline() 207 if not line: 208 break 209 cols = [x for x in line.strip().split(split_mark) if x] 210 if qresult_key is None: 211 qresult_key = cols[query_id_idx] 212 else: 213 curr_key = cols[query_id_idx] 214 if curr_key != qresult_key: 215 break 216 qresult_raw += line 217 218 return qresult_raw
219 220
221 -class Hmmer3TabWriter(object):
222 223 """Writer for hmmer3-tab output format.""" 224
225 - def __init__(self, handle):
226 self.handle = handle
227
228 - def write_file(self, qresults):
229 """Writes to the handle. 230 231 Returns a tuple of how many QueryResult, Hit, and HSP objects were written. 232 233 """ 234 handle = self.handle 235 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 236 237 try: 238 first_qresult = next(qresults) 239 except StopIteration: 240 handle.write(self._build_header()) 241 else: 242 # write header 243 handle.write(self._build_header(first_qresult)) 244 # and then the qresults 245 for qresult in chain([first_qresult], qresults): 246 if qresult: 247 handle.write(self._build_row(qresult)) 248 qresult_counter += 1 249 hit_counter += len(qresult) 250 hsp_counter += sum(len(hit) for hit in qresult) 251 frag_counter += sum(len(hit.fragments) for hit in qresult) 252 253 return qresult_counter, hit_counter, hsp_counter, frag_counter
254
255 - def _build_header(self, first_qresult=None):
256 """Returns the header string of a HMMER table output.""" 257 258 # calculate whitespace required 259 # adapted from HMMER's source: src/p7_tophits.c#L1083 260 if first_qresult is not None: 261 #qnamew = max(20, len(first_qresult.id)) 262 qnamew = 20 # why doesn't the above work? 263 tnamew = max(20, len(first_qresult[0].id)) 264 qaccw = max(10, len(first_qresult.acc)) 265 taccw = max(10, len(first_qresult[0].acc)) 266 else: 267 qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10 268 269 header = "#%*s %22s %22s %33s\n" % \ 270 (tnamew + qnamew + taccw + qaccw + 2, "", 271 "--- full sequence ----", "--- best 1 domain ----", 272 "--- domain number estimation ----") 273 header += "#%-*s %-*s %-*s %-*s %9s %6s %5s %9s %6s %5s %5s %3s " \ 274 "%3s %3s %3s %3s %3s %3s %s\n" % (tnamew-1, " target name", 275 taccw, "accession", qnamew, "query name", qaccw, 276 "accession", " E-value", " score", " bias", 277 " E-value", " score", " bias", "exp", 278 "reg", "clu", " ov", "env", "dom", "rep", 279 "inc", "description of target") 280 header += "#%*s %*s %*s %*s %9s %6s %5s %9s %6s %5s %5s %3s %3s " \ 281 "%3s %3s %3s %3s %3s %s\n" % (tnamew-1, "-------------------", 282 taccw, "----------", qnamew, "--------------------", qaccw, 283 "----------", "---------", "------", "-----", "---------", 284 "------", "-----", "---", "---", "---", "---", "---", "---", 285 "---", "---", "---------------------") 286 287 return header
288
289 - def _build_row(self, qresult):
290 """Returns a string or one row or more of the QueryResult object.""" 291 rows = '' 292 293 # calculate whitespace required 294 # adapted from HMMER's source: src/p7_tophits.c#L1083 295 qnamew = max(20, len(qresult.id)) 296 tnamew = max(20, len(qresult[0].id)) 297 qaccw = max(10, len(qresult.acc)) 298 taccw = max(10, len(qresult[0].acc)) 299 300 for hit in qresult: 301 rows += "%-*s %-*s %-*s %-*s %9.2g %6.1f %5.1f %9.2g %6.1f %5.1f " \ 302 "%5.1f %3d %3d %3d %3d %3d %3d %3d %s\n" % (tnamew, hit.id, taccw, 303 hit.acc, qnamew, qresult.id, qaccw, qresult.acc, hit.evalue, 304 hit.bitscore, hit.bias, hit.hsps[0].evalue, hit.hsps[0].bitscore, 305 hit.hsps[0].bias, hit.domain_exp_num, hit.region_num, hit.cluster_num, 306 hit.overlap_num, hit.env_num, hit.domain_obs_num, 307 hit.domain_reported_num, hit.domain_included_num, hit.description) 308 309 return rows
310 311 312 # if not used as a module, run the doctest 313 if __name__ == "__main__": 314 from Bio._utils import run_doctest 315 run_doctest() 316