
Source Code for Module Bio.SearchIO.BlastIO.blast_tab
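The classes in this module are normally reached through the top-level Bio.SearchIO
functions rather than instantiated directly. As a rough usage sketch (the file names
'results.tab', 'results7.tab', and 'copy.tab' are hypothetical), plain tabular output
corresponds to BLAST+ -outfmt 6 and commented output to -outfmt 7:

    from Bio import SearchIO

    # plain tabular output: the 'blast-tab' format name selects BlastTabParser
    for qresult in SearchIO.parse('results.tab', 'blast-tab'):
        for hit in qresult:
            print(qresult.id, hit.id)

    # commented output carries its own '# Fields:' lines; pass comments=True
    qresults = list(SearchIO.parse('results7.tab', 'blast-tab', comments=True))

    # random access by query ID (BlastTabIndexer) and re-writing (BlastTabWriter)
    indexed = SearchIO.index('results.tab', 'blast-tab')
    SearchIO.write(qresults, 'copy.tab', 'blast-tab', comments=True)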

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for BLAST+ tab output format, with or without comments.""" 
  7   
  8  import re 
  9   
 10  from Bio._py3k import _as_bytes, _bytes_to_string 
 11  from Bio._py3k import basestring 
 12   
 13  from Bio.SearchIO._index import SearchIndexer 
 14  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 15   
 16   
 17  __all__ = ['BlastTabIndexer', 'BlastTabParser', 'BlastTabWriter'] 
 18   
 19  __docformat__ = "restructuredtext en" 
 20   
 21   
 22  # longname-shortname map 
 23  # maps the column names shown in a commented output to its short name 
 24  # (the one used in the command line) 
 25  _LONG_SHORT_MAP = { 
 26      'query id': 'qseqid', 
 27      'query acc.': 'qacc', 
 28      'query acc.ver': 'qaccver', 
 29      'query length': 'qlen', 
 30      'subject id': 'sseqid', 
 31      'subject acc.': 'sacc', 
 32      'subject acc.ver': 'saccver', 
 33      'subject length': 'slen', 
 34      'alignment length': 'length', 
 35      'bit score': 'bitscore', 
 36      'score': 'score', 
 37      'evalue': 'evalue', 
 38      'identical': 'nident', 
 39      '% identity': 'pident', 
 40      'positives': 'positive', 
 41      '% positives': 'ppos', 
 42      'mismatches': 'mismatch', 
 43      'gaps': 'gaps', 
 44      'q. start': 'qstart', 
 45      'q. end': 'qend', 
 46      's. start': 'sstart', 
 47      's. end': 'send', 
 48      'query frame': 'qframe', 
 49      'sbjct frame': 'sframe', 
 50      'query/sbjct frames': 'frames', 
 51      'query seq': 'qseq', 
 52      'subject seq': 'sseq', 
 53      'gap opens': 'gapopen', 
 54      'query gi': 'qgi', 
 55      'subject ids': 'sallseqid', 
 56      'subject gi': 'sgi', 
 57      'subject gis': 'sallgi', 
 58      'BTOP': 'btop', 
 59      'subject accs.': 'sallacc', 
 60      'subject tax ids': 'staxids', 
 61      'subject sci names': 'sscinames', 
 62      'subject com names': 'scomnames', 
 63      'subject blast names': 'sblastnames', 
 64      'subject super kingdoms': 'sskingdoms', 
 65      'subject title': 'stitle', 
 66      'subject titles': 'salltitles', 
 67      'subject strand': 'sstrand', 
 68      '% subject coverage': 'qcovs', 
 69      '% hsp coverage': 'qcovhsp', 
 70  } 
 71   
 72  # function to create a list from semicolon-delimited string 
 73  # used in BlastTabParser._parse_result_row 
 74  _list_semicol = lambda x: x.split(';') 
 75  _list_diamond = lambda x: x.split('<>') 
 76  # column to class attribute map 
 77  _COLUMN_QRESULT = { 
 78      'qseqid': ('id', str), 
 79      'qacc': ('accession', str), 
 80      'qaccver': ('accession_version', str), 
 81      'qlen': ('seq_len', int), 
 82      'qgi': ('gi', str), 
 83  } 
 84  _COLUMN_HIT = { 
 85      'sseqid': ('id', str), 
 86      'sallseqid': ('id_all', _list_semicol), 
 87      'sacc': ('accession', str), 
 88      'saccver': ('accession_version', str), 
 89      'sallacc': ('accession_all', _list_semicol), 
 90      'sgi': ('gi', str), 
 91      'sallgi': ('gi_all', str), 
 92      'slen': ('seq_len', int), 
 93      'staxids': ('tax_ids', _list_semicol), 
 94      'sscinames': ('sci_names', _list_semicol), 
 95      'scomnames': ('com_names', _list_semicol), 
 96      'sblastnames': ('blast_names', _list_semicol), 
 97      'sskingdoms': ('super_kingdoms', _list_semicol), 
 98      'stitle': ('title', str), 
 99      'salltitles': ('title_all', _list_diamond), 
100      # set strand as HSP property? 
101      'sstrand': ('strand', str), 
102      'qcovs': ('query_coverage', float), 
103  } 
104  _COLUMN_HSP = { 
105      'bitscore': ('bitscore', float), 
106      'score': ('bitscore_raw', int), 
107      'evalue': ('evalue', float), 
108      'nident': ('ident_num', int), 
109      'pident': ('ident_pct', float), 
110      'positive': ('pos_num', int), 
111      'ppos': ('pos_pct', float), 
112      'mismatch': ('mismatch_num', int), 
113      'gaps': ('gap_num', int), 
114      'gapopen': ('gapopen_num', int), 
115      'btop': ('btop', str), 
116      'qcovhsp': ('query_coverage', float), 
117  } 
118  _COLUMN_FRAG = { 
119      'length': ('aln_span', int), 
120      'qstart': ('query_start', int), 
121      'qend': ('query_end', int), 
122      'sstart': ('hit_start', int), 
123      'send': ('hit_end', int), 
124      'qframe': ('query_frame', int), 
125      'sframe': ('hit_frame', int), 
126      'frames': ('frames', str), 
127      'qseq': ('query', str), 
128      'sseq': ('hit', str), 
129  } 
130  _SUPPORTED_FIELDS = set(list(_COLUMN_QRESULT) + list(_COLUMN_HIT) + 
131                          list(_COLUMN_HSP) + list(_COLUMN_FRAG)) 
132   
133  # column order in the non-commented tabular output variant 
134  # values must be keys inside the column-attribute maps above 
135  _DEFAULT_FIELDS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 
136          'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'] 
137  # one field from each of the following sets must exist in order for the 
138  # parser to work 
139  _MIN_QUERY_FIELDS = set(['qseqid', 'qacc', 'qaccver']) 
140  _MIN_HIT_FIELDS = set(['sseqid', 'sacc', 'saccver', 'sallseqid']) 
141   
142  # simple function to create BLAST HSP attributes that may be computed if 
143  # certain other attributes are present 
144  # This was previously implemented in the HSP objects in the old model 
145   
146  _RE_GAPOPEN = re.compile(r'\w-') 
147   
148   
149  def _compute_gapopen_num(hsp):
150      """Returns the number of gap openings in the given HSP."""
151      gapopen = 0
152      for seq_type in ('query', 'hit'):
153          seq = str(getattr(hsp, seq_type).seq)
154          gapopen += len(re.findall(_RE_GAPOPEN, seq))
155      return gapopen
156  
157  
158  def _augment_blast_hsp(hsp, attr):
159      """Calculates the given HSP attribute, for writing."""
160      if attr == 'aln_span':
161          # aln_span is number of identical matches + mismatches + gaps
162          func = lambda hsp: hsp.ident_num + hsp.mismatch_num + hsp.gap_num
163      # ident and gap will require the num values to be computed first
164      elif attr.startswith('ident'):
165          func = lambda hsp: hsp.aln_span - hsp.mismatch_num - hsp.gap_num
166      elif attr.startswith('gap'):
167          func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.mismatch_num
168      elif attr == 'mismatch_num':
169          func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.gap_num
170      elif attr == 'gapopen_num':
171          if not hasattr(hsp, 'query') or not hasattr(hsp, 'hit'):
172              # mock function so that the except clause below is triggered
173              # as both the query and hit are required to compute gapopen
174              def mock(hsp):
175                  raise AttributeError
176              func = mock
177          else:
178              func = _compute_gapopen_num
179  
180      # set the num values
181      # requires the endswith check, since we only want to set 'num' or 'span'
182      # attributes here
183      if not hasattr(hsp, attr) and not attr.endswith('_pct'):
184          value = func(hsp)
185          setattr(hsp, attr, value)
186  
187      # if the attr is a percent value, calculate it
188      if attr == 'ident_pct':
189          func2 = lambda hsp: hsp.ident_num / float(hsp.aln_span) * 100
190      elif attr == 'pos_pct':
191          func2 = lambda hsp: hsp.pos_num / float(hsp.aln_span) * 100
192      elif attr == 'gap_pct':
193          func2 = lambda hsp: hsp.gap_num / float(hsp.aln_span) * 100
194      else:
195          func2 = None
196  
197      # set the pct values
198      if func2 is not None:
199          value = func2(hsp)
200          setattr(hsp, attr, value)
201  
202  
203  class BlastTabParser(object):
204  
205      """Parser for the BLAST tabular format."""
206  
207      def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
208          self.handle = handle
209          self.has_comments = comments
210          self.fields = self._prep_fields(fields)
211          self.line = self.handle.readline().strip()
212  
213      def __iter__(self):
214          # stop iteration if file has no lines
215          if not self.line:
216              raise StopIteration
217          # determine which iterator to use
218          elif self.has_comments:
219              iterfunc = self._parse_commented_qresult
220          else:
221              iterfunc = self._parse_qresult
222  
223          for qresult in iterfunc():
224              yield qresult
225  
226      def _prep_fields(self, fields):
227          """Validates and formats the given fields for use by the parser."""
228          # cast into list if fields is a space-separated string
229          if isinstance(fields, basestring):
230              fields = fields.strip().split(' ')
231          # blast allows 'std' as a proxy for the standard default lists
232          # we want to transform 'std' to its proper column names
233          if 'std' in fields:
234              idx = fields.index('std')
235              fields = fields[:idx] + _DEFAULT_FIELDS + fields[idx+1:]
236          # if set(fields) has a null intersection with minimum required
237          # fields for hit and query, raise an exception
238          if not set(fields).intersection(_MIN_QUERY_FIELDS) or \
239                  not set(fields).intersection(_MIN_HIT_FIELDS):
240              raise ValueError("Required query and/or hit ID field not found.")
241  
242          return fields
243  
244      def _parse_commented_qresult(self):
245          """Iterator returning `QueryResult` objects from a commented file."""
246          while True:
247              comments = self._parse_comments()
248              if comments:
249                  try:
250                      self.fields = comments['fields']
251                      # iterator for the query results
252                      qres_iter = self._parse_qresult()
253                  except KeyError:
254                      # no fields means the query has no results
255                      assert 'fields' not in comments
256                      # create an iterator returning one empty qresult
257                      # if the query has no results
258                      qres_iter = iter([QueryResult()])
259  
260                  for qresult in qres_iter:
261                      for key, value in comments.items():
262                          setattr(qresult, key, value)
263                      yield qresult
264  
265              else:
266                  break
267  
268      def _parse_comments(self):
269          """Returns a dictionary containing tab file comments."""
270          comments = {}
271          while True:
272              # parse program and version
273              # example: # BLASTX 2.2.26+
274              if 'BLAST' in self.line and 'processed' not in self.line:
275                  program_line = self.line[len(' #'):].split(' ')
276                  comments['program'] = program_line[0].lower()
277                  comments['version'] = program_line[1]
278              # parse query id and description (if available)
279              # example: # Query: gi|356995852 Mus musculus POU domain
280              elif 'Query' in self.line:
281                  query_line = self.line[len('# Query: '):].split(' ', 1)
282                  comments['id'] = query_line[0]
283                  if len(query_line) == 2:
284                      comments['description'] = query_line[1]
285              # parse target database
286              # example: # Database: db/minirefseq_protein
287              elif 'Database' in self.line:
288                  comments['target'] = self.line[len('# Database: '):]
289              # parse RID (from remote searches)
290              elif 'RID' in self.line:
291                  comments['rid'] = self.line[len('# RID: '):]
292              # parse column order, required for parsing the result lines
293              # example: # Fields: query id, query gi, query acc., query length
294              elif 'Fields' in self.line:
295                  comments['fields'] = self._parse_fields_line()
296              # if the line has these strings, it's either the end of a comment
297              # or the end of a file, so we return all the comments we've parsed
298              elif ' hits found' in self.line or 'processed' in self.line:
299                  self.line = self.handle.readline().strip()
300                  return comments
301  
302              self.line = self.handle.readline()
303  
304              if not self.line:
305                  return comments
306              else:
307                  self.line = self.line.strip()
308  
309      def _parse_fields_line(self):
310          """Returns a list of column short names from the 'Fields'
311          comment line."""
312          raw_field_str = self.line[len('# Fields: '):]
313          long_fields = raw_field_str.split(', ')
314          fields = [_LONG_SHORT_MAP[long_name] for long_name in long_fields]
315          return self._prep_fields(fields)
316  
317      def _parse_result_row(self):
318          """Returns a dictionary of parsed row values."""
319          fields = self.fields
320          columns = self.line.strip().split('\t')
321          assert len(fields) == len(columns), "Expected %i columns, found: " \
322              "%i" % (len(fields), len(columns))
323  
324          qresult, hit, hsp, frag = {}, {}, {}, {}
325          for idx, value in enumerate(columns):
326              sname = fields[idx]
327              # flag to check if any of the _COLUMNs contain sname
328              in_mapping = False
329              # iterate over each dict, mapping pair to determine
330              # attribute name and value of each column
331              for parsed_dict, mapping in (
332                      (qresult, _COLUMN_QRESULT),
333                      (hit, _COLUMN_HIT),
334                      (hsp, _COLUMN_HSP),
335                      (frag, _COLUMN_FRAG)):
336                  # process parsed value according to mapping
337                  if sname in mapping:
338                      attr_name, caster = mapping[sname]
339                      if caster is not str:
340                          value = caster(value)
341                      parsed_dict[attr_name] = value
342                      in_mapping = True
343              # make sure that any unhandled field is not supported
344              if not in_mapping:
345                  assert sname not in _SUPPORTED_FIELDS
346  
347          return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
348  
349      def _get_id(self, parsed):
350          """Returns the value used for a QueryResult or Hit ID from a parsed row."""
351          # use 'id', with 'id_all', 'accession' and 'accession_version'
352          # fallbacks; one of these must have a value since we've checked whether
353          # they exist or not when parsing the comments
354          id_cache = parsed.get('id')
355          if id_cache is None and 'id_all' in parsed:
356              id_cache = parsed.get('id_all')[0]
357          if id_cache is None:
358              id_cache = parsed.get('accession')
359          if id_cache is None:
360              id_cache = parsed.get('accession_version')
361  
362          return id_cache
363  
364      def _parse_qresult(self):
365          """Generator function that returns QueryResult objects."""
366          # state values, used to determine what to do with each line
367          state_EOF = 0
368          state_QRES_NEW = 1
369          state_QRES_SAME = 3
370          state_HIT_NEW = 2
371          state_HIT_SAME = 4
372          # dummies for initial states
373          qres_state = None
374          hit_state = None
375          file_state = None
376          # dummies for initial id caches
377          prev_qid = None
378          prev_hid = None
379          # dummies for initial parsed value containers
380          cur, prev = None, None
381          hit_list, hsp_list = [], []
382  
383          while True:
384              # store previous line's parsed values if we've passed the first line
385              if cur is not None:
386                  prev = cur
387                  prev_qid = cur_qid
388                  prev_hid = cur_hid
389              # only parse the line if it's not EOF or not a comment line
390              if self.line and not self.line.startswith('#'):
391                  cur = self._parse_result_row()
392                  cur_qid = self._get_id(cur['qresult'])
393                  cur_hid = self._get_id(cur['hit'])
394              else:
395                  file_state = state_EOF
396                  # mock values for cur_qid and cur_hid since the line is empty
397                  cur_qid, cur_hid = None, None
398  
399              # get the state of hit and qresult
400              if prev_qid != cur_qid:
401                  qres_state = state_QRES_NEW
402              else:
403                  qres_state = state_QRES_SAME
404              # new hits are hits with different id or hits in a new qresult
405              if prev_hid != cur_hid or qres_state == state_QRES_NEW:
406                  hit_state = state_HIT_NEW
407              else:
408                  hit_state = state_HIT_SAME
409  
410              # we're creating objects for the previously parsed line(s),
411              # so nothing is done in the first parsed line (prev == None)
412              if prev is not None:
413                  # every line is essentially an HSP with one fragment, so we
414                  # create both of these for every line
415                  frag = HSPFragment(prev_hid, prev_qid)
416                  for attr, value in prev['frag'].items():
417                      # adjust coordinates to Python range
418                      # NOTE: this requires both start and end coords to be
419                      # present, otherwise a KeyError will be raised.
420                      # Without this limitation, we might misleadingly set the
421                      # start / end coords
422                      for seq_type in ('query', 'hit'):
423                          if attr == seq_type + '_start':
424                              value = min(value,
425                                          prev['frag'][seq_type + '_end']) - 1
426                          elif attr == seq_type + '_end':
427                              value = max(value,
428                                          prev['frag'][seq_type + '_start'])
429                      setattr(frag, attr, value)
430                  # strand and frame setattr require the full parsed values
431                  # to be set first
432                  for seq_type in ('hit', 'query'):
433                      # try to set hit and query frame
434                      frame = self._get_frag_frame(frag, seq_type,
435                                                   prev['frag'])
436                      setattr(frag, '%s_frame' % seq_type, frame)
437                      # try to set hit and query strand
438                      strand = self._get_frag_strand(frag, seq_type,
439                                                     prev['frag'])
440                      setattr(frag, '%s_strand' % seq_type, strand)
441  
442                  hsp = HSP([frag])
443                  for attr, value in prev['hsp'].items():
444                      setattr(hsp, attr, value)
445                  hsp_list.append(hsp)
446  
447                  # create hit and append to temp hit container if hit_state
448                  # says we're not at the same hit or at a new query
449                  if hit_state == state_HIT_NEW:
450                      hit = Hit(hsp_list)
451                      for attr, value in prev['hit'].items():
452                          if attr != 'id_all':
453                              setattr(hit, attr, value)
454                          else:
455                              # not setting hit ID since it's already set from the
456                              # prev_hid above
457                              setattr(hit, '_id_alt', value[1:])
458                      hit_list.append(hit)
459                      hsp_list = []
460                  # create qresult and yield if we're at a new qresult or EOF
461                  if qres_state == state_QRES_NEW or file_state == state_EOF:
462                      qresult = QueryResult(hit_list, prev_qid)
463                      for attr, value in prev['qresult'].items():
464                          setattr(qresult, attr, value)
465                      yield qresult
466                      # if current line is EOF, break
467                      if file_state == state_EOF:
468                          break
469                      hit_list = []
470  
471              self.line = self.handle.readline().strip()
472  
473      def _get_frag_frame(self, frag, seq_type, parsedict):
474          """Returns `HSPFragment` frame given the object, its sequence type,
475          and its parsed dictionary values."""
476          assert seq_type in ('query', 'hit')
477          frame = getattr(frag, '%s_frame' % seq_type, None)
478          if frame is not None:
479              return frame
480          else:
481              if 'frames' in parsedict:
482                  # frames is 'x1/x2' string, x1 is query frame, x2 is hit frame
483                  idx = 0 if seq_type == 'query' else 1
484                  return int(parsedict['frames'].split('/')[idx])
485              # else implicit None return
486  
487      def _get_frag_strand(self, frag, seq_type, parsedict):
488          """Returns `HSPFragment` strand given the object, its sequence type,
489          and its parsed dictionary values."""
490          # NOTE: this will never set the strands as 0 for protein
491          # queries / hits, since we can't detect the blast flavors
492          # from the columns alone.
493          assert seq_type in ('query', 'hit')
494          strand = getattr(frag, '%s_strand' % seq_type, None)
495          if strand is not None:
496              return strand
497          else:
498              # using parsedict instead of the fragment object since
499              # we need the unadjusted coordinate values
500              start = parsedict.get('%s_start' % seq_type)
501              end = parsedict.get('%s_end' % seq_type)
502              if start is not None and end is not None:
503                  return 1 if start <= end else -1
504              # else implicit None return
505  
506  
507  class BlastTabIndexer(SearchIndexer):
508  
509      """Indexer class for BLAST+ tab output."""
510  
511      _parser = BlastTabParser
512  
513      def __init__(self, filename, comments=False, fields=_DEFAULT_FIELDS):
514          SearchIndexer.__init__(self, filename, comments=comments, fields=fields)
515  
516          # if the file doesn't have comments,
517          # get index of column used as the key (qseqid / qacc / qaccver)
518          if not self._kwargs['comments']:
519              if 'qseqid' in fields:
520                  self._key_idx = fields.index('qseqid')
521              elif 'qacc' in fields:
522                  self._key_idx = fields.index('qacc')
523              elif 'qaccver' in fields:
524                  self._key_idx = fields.index('qaccver')
525              else:
526                  raise ValueError("Custom fields is missing an ID column. "
527                                   "One of these must be present: 'qseqid', 'qacc', or 'qaccver'.")
528  
529      def __iter__(self):
530          """Iterates over the file handle; yields key, start offset, and length."""
531          handle = self._handle
532          handle.seek(0)
533  
534          if not self._kwargs['comments']:
535              iterfunc = self._qresult_index
536          else:
537              iterfunc = self._qresult_index_commented
538  
539          for key, offset, length in iterfunc():
540              yield _bytes_to_string(key), offset, length
541  
542      def _qresult_index_commented(self):
543          """Indexer for commented BLAST tabular files."""
544          handle = self._handle
545          handle.seek(0)
546          start_offset = 0
547          # mark of a new query
548          query_mark = None
549          # mark of the query's ID
550          qid_mark = _as_bytes('# Query: ')
551          # mark of the last line
552          end_mark = _as_bytes('# BLAST processed')
553  
554          while True:
555              end_offset = handle.tell()
556              line = handle.readline()
557  
558              if query_mark is None:
559                  query_mark = line
560                  start_offset = end_offset
561              elif line.startswith(qid_mark):
562                  qresult_key = line[len(qid_mark):].split()[0]
563              elif line == query_mark or line.startswith(end_mark):
564                  yield qresult_key, start_offset, end_offset - start_offset
565                  start_offset = end_offset
566              elif not line:
567                  break
568  
569      def _qresult_index(self):
570          """Indexer for noncommented BLAST tabular files."""
571          handle = self._handle
572          handle.seek(0)
573          start_offset = 0
574          qresult_key = None
575          key_idx = self._key_idx
576          tab_char = _as_bytes('\t')
577  
578          while True:
579              # get end offset here since we only know a qresult ends after
580              # encountering the next one
581              end_offset = handle.tell()
582              # line = handle.readline()
583              line = handle.readline()
584  
585              if qresult_key is None:
586                  qresult_key = line.split(tab_char)[key_idx]
587              else:
588                  try:
589                      curr_key = line.split(tab_char)[key_idx]
590                  except IndexError:
591                      curr_key = _as_bytes('')
592  
593                  if curr_key != qresult_key:
594                      yield qresult_key, start_offset, end_offset - start_offset
595                      qresult_key = curr_key
596                      start_offset = end_offset
597  
598              # break if we've reached EOF
599              if not line:
600                  break
601  
602      def get_raw(self, offset):
603          """Returns the raw string of a QueryResult object from the given offset."""
604          if self._kwargs['comments']:
605              getfunc = self._get_raw_qresult_commented
606          else:
607              getfunc = self._get_raw_qresult
608  
609          return getfunc(offset)
610  
611      def _get_raw_qresult(self, offset):
612          """Returns the raw string of a single QueryResult from a noncommented file."""
613          handle = self._handle
614          handle.seek(offset)
615          qresult_raw = _as_bytes('')
616          tab_char = _as_bytes('\t')
617          key_idx = self._key_idx
618          qresult_key = None
619  
620          while True:
621              line = handle.readline()
622              # get the key of the first line (qresult key)
623              if qresult_key is None:
624                  qresult_key = line.split(tab_char)[key_idx]
625              else:
626                  try:
627                      curr_key = line.split(tab_char)[key_idx]
628                  except IndexError:
629                      curr_key = _as_bytes('')
630                  # only break when qresult is finished (key is different)
631                  if curr_key != qresult_key:
632                      break
633              # append to the raw string as long as qresult is the same
634              qresult_raw += line
635  
636          return qresult_raw
637  
638      def _get_raw_qresult_commented(self, offset):
639          """Returns the raw string of a single QueryResult from a commented file."""
640          handle = self._handle
641          handle.seek(offset)
642          qresult_raw = _as_bytes('')
643          end_mark = _as_bytes('# BLAST processed')
644  
645          # query mark is the line marking a new query
646          # something like '# TBLASTN 2.2.25+'
647          query_mark = None
648          line = handle.readline()
649          while line:
650              # since query_mark depends on the BLAST search, we need to obtain it
651              # first
652              if query_mark is None:
653                  query_mark = line
654              # break when we've reached the next qresult or the search ends
655              elif line == query_mark or line.startswith(end_mark):
656                  break
657  
658              qresult_raw += line
659              line = handle.readline()
660  
661          return qresult_raw
662  
663  
664  class BlastTabWriter(object):
665  
666      """Writer for blast-tab output format."""
667  
668      def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
669          self.handle = handle
670          self.has_comments = comments
671          self.fields = fields
672  
673      def write_file(self, qresults):
674          """Writes to the handle, returns how many QueryResult objects are written."""
675          handle = self.handle
676          qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0
677  
678          for qresult in qresults:
679              if self.has_comments:
680                  handle.write(self._build_comments(qresult))
681              if qresult:
682                  handle.write(self._build_rows(qresult))
683                  if not self.has_comments:
684                      qresult_counter += 1
685                  hit_counter += len(qresult)
686                  hsp_counter += sum(len(hit) for hit in qresult)
687                  frag_counter += sum(len(hit.fragments) for hit in qresult)
688              # if it's commented and there are no hits in the qresult, we still
689              # increment the counter
690              if self.has_comments:
691                  qresult_counter += 1
692  
693          # commented files have a line saying how many queries were processed
694          if self.has_comments:
695              handle.write('# BLAST processed %i queries' % qresult_counter)
696  
697          return qresult_counter, hit_counter, hsp_counter, frag_counter
698  
699      def _build_rows(self, qresult):
700          """Returns a string containing tabular rows of the QueryResult object."""
701          coordinates = set(['qstart', 'qend', 'sstart', 'send'])
702          qresult_lines = ''
703          for hit in qresult:
704              for hsp in hit:
705                  line = []
706                  for field in self.fields:
707                      # get the column value ~ could either be an attribute
708                      # of qresult, hit, or hsp
709                      if field in _COLUMN_QRESULT:
710                          value = getattr(qresult, _COLUMN_QRESULT[field][0])
711                      elif field in _COLUMN_HIT:
712                          if field == 'sallseqid':
713                              value = getattr(hit, 'id_all')
714                          else:
715                              value = getattr(hit, _COLUMN_HIT[field][0])
716                      # special case, since 'frames' can be determined from
717                      # query frame and hit frame
718                      elif field == 'frames':
719                          value = '%i/%i' % (hsp.query_frame, hsp.hit_frame)
720                      elif field in _COLUMN_HSP:
721                          try:
722                              value = getattr(hsp, _COLUMN_HSP[field][0])
723                          except AttributeError:
724                              attr = _COLUMN_HSP[field][0]
725                              _augment_blast_hsp(hsp, attr)
726                              value = getattr(hsp, attr)
727                      elif field in _COLUMN_FRAG:
728                          value = getattr(hsp, _COLUMN_FRAG[field][0])
729                      else:
730                          assert field not in _SUPPORTED_FIELDS
731                          continue
732  
733                      # adjust from and to according to strand, if from and to
734                      # is included in the output field
735                      if field in coordinates:
736                          value = self._adjust_coords(field, value, hsp)
737                      # adjust output formatting
738                      value = self._adjust_output(field, value)
739  
740                      line.append(value)
741  
742                  hsp_line = '\t'.join(line)
743                  qresult_lines += hsp_line + '\n'
744  
745          return qresult_lines
746  
747      def _adjust_coords(self, field, value, hsp):
748          """Adjusts start and end coordinates according to strand."""
749          assert field in ('qstart', 'qend', 'sstart', 'send')
750          # determine sequence type to operate on based on field's first letter
751          seq_type = 'query' if field.startswith('q') else 'hit'
752  
753          strand = getattr(hsp, '%s_strand' % seq_type, None)
754          if strand is None:
755              raise ValueError("Required attribute %r not found." %
756                               ('%s_strand' % (seq_type)))
757          # switch start <--> end coordinates if strand is -1
758          if strand < 0:
759              if field.endswith('start'):
760                  value = getattr(hsp, '%s_end' % seq_type)
761              elif field.endswith('end'):
762                  value = getattr(hsp, '%s_start' % seq_type) + 1
763          elif field.endswith('start'):
764              # adjust start coordinate for positive strand
765              value += 1
766  
767          return value
768  
769      def _adjust_output(self, field, value):
770          """Adjusts formatting of the given field and value to mimic native tab output."""
771          # qseq and sseq are stored as SeqRecord, but here we only need the str
772          if field in ('qseq', 'sseq'):
773              value = str(value.seq)
774  
775          # evalue formatting, adapted from BLAST+ source:
776          # src/objtools/align_format/align_format_util.cpp#L668
777          elif field == 'evalue':
778              if value < 1.0e-180:
779                  value = '0.0'
780              elif value < 1.0e-99:
781                  value = '%2.0e' % value
782              elif value < 0.0009:
783                  value = '%3.0e' % value
784              elif value < 0.1:
785                  value = '%4.3f' % value
786              elif value < 1.0:
787                  value = '%3.2f' % value
788              elif value < 10.0:
789                  value = '%2.1f' % value
790              else:
791                  value = '%5.0f' % value
792  
793          # pident and ppos formatting
794          elif field in ('pident', 'ppos'):
795              value = '%.2f' % value
796  
797          # bitscore formatting, adapted from BLAST+ source:
798          # src/objtools/align_format/align_format_util.cpp#L723
799          elif field == 'bitscore':
800              if value > 9999:
801                  value = '%4.3e' % value
802              elif value > 99.9:
803                  value = '%4.0d' % value
804              else:
805                  value = '%4.1f' % value
806  
807          # coverages have no comma (using floats still ~ a more proper
808          # representation)
809          elif field in ('qcovhsp', 'qcovs'):
810              value = '%.0f' % value
811  
812          # list into '<>'-delimited string
813          elif field == 'salltitles':
814              value = '<>'.join(value)
815  
816          # list into ';'-delimited string
817          elif field in ('sallseqid', 'sallacc', 'staxids', 'sscinames',
818                         'scomnames', 'sblastnames', 'sskingdoms'):
819              value = ';'.join(value)
820  
821          # everything else
822          else:
823              value = str(value)
824  
825          return value
826  
827      def _build_comments(self, qres):
828          """Returns a string of a QueryResult tabular comment."""
829          comments = []
830          # inverse mapping of the long-short name map, required
831          # for writing comments
832          inv_field_map = dict((v, k) for k, v in _LONG_SHORT_MAP.items())
833  
834          # try to anticipate qresults without version
835          if not hasattr(qres, 'version'):
836              program_line = '# %s' % qres.program.upper()
837          else:
838              program_line = '# %s %s' % (qres.program.upper(), qres.version)
839          comments.append(program_line)
840          # description may or may not be None
841          if qres.description is None:
842              comments.append('# Query: %s' % qres.id)
843          else:
844              comments.append('# Query: %s %s' % (qres.id, qres.description))
845          # try appending RID line, if present
846          try:
847              comments.append('# RID: %s' % qres.rid)
848          except AttributeError:
849              pass
850          comments.append('# Database: %s' % qres.target)
851          # qresults without hits don't show the Fields comment
852          if qres:
853              comments.append('# Fields: %s' %
854                              ', '.join(inv_field_map[field] for field in self.fields))
855          comments.append('# %i hits found' % len(qres))
856  
857          return '\n'.join(comments) + '\n'
858  
859  
860  # if not used as a module, run the doctest
861  if __name__ == "__main__":
862      from Bio._utils import run_doctest
863      run_doctest()
864  
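As _prep_fields above shows, a custom column layout may be passed to the parser either
as a list or as a space-separated string of the short column names, and 'std' expands to
the twelve default columns in _DEFAULT_FIELDS. A brief sketch (hypothetical file name) for
output produced with a custom format string such as -outfmt '6 qseqid sseqid evalue bitscore':

    from Bio import SearchIO

    # the fields keyword must match the column order of the file being parsed
    for qresult in SearchIO.parse('custom.tab', 'blast-tab',
                                  fields='qseqid sseqid evalue bitscore'):
        print(qresult.id, len(qresult))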