
Source Code for Module Bio.SearchIO.BlastIO.blast_tab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for BLAST+ tab output format, with or without comments.""" 
  7   
  8  import re 
  9   
 10  from Bio._py3k import _as_bytes, _bytes_to_string 
 11  from Bio._py3k import basestring 
 12   
 13  from Bio.SearchIO._index import SearchIndexer 
 14  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 15   
 16   
 17  __all__ = ['BlastTabIndexer', 'BlastTabParser', 'BlastTabWriter'] 
 18   
 19   
 20  # longname-shortname map 
 21  # maps the column names shown in a commented output to its short name 
 22  # (the one used in the command line) 
 23  _LONG_SHORT_MAP = { 
 24      'query id': 'qseqid', 
 25      'query acc.': 'qacc', 
 26      'query acc.ver': 'qaccver', 
 27      'query length': 'qlen', 
 28      'subject id': 'sseqid', 
 29      'subject acc.': 'sacc', 
 30      'subject acc.ver': 'saccver', 
 31      'subject length': 'slen', 
 32      'alignment length': 'length', 
 33      'bit score': 'bitscore', 
 34      'score': 'score', 
 35      'evalue': 'evalue', 
 36      'identical': 'nident', 
 37      '% identity': 'pident', 
 38      'positives': 'positive', 
 39      '% positives': 'ppos', 
 40      'mismatches': 'mismatch', 
 41      'gaps': 'gaps', 
 42      'q. start': 'qstart', 
 43      'q. end': 'qend', 
 44      's. start': 'sstart', 
 45      's. end': 'send', 
 46      'query frame': 'qframe', 
 47      'sbjct frame': 'sframe', 
 48      'query/sbjct frames': 'frames', 
 49      'query seq': 'qseq', 
 50      'subject seq': 'sseq', 
 51      'gap opens': 'gapopen', 
 52      'query gi': 'qgi', 
 53      'subject ids': 'sallseqid', 
 54      'subject gi': 'sgi', 
 55      'subject gis': 'sallgi', 
 56      'BTOP': 'btop', 
 57      'subject accs.': 'sallacc', 
 58      'subject tax ids': 'staxids', 
 59      'subject sci names': 'sscinames', 
 60      'subject com names': 'scomnames', 
 61      'subject blast names': 'sblastnames', 
 62      'subject super kingdoms': 'sskingdoms', 
 63      'subject title': 'stitle', 
 64      'subject titles': 'salltitles', 
 65      'subject strand': 'sstrand', 
 66      '% subject coverage': 'qcovs', 
 67      '% hsp coverage': 'qcovhsp', 
 68  } 
 69   
 70  # function to create a list from semicolon-delimited string 
 71  # used in BlastTabParser._parse_result_row 
 72  _list_semicol = lambda x: x.split(';') 
 73  _list_diamond = lambda x: x.split('<>') 
 74  # column to class attribute map 
 75  _COLUMN_QRESULT = { 
 76      'qseqid': ('id', str), 
 77      'qacc': ('accession', str), 
 78      'qaccver': ('accession_version', str), 
 79      'qlen': ('seq_len', int), 
 80      'qgi': ('gi', str), 
 81  } 
 82  _COLUMN_HIT = { 
 83      'sseqid': ('id', str), 
 84      'sallseqid': ('id_all', _list_semicol), 
 85      'sacc': ('accession', str), 
 86      'saccver': ('accession_version', str), 
 87      'sallacc': ('accession_all', _list_semicol), 
 88      'sgi': ('gi', str), 
 89      'sallgi': ('gi_all', str), 
 90      'slen': ('seq_len', int), 
 91      'staxids': ('tax_ids', _list_semicol), 
 92      'sscinames': ('sci_names', _list_semicol), 
 93      'scomnames': ('com_names', _list_semicol), 
 94      'sblastnames': ('blast_names', _list_semicol), 
 95      'sskingdoms': ('super_kingdoms', _list_semicol), 
 96      'stitle': ('title', str), 
 97      'salltitles': ('title_all', _list_diamond), 
 98      # set strand as HSP property? 
 99      'sstrand': ('strand', str), 
100      'qcovs': ('query_coverage', float), 
101  } 
102  _COLUMN_HSP = { 
103      'bitscore': ('bitscore', float), 
104      'score': ('bitscore_raw', int), 
105      'evalue': ('evalue', float), 
106      'nident': ('ident_num', int), 
107      'pident': ('ident_pct', float), 
108      'positive': ('pos_num', int), 
109      'ppos': ('pos_pct', float), 
110      'mismatch': ('mismatch_num', int), 
111      'gaps': ('gap_num', int), 
112      'gapopen': ('gapopen_num', int), 
113      'btop': ('btop', str), 
114      'qcovhsp': ('query_coverage', float), 
115  } 
116  _COLUMN_FRAG = { 
117      'length': ('aln_span', int), 
118      'qstart': ('query_start', int), 
119      'qend': ('query_end', int), 
120      'sstart': ('hit_start', int), 
121      'send': ('hit_end', int), 
122      'qframe': ('query_frame', int), 
123      'sframe': ('hit_frame', int), 
124      'frames': ('frames', str), 
125      'qseq': ('query', str), 
126      'sseq': ('hit', str), 
127  } 
128  _SUPPORTED_FIELDS = set(list(_COLUMN_QRESULT) + list(_COLUMN_HIT) + 
129                          list(_COLUMN_HSP) + list(_COLUMN_FRAG)) 
130   
131  # column order in the non-commented tabular output variant 
132  # values must be keys inside the column-attribute maps above 
133  _DEFAULT_FIELDS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 
134          'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'] 
135  # one field from each of the following sets must exist in order for the 
136  # parser to work 
137  _MIN_QUERY_FIELDS = set(['qseqid', 'qacc', 'qaccver']) 
138  _MIN_HIT_FIELDS = set(['sseqid', 'sacc', 'saccver']) 
139   
140  # simple function to create BLAST HSP attributes that may be computed if 
 41  # certain other attributes are present 
142  # This was previously implemented in the HSP objects in the old model 
143   
144  _RE_GAPOPEN = re.compile(r'\w-') 
145   
146   
147  def _compute_gapopen_num(hsp):
148      """Returns the number of gap openings in the given HSP."""
149      gapopen = 0
150      for seq_type in ('query', 'hit'):
151          seq = str(getattr(hsp, seq_type).seq)
152          gapopen += len(re.findall(_RE_GAPOPEN, seq))
153      return gapopen
154
155
156  def _augment_blast_hsp(hsp, attr):
157      """Calculates the given HSP attribute, for writing."""
158      if attr == 'aln_span':
159          # aln_span is number of identical matches + mismatches + gaps
160          func = lambda hsp: hsp.ident_num + hsp.mismatch_num + hsp.gap_num
161      # ident and gap will require the num values to be computed first
162      elif attr.startswith('ident'):
163          func = lambda hsp: hsp.aln_span - hsp.mismatch_num - hsp.gap_num
164      elif attr.startswith('gap'):
165          func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.mismatch_num
166      elif attr == 'mismatch_num':
167          func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.gap_num
168      elif attr == 'gapopen_num':
169          if not hasattr(hsp, 'query') or not hasattr(hsp, 'hit'):
170              # mock function so that the except clause below is triggered
171              # as both the query and hit are required to compute gapopen
172              def mock(hsp):
173                  raise AttributeError
174              func = mock
175          else:
176              func = _compute_gapopen_num
177
178      # set the num values
179      # requires the endswith check, since we only want to set 'num' or 'span'
180      # attributes here
181      if not hasattr(hsp, attr) and not attr.endswith('_pct'):
182          value = func(hsp)
183          setattr(hsp, attr, value)
184
185      # if the attr is a percent value, calculate it
186      if attr == 'ident_pct':
187          func2 = lambda hsp: hsp.ident_num / float(hsp.aln_span) * 100
188      elif attr == 'pos_pct':
189          func2 = lambda hsp: hsp.pos_num / float(hsp.aln_span) * 100
190      elif attr == 'gap_pct':
191          func2 = lambda hsp: hsp.gap_num / float(hsp.aln_span) * 100
192      else:
193          func2 = None
194
195      # set the pct values
196      if func2 is not None:
197          value = func2(hsp)
198          setattr(hsp, attr, value)
199
200
201  class BlastTabParser(object):
202
203      """Parser for the BLAST tabular format."""
204
205      def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
206          self.handle = handle
207          self.has_comments = comments
208          self.fields = self._prep_fields(fields)
209          self.line = self.handle.readline().strip()
210
211      def __iter__(self):
212          # stop iteration if file has no lines
213          if not self.line:
214              raise StopIteration
215          # determine which iterator to use
216          elif self.has_comments:
217              iterfunc = self._parse_commented_qresult
218          else:
219              iterfunc = self._parse_qresult
220
221          for qresult in iterfunc():
222              yield qresult
223
224      def _prep_fields(self, fields):
225          """Validates and formats the given fields for use by the parser."""
226          # cast into list if fields is a space-separated string
227          if isinstance(fields, basestring):
228              fields = fields.strip().split(' ')
229          # blast allows 'std' as a proxy for the standard default lists
230          # we want to transform 'std' to its proper column names
231          if 'std' in fields:
232              idx = fields.index('std')
233              fields = fields[:idx] + _DEFAULT_FIELDS + fields[idx+1:]
234          # if set(fields) has a null intersection with minimum required
235          # fields for hit and query, raise an exception
236          if not set(fields).intersection(_MIN_QUERY_FIELDS) or \
237                  not set(fields).intersection(_MIN_HIT_FIELDS):
238              raise ValueError("Required query and/or hit ID field not found.")
239
240          return fields
241
242      def _parse_commented_qresult(self):
243          """Iterator returning `QueryResult` objects from a commented file."""
244          while True:
245              comments = self._parse_comments()
246              if comments:
247                  try:
248                      self.fields = comments['fields']
249                      # iterator for the query results
250                      qres_iter = self._parse_qresult()
251                  except KeyError:
252                      # no fields means the query has no results
253                      assert 'fields' not in comments
254                      # create an iterator returning one empty qresult
255                      # if the query has no results
256                      qres_iter = iter([QueryResult()])
257
258                  for qresult in qres_iter:
259                      for key, value in comments.items():
260                          setattr(qresult, key, value)
261                      yield qresult
262
263              else:
264                  break
265
266      def _parse_comments(self):
267          """Returns a dictionary containing tab file comments."""
268          comments = {}
269          while True:
270              # parse program and version
271              # example: # BLASTX 2.2.26+
272              if 'BLAST' in self.line and 'processed' not in self.line:
273                  program_line = self.line[len(' #'):].split(' ')
274                  comments['program'] = program_line[0].lower()
275                  comments['version'] = program_line[1]
276              # parse query id and description (if available)
277              # example: # Query: gi|356995852 Mus musculus POU domain
278              elif 'Query' in self.line:
279                  query_line = self.line[len('# Query: '):].split(' ', 1)
280                  comments['id'] = query_line[0]
281                  if len(query_line) == 2:
282                      comments['description'] = query_line[1]
283              # parse target database
284              # example: # Database: db/minirefseq_protein
285              elif 'Database' in self.line:
286                  comments['target'] = self.line[len('# Database: '):]
287              # parse RID (from remote searches)
288              elif 'RID' in self.line:
289                  comments['rid'] = self.line[len('# RID: '):]
290              # parse column order, required for parsing the result lines
291              # example: # Fields: query id, query gi, query acc., query length
292              elif 'Fields' in self.line:
293                  comments['fields'] = self._parse_fields_line()
294              # if the line has these strings, it's either the end of a comment
295              # or the end of a file, so we return all the comments we've parsed
296              elif ' hits found' in self.line or 'processed' in self.line:
297                  self.line = self.handle.readline().strip()
298                  return comments
299
300              self.line = self.handle.readline()
301
302              if not self.line:
303                  return comments
304              else:
305                  self.line = self.line.strip()
306
307      def _parse_fields_line(self):
308          """Returns a list of column short names from the 'Fields'
309          comment line."""
310          raw_field_str = self.line[len('# Fields: '):]
311          long_fields = raw_field_str.split(', ')
312          fields = [_LONG_SHORT_MAP[long_name] for long_name in long_fields]
313          return self._prep_fields(fields)
314
315      def _parse_result_row(self):
316          """Returns a dictionary of parsed row values."""
317          fields = self.fields
318          columns = self.line.strip().split('\t')
319          assert len(fields) == len(columns), "Expected %i columns, found: " \
320              "%i" % (len(fields), len(columns))
321
322          qresult, hit, hsp, frag = {}, {}, {}, {}
323          for idx, value in enumerate(columns):
324              sname = fields[idx]
325              # flag to check if any of the _COLUMNs contain sname
326              in_mapping = False
327              # iterate over each dict, mapping pair to determine
328              # attribute name and value of each column
329              for parsed_dict, mapping in (
330                      (qresult, _COLUMN_QRESULT),
331                      (hit, _COLUMN_HIT),
332                      (hsp, _COLUMN_HSP),
333                      (frag, _COLUMN_FRAG)):
334                  # process parsed value according to mapping
335                  if sname in mapping:
336                      attr_name, caster = mapping[sname]
337                      if caster is not str:
338                          value = caster(value)
339                      parsed_dict[attr_name] = value
340                      in_mapping = True
341              # make sure that any unhandled field is not supported
342              if not in_mapping:
343                  assert sname not in _SUPPORTED_FIELDS
344
345          return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
346
347      def _get_id(self, parsed):
348          """Returns the value used for a QueryResult or Hit ID from a parsed row."""
349          # use 'id', with 'accession' and 'accession_version' fallbacks
350          # one of these must have a value since we've checked whether
351          # they exist or not when parsing the comments
352          id_cache = parsed.get('id')
353          if id_cache is None:
354              id_cache = parsed.get('accession')
355          if id_cache is None:
356              id_cache = parsed.get('accession_version')
357
358          return id_cache
359
360      def _parse_qresult(self):
361          """Generator function that returns QueryResult objects."""
362          # state values, used to determine what to do with each line
363          state_EOF = 0
364          state_QRES_NEW = 1
365          state_QRES_SAME = 3
366          state_HIT_NEW = 2
367          state_HIT_SAME = 4
368          # dummies for initial states
369          qres_state = None
370          hit_state = None
371          file_state = None
372          # dummies for initial id caches
373          prev_qid = None
374          prev_hid = None
375          # dummies for initial parsed value containers
376          cur, prev = None, None
377          hit_list, hsp_list = [], []
378
379          while True:
380              # store previous line's parsed values if we've past the first line
381              if cur is not None:
382                  prev = cur
383                  prev_qid = cur_qid
384                  prev_hid = cur_hid
385              # only parse the line if it's not EOF or not a comment line
386              if self.line and not self.line.startswith('#'):
387                  cur = self._parse_result_row()
388                  cur_qid = self._get_id(cur['qresult'])
389                  cur_hid = self._get_id(cur['hit'])
390              else:
391                  file_state = state_EOF
392                  # mock values for cur_qid and cur_hid since the line is empty
393                  cur_qid, cur_hid = None, None
394
395              # get the state of hit and qresult
396              if prev_qid != cur_qid:
397                  qres_state = state_QRES_NEW
398              else:
399                  qres_state = state_QRES_SAME
400              # new hits are hits with different id or hits in a new qresult
401              if prev_hid != cur_hid or qres_state == state_QRES_NEW:
402                  hit_state = state_HIT_NEW
403              else:
404                  hit_state = state_HIT_SAME
405
406              # we're creating objects for the previously parsed line(s),
407              # so nothing is done in the first parsed line (prev == None)
408              if prev is not None:
409                  # every line is essentially an HSP with one fragment, so we
410                  # create both of these for every line
411                  frag = HSPFragment(prev_hid, prev_qid)
412                  for attr, value in prev['frag'].items():
413                      # adjust coordinates to Python range
414                      # NOTE: this requires both start and end coords to be
415                      # present, otherwise a KeyError will be raised.
416                      # Without this limitation, we might misleadingly set the
417                      # start / end coords
418                      for seq_type in ('query', 'hit'):
419                          if attr == seq_type + '_start':
420                              value = min(value,
421                                          prev['frag'][seq_type + '_end']) - 1
422                          elif attr == seq_type + '_end':
423                              value = max(value,
424                                          prev['frag'][seq_type + '_start'])
425                      setattr(frag, attr, value)
426                  # strand and frame setattr require the full parsed values
427                  # to be set first
428                  for seq_type in ('hit', 'query'):
429                      # try to set hit and query frame
430                      frame = self._get_frag_frame(frag, seq_type,
431                                                   prev['frag'])
432                      setattr(frag, '%s_frame' % seq_type, frame)
433                      # try to set hit and query strand
434                      strand = self._get_frag_strand(frag, seq_type,
435                                                     prev['frag'])
436                      setattr(frag, '%s_strand' % seq_type, strand)
437
438                  hsp = HSP([frag])
439                  for attr, value in prev['hsp'].items():
440                      setattr(hsp, attr, value)
441                  hsp_list.append(hsp)
442
443                  # create hit and append to temp hit container if hit_state
444                  # says we're not at the same hit or at a new query
445                  if hit_state == state_HIT_NEW:
446                      hit = Hit(hsp_list)
447                      for attr, value in prev['hit'].items():
448                          setattr(hit, attr, value)
449                      hit_list.append(hit)
450                      hsp_list = []
451                  # create qresult and yield if we're at a new qresult or EOF
452                  if qres_state == state_QRES_NEW or file_state == state_EOF:
453                      qresult = QueryResult(hit_list, prev_qid)
454                      for attr, value in prev['qresult'].items():
455                          setattr(qresult, attr, value)
456                      yield qresult
457                      # if current line is EOF, break
458                      if file_state == state_EOF:
459                          break
460                      hit_list = []
461
462              self.line = self.handle.readline().strip()
463
464      def _get_frag_frame(self, frag, seq_type, parsedict):
465          """Returns `HSPFragment` frame given the object, its sequence type,
466          and its parsed dictionary values."""
467          assert seq_type in ('query', 'hit')
468          frame = getattr(frag, '%s_frame' % seq_type, None)
469          if frame is not None:
470              return frame
471          else:
472              if 'frames' in parsedict:
473                  # frames is 'x1/x2' string, x1 is query frame, x2 is hit frame
474                  idx = 0 if seq_type == 'query' else 1
475                  return int(parsedict['frames'].split('/')[idx])
476              # else implicit None return
477
478      def _get_frag_strand(self, frag, seq_type, parsedict):
479          """Returns `HSPFragment` strand given the object, its sequence type,
480          and its parsed dictionary values."""
481          # NOTE: this will never set the strands as 0 for protein
482          # queries / hits, since we can't detect the blast flavors
483          # from the columns alone.
484          assert seq_type in ('query', 'hit')
485          strand = getattr(frag, '%s_strand' % seq_type, None)
486          if strand is not None:
487              return strand
488          else:
489              # using parsedict instead of the fragment object since
490              # we need the unadjusted coordinate values
491              start = parsedict.get('%s_start' % seq_type)
492              end = parsedict.get('%s_end' % seq_type)
493              if start is not None and end is not None:
494                  return 1 if start <= end else -1
495              # else implicit None return
496
497
498  class BlastTabIndexer(SearchIndexer):
499
500      """Indexer class for BLAST+ tab output."""
501
502      _parser = BlastTabParser
503
504      def __init__(self, filename, comments=False, fields=_DEFAULT_FIELDS):
505          SearchIndexer.__init__(self, filename, comments=comments, fields=fields)
506
507          # if the file doesn't have comments,
508          # get index of column used as the key (qseqid / qacc / qaccver)
509          if not self._kwargs['comments']:
510              if 'qseqid' in fields:
511                  self._key_idx = fields.index('qseqid')
512              elif 'qacc' in fields:
513                  self._key_idx = fields.index('qacc')
514              elif 'qaccver' in fields:
515                  self._key_idx = fields.index('qaccver')
516              else:
517                  raise ValueError("Custom fields is missing an ID column. "
518                                   "One of these must be present: 'qseqid', 'qacc', or 'qaccver'.")
519
520      def __iter__(self):
521          """Iterates over the file handle; yields key, start offset, and length."""
522          handle = self._handle
523          handle.seek(0)
524
525          if not self._kwargs['comments']:
526              iterfunc = self._qresult_index
527          else:
528              iterfunc = self._qresult_index_commented
529
530          for key, offset, length in iterfunc():
531              yield _bytes_to_string(key), offset, length
532
533      def _qresult_index_commented(self):
534          """Indexer for commented BLAST tabular files."""
535          handle = self._handle
536          handle.seek(0)
537          start_offset = 0
538          # mark of a new query
539          query_mark = None
540          # mark of the query's ID
541          qid_mark = _as_bytes('# Query: ')
542          # mark of the last line
543          end_mark = _as_bytes('# BLAST processed')
544
545          while True:
546              end_offset = handle.tell()
547              line = handle.readline()
548
549              if query_mark is None:
550                  query_mark = line
551                  start_offset = end_offset
552              elif line.startswith(qid_mark):
553                  qresult_key = line[len(qid_mark):].split()[0]
554              elif line == query_mark or line.startswith(end_mark):
555                  yield qresult_key, start_offset, end_offset - start_offset
556                  start_offset = end_offset
557              elif not line:
558                  break
559
560      def _qresult_index(self):
561          """Indexer for noncommented BLAST tabular files."""
562          handle = self._handle
563          handle.seek(0)
564          start_offset = 0
565          qresult_key = None
566          key_idx = self._key_idx
567          tab_char = _as_bytes('\t')
568
569          while True:
570              # get end offset here since we only know a qresult ends after
571              # encountering the next one
572              end_offset = handle.tell()
573              #line = handle.readline()
574              line = handle.readline()
575
576              if qresult_key is None:
577                  qresult_key = line.split(tab_char)[key_idx]
578              else:
579                  try:
580                      curr_key = line.split(tab_char)[key_idx]
581                  except IndexError:
582                      curr_key = _as_bytes('')
583
584                  if curr_key != qresult_key:
585                      yield qresult_key, start_offset, end_offset - start_offset
586                      qresult_key = curr_key
587                      start_offset = end_offset
588
589              # break if we've reached EOF
590              if not line:
591                  break
592
593      def get_raw(self, offset):
594          """Returns the raw string of a QueryResult object from the given offset."""
595          if self._kwargs['comments']:
596              getfunc = self._get_raw_qresult_commented
597          else:
598              getfunc = self._get_raw_qresult
599
600          return getfunc(offset)
601
602      def _get_raw_qresult(self, offset):
603          """Returns the raw string of a single QueryResult from a noncommented file."""
604          handle = self._handle
605          handle.seek(offset)
606          qresult_raw = _as_bytes('')
607          tab_char = _as_bytes('\t')
608          key_idx = self._key_idx
609          qresult_key = None
610
611          while True:
612              line = handle.readline()
613              # get the key if the first line (qresult key)
614              if qresult_key is None:
615                  qresult_key = line.split(tab_char)[key_idx]
616              else:
617                  try:
618                      curr_key = line.split(tab_char)[key_idx]
619                  except IndexError:
620                      curr_key = _as_bytes('')
621                  # only break when qresult is finished (key is different)
622                  if curr_key != qresult_key:
623                      break
624              # append to the raw string as long as qresult is the same
625              qresult_raw += line
626
627          return qresult_raw
628
629      def _get_raw_qresult_commented(self, offset):
630          """Returns the raw string of a single QueryResult from a commented file."""
631          handle = self._handle
632          handle.seek(offset)
633          qresult_raw = _as_bytes('')
634          end_mark = _as_bytes('# BLAST processed')
635
636          # query mark is the line marking a new query
637          # something like '# TBLASTN 2.2.25+'
638          query_mark = None
639          line = handle.readline()
640          while line:
641              # since query_mark depends on the BLAST search, we need to obtain it
642              # first
643              if query_mark is None:
644                  query_mark = line
645              # break when we've reached the next qresult or the search ends
646              elif line == query_mark or line.startswith(end_mark):
647                  break
648
649              qresult_raw += line
650              line = handle.readline()
651
652          return qresult_raw
653
654
655  class BlastTabWriter(object):
656
657      """Writer for blast-tab output format."""
658
659      def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
660          self.handle = handle
661          self.has_comments = comments
662          self.fields = fields
663
664      def write_file(self, qresults):
665          """Writes to the handle, returns how many QueryResult objects are written."""
666          handle = self.handle
667          qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0
668
669          for qresult in qresults:
670              if self.has_comments:
671                  handle.write(self._build_comments(qresult))
672              if qresult:
673                  handle.write(self._build_rows(qresult))
674                  if not self.has_comments:
675                      qresult_counter += 1
676                  hit_counter += len(qresult)
677                  hsp_counter += sum(len(hit) for hit in qresult)
678                  frag_counter += sum(len(hit.fragments) for hit in qresult)
679              # if it's commented and there are no hits in the qresult, we still
680              # increment the counter
681              if self.has_comments:
682                  qresult_counter += 1
683
684          # commented files have a line saying how many queries were processed
685          if self.has_comments:
686              handle.write('# BLAST processed %i queries' % qresult_counter)
687
688          return qresult_counter, hit_counter, hsp_counter, frag_counter
689
690      def _build_rows(self, qresult):
691          """Returns a string containing tabular rows of the QueryResult object."""
692          coordinates = set(['qstart', 'qend', 'sstart', 'send'])
693          qresult_lines = ''
694          for hit in qresult:
695              for hsp in hit:
696                  line = []
697                  for field in self.fields:
698                      # get the column value ~ could either be an attribute
699                      # of qresult, hit, or hsp
700                      if field in _COLUMN_QRESULT:
701                          value = getattr(qresult, _COLUMN_QRESULT[field][0])
702                      elif field in _COLUMN_HIT:
703                          value = getattr(hit, _COLUMN_HIT[field][0])
704                      # special case, since 'frames' can be determined from
705                      # query frame and hit frame
706                      elif field == 'frames':
707                          value = '%i/%i' % (hsp.query_frame, hsp.hit_frame)
708                      elif field in _COLUMN_HSP:
709                          try:
710                              value = getattr(hsp, _COLUMN_HSP[field][0])
711                          except AttributeError:
712                              attr = _COLUMN_HSP[field][0]
713                              _augment_blast_hsp(hsp, attr)
714                              value = getattr(hsp, attr)
715                      elif field in _COLUMN_FRAG:
716                          value = getattr(hsp, _COLUMN_FRAG[field][0])
717                      else:
718                          assert field not in _SUPPORTED_FIELDS
719                          continue
720
721                      # adjust from and to according to strand, if from and to
722                      # is included in the output field
723                      if field in coordinates:
724                          value = self._adjust_coords(field, value, hsp)
725                      # adjust output formatting
726                      value = self._adjust_output(field, value)
727
728                      line.append(value)
729
730                  hsp_line = '\t'.join(line)
731                  qresult_lines += hsp_line + '\n'
732
733          return qresult_lines
734
735      def _adjust_coords(self, field, value, hsp):
736          """Adjusts start and end coordinates according to strand."""
737          assert field in ('qstart', 'qend', 'sstart', 'send')
738          # determine sequence type to operate on based on field's first letter
739          seq_type = 'query' if field.startswith('q') else 'hit'
740
741          strand = getattr(hsp, '%s_strand' % seq_type, None)
742          if strand is None:
743              raise ValueError("Required attribute %r not found." %
744                               ('%s_strand' % (seq_type)))
745          # switch start <--> end coordinates if strand is -1
746          if strand < 0:
747              if field.endswith('start'):
748                  value = getattr(hsp, '%s_end' % seq_type)
749              elif field.endswith('end'):
750                  value = getattr(hsp, '%s_start' % seq_type) + 1
751          elif field.endswith('start'):
752              # adjust start coordinate for positive strand
753              value += 1
754
755          return value
756
757      def _adjust_output(self, field, value):
758          """Adjusts formatting of the given field and value to mimic native tab output."""
759          # qseq and sseq are stored as SeqRecord, but here we only need the str
760          if field in ('qseq', 'sseq'):
761              value = str(value.seq)
762
763          # evalue formatting, adapted from BLAST+ source:
764          # src/objtools/align_format/align_format_util.cpp#L668
765          elif field == 'evalue':
766              if value < 1.0e-180:
767                  value = '0.0'
768              elif value < 1.0e-99:
769                  value = '%2.0e' % value
770              elif value < 0.0009:
771                  value = '%3.0e' % value
772              elif value < 0.1:
773                  value = '%4.3f' % value
774              elif value < 1.0:
775                  value = '%3.2f' % value
776              elif value < 10.0:
777                  value = '%2.1f' % value
778              else:
779                  value = '%5.0f' % value
780
781          # pident and ppos formatting
782          elif field in ('pident', 'ppos'):
783              value = '%.2f' % value
784
785          # bitscore formatting, adapted from BLAST+ source:
786          # src/objtools/align_format/align_format_util.cpp#L723
787          elif field == 'bitscore':
788              if value > 9999:
789                  value = '%4.3e' % value
790              elif value > 99.9:
791                  value = '%4.0d' % value
792              else:
793                  value = '%4.1f' % value
794
795          # coverages have no comma (using floats still ~ a more proper
796          # representation)
797          elif field in ('qcovhsp', 'qcovs'):
798              value = '%.0f' % value
799
800          # list into '<>'-delimited string
801          elif field == 'salltitles':
802              value = '<>'.join(value)
803
804          # list into ';'-delimited string
805          elif field in ('sallseqid', 'sallacc', 'staxids', 'sscinames',
806                         'scomnames', 'sblastnames', 'sskingdoms'):
807              value = ';'.join(value)
808
809          # everything else
810          else:
811              value = str(value)
812
813          return value
814
815      def _build_comments(self, qres):
816          """Returns a string of a QueryResult tabular comment."""
817          comments = []
818          # inverse mapping of the long-short name map, required
819          # for writing comments
820          inv_field_map = dict((v, k) for k, v in _LONG_SHORT_MAP.items())
821
822          # try to anticipate qress without version
823          if not hasattr(qres, 'version'):
824              program_line = '# %s' % qres.program.upper()
825          else:
826              program_line = '# %s %s' % (qres.program.upper(), qres.version)
827          comments.append(program_line)
828          # description may or may not be None
829          if qres.description is None:
830              comments.append('# Query: %s' % qres.id)
831          else:
832              comments.append('# Query: %s %s' % (qres.id, qres.description))
833          # try appending RID line, if present
834          try:
835              comments.append('# RID: %s' % qres.rid)
836          except AttributeError:
837              pass
838          comments.append('# Database: %s' % qres.target)
839          # qresults without hits don't show the Fields comment
840          if qres:
841              comments.append('# Fields: %s' %
842                              ', '.join(inv_field_map[field] for field in self.fields))
843          comments.append('# %i hits found' % len(qres))
844
845          return '\n'.join(comments) + '\n'
846
847
848  # if not used as a module, run the doctest
849  if __name__ == "__main__":
850      from Bio._utils import run_doctest
851      run_doctest()
852
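
The classes above are normally not instantiated directly; they are registered as the 'blast-tab' handlers for the Bio.SearchIO front-end functions (parse, read, index, write), which forward the comments and fields keyword arguments to BlastTabParser.__init__ as shown in the source. A minimal usage sketch, with hypothetical input file names:

from Bio import SearchIO

# uncommented tabular output (BLAST+ -outfmt 6) with the 12 default columns
for qresult in SearchIO.parse('uncommented.tab', 'blast-tab'):
    print('%s has %i hits' % (qresult.id, len(qresult)))

# commented tabular output (BLAST+ -outfmt 7); the column order is read
# from the '# Fields:' comment line of each query
qresult = SearchIO.read('commented.tab', 'blast-tab', comments=True)

# a custom column set mirrors the -outfmt string given to BLAST+
qresults = SearchIO.parse('custom.tab', 'blast-tab',
                          fields='qseqid sseqid evalue bitscore')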