Package Bio :: Package SearchIO :: Package BlastIO :: Module blast_text
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.BlastIO.blast_text

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for BLAST+ plain text output formats. 
  7   
  8  At the moment this is a wrapper around Biopython's NCBIStandalone text 
  9  parser. 
 10   
 11  """ 
 12   
 13  from Bio.Alphabet import generic_dna, generic_protein 
 14  from Bio.Blast import NCBIStandalone 
 15  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 16   
 17   
 18  __all__ = ['BlastTextParser'] 
 19   
 20   
21 -class BlastTextParser(object):
22 23 """Parser for the BLAST text format.""" 24
25 - def __init__(self, handle):
26 self.handle = handle 27 blast_parser = NCBIStandalone.BlastParser() 28 self.blast_iter = NCBIStandalone.Iterator(handle, blast_parser)
29
30 - def __iter__(self):
31 for rec in self.blast_iter: 32 # set attributes to SearchIO's 33 # get id and desc 34 if rec.query.startswith('>'): 35 rec.query = rec.query[1:] 36 try: 37 qid, qdesc = rec.query.split(' ', 1) 38 except ValueError: 39 qid, qdesc = rec.query, '' 40 qdesc = qdesc.replace('\n', '').replace('\r', '') 41 42 qresult = QueryResult(qid) 43 qresult.program = rec.application.lower() 44 qresult.target = rec.database 45 qresult.seq_len = rec.query_letters 46 qresult.version = rec.version 47 48 # determine alphabet based on program 49 if qresult.program == 'blastn': 50 alphabet = generic_dna 51 elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']: 52 alphabet = generic_protein 53 54 # iterate over the 'alignments' (hits) and the hit table 55 for idx, aln in enumerate(rec.alignments): 56 # get id and desc 57 if aln.title.startswith('> '): 58 aln.title = aln.title[2:] 59 elif aln.title.startswith('>'): 60 aln.title = aln.title[1:] 61 try: 62 hid, hdesc = aln.title.split(' ', 1) 63 except ValueError: 64 hid, hdesc = aln.title, '' 65 hdesc = hdesc.replace('\n', '').replace('\r', '') 66 67 # iterate over the hsps and group them in a list 68 hsp_list = [] 69 for bhsp in aln.hsps: 70 frag = HSPFragment(hid, qid) 71 frag.alphabet = alphabet 72 # set alignment length 73 frag.aln_span = bhsp.identities[1] 74 # set frames 75 try: 76 frag.query_frame = int(bhsp.frame[0]) 77 except IndexError: 78 if qresult.program in ('blastp', 'tblastn'): 79 frag.query_frame = 0 80 else: 81 frag.query_frame = 1 82 try: 83 frag.hit_frame = int(bhsp.frame[1]) 84 except IndexError: 85 if qresult.program in ('blastp', 'tblastn'): 86 frag.hit_frame = 0 87 else: 88 frag.hit_frame = 1 89 # set query coordinates 90 frag.query_start = min(bhsp.query_start, 91 bhsp.query_end) - 1 92 frag.query_end = max(bhsp.query_start, bhsp.query_end) 93 # set hit coordinates 94 frag.hit_start = min(bhsp.sbjct_start, 95 bhsp.sbjct_end) - 1 96 frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end) 97 # set query, hit sequences and its annotation 98 qseq = '' 99 hseq = '' 100 midline = '' 101 for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match): 102 qchar, hchar, mchar = seqtrio 103 if qchar == ' ' or hchar == ' ': 104 assert all([' ' == x for x in seqtrio]) 105 else: 106 qseq += qchar 107 hseq += hchar 108 midline += mchar 109 frag.query, frag.hit = qseq, hseq 110 frag.aln_annotation['homology'] = midline 111 112 # create HSP object with the fragment 113 hsp = HSP([frag]) 114 hsp.evalue = bhsp.expect 115 hsp.bitscore = bhsp.bits 116 hsp.bitscore_raw = bhsp.score 117 # set gap 118 try: 119 hsp.gap_num = bhsp.gaps[0] 120 except IndexError: 121 hsp.gap_num = 0 122 # set identity 123 hsp.ident_num = bhsp.identities[0] 124 hsp.pos_num = bhsp.positives[0] 125 if hsp.pos_num is None: 126 hsp.pos_num = hsp[0].aln_span 127 128 hsp_list.append(hsp) 129 130 hit = Hit(hsp_list) 131 hit.seq_len = aln.length 132 hit.description = hdesc 133 qresult.append(hit) 134 135 qresult.description = qdesc 136 yield qresult
137 138 139 # if not used as a module, run the doctest 140 if __name__ == "__main__": 141 from Bio._utils import run_doctest 142 run_doctest() 143