Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """Code to invoke the NCBI BLAST server over the internet. 
 10   
 11  This module provides code to work with the WWW version of BLAST 
 12  provided by the NCBI. 
 13  http://blast.ncbi.nlm.nih.gov/ 
 14   
 15  Functions: 
 16  qblast        Do a BLAST search using the QBLAST API. 
 17  """ 
 18   
 19  from __future__ import print_function 
 20   
 21  from Bio._py3k import StringIO 
 22  from Bio._py3k import _as_string, _as_bytes 
 23  from Bio._py3k import urlopen as _urlopen 
 24  from Bio._py3k import urlencode as _urlencode 
 25  from Bio._py3k import Request as _Request 
 26   
 27   
28 -def qblast(program, database, sequence, 29 auto_format=None, composition_based_statistics=None, 30 db_genetic_code=None, endpoints=None, entrez_query='(none)', 31 expect=10.0, filter=None, gapcosts=None, genetic_code=None, 32 hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None, 33 matrix_name=None, nucl_penalty=None, nucl_reward=None, 34 other_advanced=None, perc_ident=None, phi_pattern=None, 35 query_file=None, query_believe_defline=None, query_from=None, 36 query_to=None, searchsp_eff=None, service=None, threshold=None, 37 ungapped_alignment=None, word_size=None, 38 alignments=500, alignment_view=None, descriptions=500, 39 entrez_links_new_window=None, expect_low=None, expect_high=None, 40 format_entrez_query=None, format_object=None, format_type='XML', 41 ncbi_gi=None, results_file=None, show_overview=None, megablast=None, 42 ):
43 """Do a BLAST search using the QBLAST server at NCBI. 44 45 Supports all parameters of the qblast API for Put and Get. 46 Some useful parameters: 47 program blastn, blastp, blastx, tblastn, or tblastx (lower case) 48 database Which database to search against (e.g. "nr"). 49 sequence The sequence to search. 50 ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 51 descriptions Number of descriptions to show. Def 500. 52 alignments Number of alignments to show. Def 500. 53 expect An expect value cutoff. Def 10.0. 54 matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 55 filter "none" turns off filtering. Default no filtering 56 format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 57 entrez_query Entrez query to limit Blast search 58 hitlist_size Number of hits to return. Default 50 59 megablast TRUE/FALSE whether to use MEga BLAST algorithm (blastn only) 60 service plain, psi, phi, rpsblast, megablast (lower case) 61 62 This function does no checking of the validity of the parameters 63 and passes the values to the server as is. More help is available at: 64 http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html 65 66 """ 67 import time 68 69 assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'] 70 71 # Format the "Put" command, which sends search requests to qblast. 72 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 73 # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010 74 # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified 75 # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi")) 76 parameters = [ 77 ('AUTO_FORMAT', auto_format), 78 ('COMPOSITION_BASED_STATISTICS', composition_based_statistics), 79 ('DATABASE', database), 80 ('DB_GENETIC_CODE', db_genetic_code), 81 ('ENDPOINTS', endpoints), 82 ('ENTREZ_QUERY', entrez_query), 83 ('EXPECT', expect), 84 ('FILTER', filter), 85 ('GAPCOSTS', gapcosts), 86 ('GENETIC_CODE', genetic_code), 87 ('HITLIST_SIZE', hitlist_size), 88 ('I_THRESH', i_thresh), 89 ('LAYOUT', layout), 90 ('LCASE_MASK', lcase_mask), 91 ('MEGABLAST', megablast), 92 ('MATRIX_NAME', matrix_name), 93 ('NUCL_PENALTY', nucl_penalty), 94 ('NUCL_REWARD', nucl_reward), 95 ('OTHER_ADVANCED', other_advanced), 96 ('PERC_IDENT', perc_ident), 97 ('PHI_PATTERN', phi_pattern), 98 ('PROGRAM', program), 99 #('PSSM',pssm), - It is possible to use PSI-BLAST via this API? 100 ('QUERY', sequence), 101 ('QUERY_FILE', query_file), 102 ('QUERY_BELIEVE_DEFLINE', query_believe_defline), 103 ('QUERY_FROM', query_from), 104 ('QUERY_TO', query_to), 105 #('RESULTS_FILE',...), - Can we use this parameter? 106 ('SEARCHSP_EFF', searchsp_eff), 107 ('SERVICE', service), 108 ('THRESHOLD', threshold), 109 ('UNGAPPED_ALIGNMENT', ungapped_alignment), 110 ('WORD_SIZE', word_size), 111 ('CMD', 'Put'), 112 ] 113 query = [x for x in parameters if x[1] is not None] 114 message = _as_bytes(_urlencode(query)) 115 116 # Send off the initial query to qblast. 117 # Note the NCBI do not currently impose a rate limit here, other 118 # than the request not to make say 50 queries at once using multiple 119 # threads. 120 request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 121 message, 122 {"User-Agent":"BiopythonClient"}) 123 handle = _urlopen(request) 124 125 # Format the "Get" command, which gets the formatted results from qblast 126 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 127 rid, rtoe = _parse_qblast_ref_page(handle) 128 parameters = [ 129 ('ALIGNMENTS', alignments), 130 ('ALIGNMENT_VIEW', alignment_view), 131 ('DESCRIPTIONS', descriptions), 132 ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window), 133 ('EXPECT_LOW', expect_low), 134 ('EXPECT_HIGH', expect_high), 135 ('FORMAT_ENTREZ_QUERY', format_entrez_query), 136 ('FORMAT_OBJECT', format_object), 137 ('FORMAT_TYPE', format_type), 138 ('NCBI_GI', ncbi_gi), 139 ('RID', rid), 140 ('RESULTS_FILE', results_file), 141 ('SERVICE', service), 142 ('SHOW_OVERVIEW', show_overview), 143 ('CMD', 'Get'), 144 ] 145 query = [x for x in parameters if x[1] is not None] 146 message = _as_bytes(_urlencode(query)) 147 148 # Poll NCBI until the results are ready. Use a backoff delay from 2 - 120 second wait 149 delay = 2.0 150 previous = time.time() 151 while True: 152 current = time.time() 153 wait = previous + delay - current 154 if wait > 0: 155 time.sleep(wait) 156 previous = current + wait 157 else: 158 previous = current 159 if delay + .5*delay <= 120: 160 delay += .5*delay 161 else: 162 delay = 120 163 164 request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 165 message, 166 {"User-Agent":"BiopythonClient"}) 167 handle = _urlopen(request) 168 results = _as_string(handle.read()) 169 170 # Can see an "\n\n" page while results are in progress, 171 # if so just wait a bit longer... 172 if results=="\n\n": 173 continue 174 # XML results don't have the Status tag when finished 175 if "Status=" not in results: 176 break 177 i = results.index("Status=") 178 j = results.index("\n", i) 179 status = results[i+len("Status="):j].strip() 180 if status.upper() == "READY": 181 break 182 183 return StringIO(results)
184 185
186 -def _parse_qblast_ref_page(handle):
187 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). 188 189 The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is proably 190 'Request Time of Execution' and RID would be 'Request Identifier'. 191 """ 192 s = _as_string(handle.read()) 193 i = s.find("RID =") 194 if i == -1: 195 rid = None 196 else: 197 j = s.find("\n", i) 198 rid = s[i+len("RID ="):j].strip() 199 200 i = s.find("RTOE =") 201 if i == -1: 202 rtoe = None 203 else: 204 j = s.find("\n", i) 205 rtoe = s[i+len("RTOE ="):j].strip() 206 207 if not rid and not rtoe: 208 #Can we reliably extract the error message from the HTML page? 209 #e.g. "Message ID#24 Error: Failed to read the Blast query: 210 # Nucleotide FASTA provided for protein sequence" 211 #or "Message ID#32 Error: Query contains no data: Query 212 # contains no sequence data" 213 # 214 #This used to occur inside a <div class="error msInf"> entry: 215 i = s.find('<div class="error msInf">') 216 if i != -1: 217 msg = s[i+len('<div class="error msInf">'):].strip() 218 msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip() 219 if msg: 220 raise ValueError("Error message from NCBI: %s" % msg) 221 #In spring 2010 the markup was like this: 222 i = s.find('<p class="error">') 223 if i != -1: 224 msg = s[i+len('<p class="error">'):].strip() 225 msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip() 226 if msg: 227 raise ValueError("Error message from NCBI: %s" % msg) 228 #Generic search based on the way the error messages start: 229 i = s.find('Message ID#') 230 if i != -1: 231 #Break the message at the first HTML tag 232 msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip() 233 raise ValueError("Error message from NCBI: %s" % msg) 234 #We didn't recognise the error layout :( 235 #print s 236 raise ValueError("No RID and no RTOE found in the 'please wait' page, " 237 "there was probably an error in your request but we " 238 "could not extract a helpful error message.") 239 elif not rid: 240 #Can this happen? 241 raise ValueError("No RID found in the 'please wait' page." 242 " (although RTOE = %s)" % repr(rtoe)) 243 elif not rtoe: 244 #Can this happen? 245 raise ValueError("No RTOE found in the 'please wait' page." 246 " (although RID = %s)" % repr(rid)) 247 248 try: 249 return rid, int(rtoe) 250 except ValueError: 251 raise ValueError("A non-integer RTOE found in " 252 +"the 'please wait' page, %s" % repr(rtoe))
253