Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """Code to invoke the NCBI BLAST server over the internet. 
 10   
 11  This module provides code to work with the WWW version of BLAST 
 12  provided by the NCBI. 
 13  http://blast.ncbi.nlm.nih.gov/ 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import StringIO 
 19  from Bio._py3k import _as_string, _as_bytes 
 20  from Bio._py3k import urlopen as _urlopen 
 21  from Bio._py3k import urlencode as _urlencode 
 22  from Bio._py3k import Request as _Request 
 23   
 24  __docformat__ = "restructuredtext en" 
 25   
 26   
27 -def qblast(program, database, sequence, 28 auto_format=None, composition_based_statistics=None, 29 db_genetic_code=None, endpoints=None, entrez_query='(none)', 30 expect=10.0, filter=None, gapcosts=None, genetic_code=None, 31 hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None, 32 matrix_name=None, nucl_penalty=None, nucl_reward=None, 33 other_advanced=None, perc_ident=None, phi_pattern=None, 34 query_file=None, query_believe_defline=None, query_from=None, 35 query_to=None, searchsp_eff=None, service=None, threshold=None, 36 ungapped_alignment=None, word_size=None, 37 alignments=500, alignment_view=None, descriptions=500, 38 entrez_links_new_window=None, expect_low=None, expect_high=None, 39 format_entrez_query=None, format_object=None, format_type='XML', 40 ncbi_gi=None, results_file=None, show_overview=None, megablast=None, 41 ):
42 """Do a BLAST search using the QBLAST server at NCBI. 43 44 Supports all parameters of the qblast API for Put and Get. 45 Some useful parameters: 46 47 - program blastn, blastp, blastx, tblastn, or tblastx (lower case) 48 - database Which database to search against (e.g. "nr"). 49 - sequence The sequence to search. 50 - ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 51 - descriptions Number of descriptions to show. Def 500. 52 - alignments Number of alignments to show. Def 500. 53 - expect An expect value cutoff. Def 10.0. 54 - matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 55 - filter "none" turns off filtering. Default no filtering 56 - format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 57 - entrez_query Entrez query to limit Blast search 58 - hitlist_size Number of hits to return. Default 50 59 - megablast TRUE/FALSE whether to use MEga BLAST algorithm (blastn only) 60 - service plain, psi, phi, rpsblast, megablast (lower case) 61 62 This function does no checking of the validity of the parameters 63 and passes the values to the server as is. More help is available at: 64 http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html 65 """ 66 import time 67 68 assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'] 69 70 # Format the "Put" command, which sends search requests to qblast. 71 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 72 # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010 73 # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified 74 # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi")) 75 parameters = [ 76 ('AUTO_FORMAT', auto_format), 77 ('COMPOSITION_BASED_STATISTICS', composition_based_statistics), 78 ('DATABASE', database), 79 ('DB_GENETIC_CODE', db_genetic_code), 80 ('ENDPOINTS', endpoints), 81 ('ENTREZ_QUERY', entrez_query), 82 ('EXPECT', expect), 83 ('FILTER', filter), 84 ('GAPCOSTS', gapcosts), 85 ('GENETIC_CODE', genetic_code), 86 ('HITLIST_SIZE', hitlist_size), 87 ('I_THRESH', i_thresh), 88 ('LAYOUT', layout), 89 ('LCASE_MASK', lcase_mask), 90 ('MEGABLAST', megablast), 91 ('MATRIX_NAME', matrix_name), 92 ('NUCL_PENALTY', nucl_penalty), 93 ('NUCL_REWARD', nucl_reward), 94 ('OTHER_ADVANCED', other_advanced), 95 ('PERC_IDENT', perc_ident), 96 ('PHI_PATTERN', phi_pattern), 97 ('PROGRAM', program), 98 # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API? 99 ('QUERY', sequence), 100 ('QUERY_FILE', query_file), 101 ('QUERY_BELIEVE_DEFLINE', query_believe_defline), 102 ('QUERY_FROM', query_from), 103 ('QUERY_TO', query_to), 104 # ('RESULTS_FILE',...), - Can we use this parameter? 105 ('SEARCHSP_EFF', searchsp_eff), 106 ('SERVICE', service), 107 ('THRESHOLD', threshold), 108 ('UNGAPPED_ALIGNMENT', ungapped_alignment), 109 ('WORD_SIZE', word_size), 110 ('CMD', 'Put'), 111 ] 112 query = [x for x in parameters if x[1] is not None] 113 message = _as_bytes(_urlencode(query)) 114 115 # Send off the initial query to qblast. 116 # Note the NCBI do not currently impose a rate limit here, other 117 # than the request not to make say 50 queries at once using multiple 118 # threads. 119 request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 120 message, 121 {"User-Agent": "BiopythonClient"}) 122 handle = _urlopen(request) 123 124 # Format the "Get" command, which gets the formatted results from qblast 125 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 126 rid, rtoe = _parse_qblast_ref_page(handle) 127 parameters = [ 128 ('ALIGNMENTS', alignments), 129 ('ALIGNMENT_VIEW', alignment_view), 130 ('DESCRIPTIONS', descriptions), 131 ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window), 132 ('EXPECT_LOW', expect_low), 133 ('EXPECT_HIGH', expect_high), 134 ('FORMAT_ENTREZ_QUERY', format_entrez_query), 135 ('FORMAT_OBJECT', format_object), 136 ('FORMAT_TYPE', format_type), 137 ('NCBI_GI', ncbi_gi), 138 ('RID', rid), 139 ('RESULTS_FILE', results_file), 140 ('SERVICE', service), 141 ('SHOW_OVERVIEW', show_overview), 142 ('CMD', 'Get'), 143 ] 144 query = [x for x in parameters if x[1] is not None] 145 message = _as_bytes(_urlencode(query)) 146 147 # Poll NCBI until the results are ready. Use a backoff delay from 2 - 120 second wait 148 delay = 2.0 149 previous = time.time() 150 while True: 151 current = time.time() 152 wait = previous + delay - current 153 if wait > 0: 154 time.sleep(wait) 155 previous = current + wait 156 else: 157 previous = current 158 if delay + .5 * delay <= 120: 159 delay += .5 * delay 160 else: 161 delay = 120 162 163 request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 164 message, 165 {"User-Agent": "BiopythonClient"}) 166 handle = _urlopen(request) 167 results = _as_string(handle.read()) 168 169 # Can see an "\n\n" page while results are in progress, 170 # if so just wait a bit longer... 171 if results == "\n\n": 172 continue 173 # XML results don't have the Status tag when finished 174 if "Status=" not in results: 175 break 176 i = results.index("Status=") 177 j = results.index("\n", i) 178 status = results[i + len("Status="):j].strip() 179 if status.upper() == "READY": 180 break 181 182 return StringIO(results)
183 184
185 -def _parse_qblast_ref_page(handle):
186 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). 187 188 The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is proably 189 'Request Time of Execution' and RID would be 'Request Identifier'. 190 """ 191 s = _as_string(handle.read()) 192 i = s.find("RID =") 193 if i == -1: 194 rid = None 195 else: 196 j = s.find("\n", i) 197 rid = s[i + len("RID ="):j].strip() 198 199 i = s.find("RTOE =") 200 if i == -1: 201 rtoe = None 202 else: 203 j = s.find("\n", i) 204 rtoe = s[i + len("RTOE ="):j].strip() 205 206 if not rid and not rtoe: 207 # Can we reliably extract the error message from the HTML page? 208 # e.g. "Message ID#24 Error: Failed to read the Blast query: 209 # Nucleotide FASTA provided for protein sequence" 210 # or "Message ID#32 Error: Query contains no data: Query 211 # contains no sequence data" 212 # 213 # This used to occur inside a <div class="error msInf"> entry: 214 i = s.find('<div class="error msInf">') 215 if i != -1: 216 msg = s[i + len('<div class="error msInf">'):].strip() 217 msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip() 218 if msg: 219 raise ValueError("Error message from NCBI: %s" % msg) 220 # In spring 2010 the markup was like this: 221 i = s.find('<p class="error">') 222 if i != -1: 223 msg = s[i + len('<p class="error">'):].strip() 224 msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip() 225 if msg: 226 raise ValueError("Error message from NCBI: %s" % msg) 227 # Generic search based on the way the error messages start: 228 i = s.find('Message ID#') 229 if i != -1: 230 # Break the message at the first HTML tag 231 msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip() 232 raise ValueError("Error message from NCBI: %s" % msg) 233 # We didn't recognise the error layout :( 234 # print s 235 raise ValueError("No RID and no RTOE found in the 'please wait' page, " 236 "there was probably an error in your request but we " 237 "could not extract a helpful error message.") 238 elif not rid: 239 # Can this happen? 240 raise ValueError("No RID found in the 'please wait' page." 241 " (although RTOE = %s)" % repr(rtoe)) 242 elif not rtoe: 243 # Can this happen? 244 raise ValueError("No RTOE found in the 'please wait' page." 245 " (although RID = %s)" % repr(rid)) 246 247 try: 248 return rid, int(rtoe) 249 except ValueError: 250 raise ValueError("A non-integer RTOE found in " 251 "the 'please wait' page, %s" % repr(rtoe))
252