Package Bio :: Package TogoWS
[hide private]
[frames] | no frames]

Source Code for Package Bio.TogoWS

  1  # Copyright 2010-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Provides code to access the TogoWS integrated webservices of DBCLS, Japan. 
  7   
  8  This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See: 
  9  http://togows.dbcls.jp/ 
 10   
 11  The TogoWS REST service provides simple access to a range of databases, acting 
 12  as a proxy to shield you from all the different provider APIs. This works using 
 13  simple URLs (which this module will construct for you). For more details, see 
 14  http://togows.dbcls.jp/site/en/rest.html 
 15   
 16  The functionality is somewhat similar to Biopython's Bio.Entrez module which 
 17  provides access to the NCBI's Entrez Utilities (E-Utils) which also covers a 
 18  wide range of databases. 
 19   
 20  Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose 
 21  requirements are reasonably clear). To avoid risking overloading the service, 
 22  Biopython will only allow three calls per second. 
 23   
 24  The TogoWS SOAP service offers a more complex API for calling web services 
 25  (essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For 
 26  example, this allows you to run a remote BLAST search at the DDBJ. This is 
 27  not yet covered by this module, however there are lots of Python examples 
 28  on the TogoWS website using the SOAPpy python library. See: 
 29  http://togows.dbcls.jp/site/en/soap.html 
 30  http://soapy.sourceforge.net/ 
 31  """ 
 32   
 33  import urllib 
 34  import urllib2 
 35  import time 
 36  from Bio._py3k import _binary_to_string_handle, _as_bytes 
 37   
 38  #Constant: root URL of the TogoWS service, every request URL in this module is built by appending a path to it 
 39  _BASE_URL = "http://togows.dbcls.jp" 
 40   
 41  #Caches: filled lazily on first use by entry(), search(), search_count() and convert() so the lists of supported values are not re-fetched on every call 
 42  _search_db_names = None 
 43  _entry_db_names = None 
 44  _entry_db_fields = {} 
 45  _entry_db_formats = {} 
 46  _convert_formats = [] 
 47   
 48   
def _get_fields(url):
    """Queries a TogoWS URL for a plain text list of values (PRIVATE).

    url - complete TogoWS URL (string) whose response is a whitespace
          separated list of values.

    Returns a list of strings, one per whitespace separated token in the
    response (an empty list if the response is blank).
    """
    handle = _open(url)
    try:
        # split() with no argument already ignores leading and trailing
        # whitespace, so a separate strip() is not needed.
        return handle.read().split()
    finally:
        # Release the connection even if reading the response fails.
        handle.close()
55 56
def _get_entry_dbs():
    """Fetch the values TogoWS lists at its /entry URL (PRIVATE).

    Returns the list of strings produced by _get_fields(); entry() uses
    it as the list of database names it will accept.
    """
    listing_url = _BASE_URL + "/entry"
    return _get_fields(listing_url)
59 60
def _get_entry_fields(db):
    """Fetch the values TogoWS lists at /entry/<db>?fields (PRIVATE).

    db - database name (string), inserted into the URL as given.

    Returns the list of strings produced by _get_fields(); entry() uses
    it to validate its field argument.
    """
    path = "/entry/%s?fields" % db
    return _get_fields(_BASE_URL + path)
63 64
def _get_entry_formats(db):
    """Fetch the values TogoWS lists at /entry/<db>?formats (PRIVATE).

    db - database name (string), inserted into the URL as given.

    Returns the list of strings produced by _get_fields(); entry() uses
    it to validate its format argument.
    """
    path = "/entry/%s?formats" % db
    return _get_fields(_BASE_URL + path)
67 68
def _get_convert_formats():
    """Fetch the conversions TogoWS lists at its /convert/ URL (PRIVATE).

    Every value returned for that URL is split on "." and the resulting
    lists are collected, in the order received, into the list returned.
    convert() compares [in_format, out_format] against these entries.
    """
    conversions = []
    for token in _get_fields(_BASE_URL + "/convert/"):
        conversions.append(token.split("."))
    return conversions
72 73
def entry(db, id, format=None, field=None):
    """TogoWS fetch entry (returns a handle).

    db - database (string), see list below.
    id - identifier (string) or several identifiers (either as a list or
         tuple of strings, or a single string with comma separators).
    format - return data file format (string), options depend on the database
             e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle)
    field - specific field from within the database record (string)
            e.g. "au" or "authors" for pubmed.

    At the time of writing, this includes the following:

    KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction,
          module, pathway
    DDBj: ddbj, dad, pdb
    NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, onim,
          homologue, snp, mesh, pubmed
    EBI:  embl, uniprot, uniparc, uniref100, uniref90, uniref50

    For the current list, please see http://togows.dbcls.jp/entry/

    This function is essentially equivalent to the NCBI Entrez service
    EFetch, available in Biopython as Bio.Entrez.efetch(...), but that
    does not offer field extraction.

    Raises ValueError if the database, or a requested field or format, is
    not among those TogoWS lists as supported.
    """
    # Only the name list is rebound here; the per-database field and format
    # caches are dictionaries mutated in place, so need no global statement.
    global _entry_db_names
    if _entry_db_names is None:
        _entry_db_names = _get_entry_dbs()
    if db not in _entry_db_names:
        raise ValueError("TogoWS entry fetch does not officially support "
                         "database '%s'." % db)
    if field:
        try:
            fields = _entry_db_fields[db]
        except KeyError:
            # First request for this database, ask TogoWS and remember it.
            fields = _get_entry_fields(db)
            _entry_db_fields[db] = fields
        if field not in fields:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "field '%s' for database '%s'. Only: %s"
                             % (field, db, ", ".join(sorted(fields))))
    if format:
        try:
            formats = _entry_db_formats[db]
        except KeyError:
            # First request for this database, ask TogoWS and remember it.
            formats = _get_entry_formats(db)
            _entry_db_formats[db] = formats
        if format not in formats:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "format '%s' for database '%s'. Only: %s"
                             % (format, db, ", ".join(sorted(formats))))

    # Several identifiers are sent as one comma separated string.
    if isinstance(id, (list, tuple)):
        id = ",".join(id)
    url = _BASE_URL + "/entry/%s/%s" % (db, urllib.quote(id))
    if field:
        url += "/" + field
    if format:
        url += "." + format
    return _open(url)
135 136
def search_count(db, query):
    """TogoWS search count (returns an integer).

    db - database (string), see http://togows.dbcls.jp/search
    query - search term (string)

    You could then use the count to download a large set of search results in
    batches using the offset and limit options to Bio.TogoWS.search(). In
    general however the Bio.TogoWS.search_iter() function is simpler to use.

    A warning (rather than an exception) is issued if the database is not in
    the list TogoWS reports; the request is still attempted.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        #TODO - Make this a ValueError? Right now despite the HTML website
        #claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not officially support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    handle = _open(_BASE_URL + "/search/%s/%s/count"
                   % (db, urllib.quote(query)))
    try:
        count = int(handle.read().strip())
    finally:
        # Close the handle even if the response is not a valid integer.
        handle.close()
    return count
161 162
def search_iter(db, query, limit=None, batch=100):
    """TogoWS search iterating over the results (generator function).

    db - database (string), see http://togows.dbcls.jp/search
    query - search term (string)
    limit - optional upper bound on number of search results
    batch - number of search results to pull back each time we talk to
            TogoWS (currently limited to 100).

    You would use this function within a for loop, e.g.

    >>> for id in search_iter("pubmed", "lung+cancer+drug", limit=10):
    ...     print id #maybe fetch data with entry?

    Internally this first calls the Bio.TogoWS.search_count() and then
    uses Bio.TogoWS.search() to get the results in batches.

    Raises RuntimeError if TogoWS returns a different number of identifiers
    than was requested, or repeats results from the previous batch.
    """
    count = search_count(db, query)
    if not count:
        # A plain return ends a generator. Raising StopIteration here would
        # be turned into a RuntimeError on Python 3.7+ (PEP 479).
        return
    #NOTE - We leave it to TogoWS to enforce any upper bound on each
    #batch, they currently return an HTTP 400 Bad Request if above 100.
    remain = count
    if limit is not None:
        remain = min(remain, limit)
    offset = 1  # They don't use zero based counting
    prev_ids = []  # Just cache the last batch for error checking
    while remain:
        batch = min(batch, remain)
        handle = search(db, query, offset, batch)
        try:
            ids = handle.read().strip().split()
        finally:
            # Do not leave one open connection behind per batch.
            handle.close()
        # Explicit raise (not assert) so the check survives python -O.
        if len(ids) != batch:
            raise RuntimeError("Got %i, expected %i" % (len(ids), batch))
        if ids == prev_ids:
            raise RuntimeError("Same search results for previous offset")
        for identifier in ids:
            if identifier in prev_ids:
                raise RuntimeError("Result %s was in previous batch"
                                   % identifier)
            yield identifier
        offset += batch
        remain -= batch
        prev_ids = ids
206 207
def search(db, query, offset=None, limit=None, format=None):
    """TogoWS search (returns a handle).

    This is a low level wrapper for the TogoWS search function, which
    can return results in a several formats. In general, the search_iter
    function is more suitable for end users.

    db - database (string), see http://togows.dbcls.jp/search/
    query - search term (string)
    offset, limit - optional integers specifying which result to start from
            (1 based) and the number of results to return.
    format - return data file format (string), e.g. "json", "ttl" (RDF)
             By default plain text is returned, one result per line.

    At the time of writing, TogoWS applies a default count limit of 100
    search results, and this is an upper bound. To access more results,
    use the offset argument or the search_iter(...) function.

    TogoWS supports a long list of databases, including many from the NCBI
    (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and
    "ncbi-taxonomy"), EBI (e.g. "ebi-ebml" or "embl", "ebi-uniprot" or
    "uniprot", "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").
    For the current list, see http://togows.dbcls.jp/search/

    The NCBI provide the Entrez Search service (ESearch) which is similar,
    available in Biopython as the Bio.Entrez.esearch() function.

    See also the function Bio.TogoWS.search_count() which returns the number
    of matches found, and the Bio.TogoWS.search_iter() function which allows
    you to iterate over the search results (taking care of batching for you).

    Raises ValueError if only one of offset and limit is given, or if either
    is not an integer of at least one. An unlisted database only triggers a
    warning; the request is still attempted.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        #TODO - Make this a ValueError? Right now despite the HTML website
        #claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not explicitly support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s" % (db, urllib.quote(query))
    if offset is not None and limit is not None:
        # Catch only the conversion failures int() can raise, so that
        # KeyboardInterrupt and SystemExit are never swallowed. The values
        # are wrapped in a one-tuple so a tuple argument formats cleanly.
        try:
            offset = int(offset)
        except (TypeError, ValueError, OverflowError):
            raise ValueError("Offset should be an integer (at least one), not %r" % (offset,))
        try:
            limit = int(limit)
        except (TypeError, ValueError, OverflowError):
            raise ValueError("Limit should be an integer (at least one), not %r" % (limit,))
        if offset <= 0:
            raise ValueError("Offset should be at least one, not %i" % offset)
        if limit <= 0:
            raise ValueError("Count should be at least one, not %i" % limit)
        url += "/%i,%i" % (offset, limit)
    elif offset is not None or limit is not None:
        raise ValueError("Expect BOTH offset AND limit to be provided (or neither)")
    if format:
        url += "." + format
    return _open(url)
269 270
def convert(data, in_format, out_format):
    """TogoWS convert (returns a handle).

    data - string or handle containing input record(s)
    in_format - string describing the input file format (e.g. "genbank")
    out_format - string describing the requested output format (e.g. "fasta")

    The list of supported conversions (e.g. "genbank" to "fasta") is
    published at http://togows.dbcls.jp/convert/ and a ValueError listing
    them is raised if the requested pair is not among them.

    Note that Biopython has built in support for conversion of sequence and
    alignment file formats (functions Bio.SeqIO.convert and Bio.AlignIO.convert)
    """
    global _convert_formats
    if not _convert_formats:
        _convert_formats = _get_convert_formats()
    requested = [in_format, out_format]
    if requested not in _convert_formats:
        choices = ["%s -> %s" % tuple(pair) for pair in _convert_formats]
        msg = "\n".join(choices)
        raise ValueError("Unsupported conversion. Choose from:\n%s" % msg)
    url = _BASE_URL + "/convert/%s.%s" % (in_format, out_format)
    #TODO - Should we just accept a string not a handle? What about a filename?
    # Anything with a read() method is treated as a handle and read in full,
    # anything else is posted as given.
    payload = data.read() if hasattr(data, "read") else data
    return _open(url, post={"data": payload})
298 299
def _open(url, post=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    url - complete URL (string) to request.
    post - optional dictionary of form values; if given (and non-empty) it
           is URL encoded and sent as the request body, otherwise a plain
           request is made.

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    The time of the previous request is kept in the _open.previous function
    attribute.
    """
    delay = 0.333333333  # one third of a second
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        # Record when this request is actually sent, i.e. after the sleep.
        _open.previous = current + wait
    else:
        _open.previous = current

    # Any urllib2.HTTPError is simply allowed to propagate to the caller.
    # We trust TogoWS to have set an HTTP error code, that suffices for
    # the current unit tests. Previously we would examine the start of
    # the data returned back.
    if post:
        handle = urllib2.urlopen(url, _as_bytes(urllib.urlencode(post)))
    else:
        handle = urllib2.urlopen(url)
    return _binary_to_string_handle(handle)


# Timestamp of the most recent request, zero means none made yet.
_open.previous = 0