Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15  Variables: 
 16  email        Set the Entrez email parameter (default is not set). 
 17  tool         Set the Entrez tool parameter (default is  biopython). 
 18   
 19  Functions: 
 20  efetch       Retrieves records in the requested format from a list of one or 
 21               more primary IDs or from the user's environment 
 22  epost        Posts a file containing a list of primary IDs for future use in 
 23               the user's environment to use with subsequent search strategies 
 24  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 25               and ESummary) and term translations and optionally retains 
 26               results for future use in the user's environment. 
 27  elink        Checks for the existence of an external or Related Articles link 
 28               from a list of one or more primary IDs.  Retrieves primary IDs 
 29               and relevancy scores for links to Entrez databases or Related 
 30               Articles;  creates a hyperlink to the primary LinkOut provider 
 31               for a specific ID and database, or lists LinkOut URLs 
 32               and Attributes for multiple IDs. 
 33  einfo        Provides field index term counts, last update, and available 
 34               links for each database. 
 35  esummary     Retrieves document summaries from a list of primary IDs or from 
 36               the user's environment. 
 37  egquery      Provides Entrez database counts in XML for a single search 
 38               using Global Query. 
 39  espell       Retrieves spelling suggestions. 
 40   
 41  read         Parses the XML results returned by any of the above functions. 
 42               Typical usage is: 
 43   
 44               >>> from Bio import Entrez 
 45               >>> Entrez.email = "Your.Name.Here@example.org" 
 46               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 47               >>> record = Entrez.read(handle) 
 48               >>> handle.close() 
 49   
 50               where record is now a Python dictionary or list. 
 51   
 52  parse        Parses the XML results returned by those of the above functions 
 53               which can return multiple records - such as efetch, esummary 
 54               and elink. Typical usage is: 
 55   
 56               >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml") 
 57               >>> records = Entrez.parse(handle) 
 58               >>> for record in records: 
 59               ...     # each record is a Python dictionary or list. 
 60               ...     print record['MedlineCitation']['Article']['ArticleTitle'] 
 61               Biopython: freely available Python tools for computational molecular biology and bioinformatics. 
 62               PDB file parser and structure class implemented in Python. 
 63               >>> handle.close() 
 64   
 65               This function is appropriate only if the XML file contains 
 66               multiple records, and is particularly useful for large files. 
 67   
 68  _open        Internally used function. 
 69   
 70  """ 
 71  import urllib 
 72  import urllib2 
 73  import time 
 74  import warnings 
 75  import os.path 
 76   
 77  from Bio._py3k import _binary_to_string_handle 
 78   
 79  email = None 
 80  tool = "biopython" 
 81   
 82   
 83  # XXX retmode? 
def epost(db, **keywds):
    """Upload a list of identifiers to Entrez with EPost.

    EPost stores a list of UIs in the user's environment so that they can
    be referred to by subsequent search strategies.

    db is the target database; every additional keyword argument is passed
    on unchanged as an EPost parameter.  The request is sent as an HTTP
    POST.  The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # The database name goes in first, then whatever the caller supplied.
    params = dict(db=db)
    params.update(keywds)
    return _open(url, params, post=True)
101 102
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    db is the database to fetch from; all other keyword arguments are
    passed on as EFetch parameters.  If "id" is given as a list or tuple,
    its items (strings or integers) are joined into the single
    comma-separated value that the NCBI expects.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print handle.readline().strip()
    LOCUS       AY851612                 892 bp    DNA     linear   PLN 10-APR-2007
    >>> handle.close()

    Warning: The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    ids = keywds.get("id")
    if isinstance(ids, (list, tuple)):
        # Fix for NCBI change (probably part of EFetch 2.0, Feb 2012) where
        # a sequence of IDs now gives HTTP Error 500: Internal server error.
        # A sequence used to be encoded as ...&id=22307645&id=22303114&...
        # but the NCBI now appear to insist on ...&id=22301129,22299544,...
        # str() lets callers pass integer IDs as well as strings.
        variables["id"] = ",".join(map(str, ids))
    return _open(cgi, variables)
140 141
def esearch(db, term, **keywds):
    """Run an Entrez search with ESearch and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    db is the database to search and term is the query; every additional
    keyword argument is passed on unchanged as an ESearch parameter.  The
    parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # Mandatory parameters first, then the caller's optional ones.
    params = {'db': db}
    params['term'] = term
    params.update(keywds)
    return _open(url, params)
176 177 215 216
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    All keyword arguments are passed on unchanged as EInfo parameters;
    they are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open its own dictionary holding the caller's parameters.
    params = dict(keywds)
    return _open(url, params)
243 244
def esummary(**keywds):
    """Retrieve document summaries with ESummary as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    All keyword arguments are passed on unchanged as ESummary parameters;
    they are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["Id"]
    30367
    >>> print record[0]["Title"]
    Computational biology and chemistry

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open its own dictionary holding the caller's parameters.
    params = dict(keywds)
    return _open(url, params)
275 276
def egquery(**keywds):
    """Return Entrez database counts for a global search as a handle.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    All keyword arguments are passed on unchanged as EGQuery parameters;
    they are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print row["Count"] > 60
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open its own dictionary holding the caller's parameters.
    params = dict(keywds)
    return _open(url, params)
309 310
def espell(**keywds):
    """Retrieve spelling suggestions with ESpell as a results handle.

    ESpell retrieves spelling suggestions, if available.

    All keyword arguments are passed on unchanged as ESpell parameters;
    they are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print record["Query"]
    biopythooon
    >>> print record["CorrectedQuery"]
    biopython

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open its own dictionary holding the caller's parameters.
    params = dict(keywds)
    return _open(url, params)
338 339
def read(handle, validate=True):
    """Parse an NCBI Entrez Utilities XML file into Python objects.

    The XML file created by NCBI's Entrez Utilities is read from handle
    and returned as a multilevel data structure of Python lists and
    dictionaries.  Most XML files returned by NCBI's Entrez Utilities can
    be parsed by this function, provided its DTD is available.  Biopython
    includes the DTDs for most commonly used Entrez Utilities.

    If validate is True (default), the parser will validate the XML file
    against the DTD, and raise an error if the XML file contains tags that
    are not represented in the DTD.  If validate is False, the parser will
    simply skip such tags.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type.  This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).read(handle)
364 365
def parse(handle, validate=True):
    """Parse a multi-record NCBI Entrez Utilities XML file record by record.

    The XML file created by NCBI's Entrez Utilities is read from handle
    and converted into a multilevel data structure of Python lists and
    dictionaries.  This function is suitable for XML files that (in
    Python) can be represented as a list of individual records.  Whereas
    'read' reads the complete file and returns a single Python list,
    'parse' is a generator function that returns the records one by one.
    This function is therefore particularly useful for parsing large files.

    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available.  Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    If validate is True (default), the parser will validate the XML file
    against the DTD, and raise an error if the XML file contains tags that
    are not represented in the DTD.  If validate is False, the parser will
    simply skip such tags.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type.  This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).parse(handle)
396 397
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it (it is not
    modified; entries whose value is None are left out of the request).
    If post is true the options are sent as an HTTP POST, otherwise they
    are appended to the URL as an HTTP GET query string.

    The module level tool and email settings are added to the request
    unless params already contains them; a UserWarning is issued if no
    email address is available.

    Any error raised by urllib2.urlopen (for example urllib2.HTTPError,
    which is an IOError) is passed on to the caller unchanged.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of a second between queries.
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Work on a private copy without the None values, so that neither the
    # caller's dictionary nor a default shared between calls is altered.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.  doseq=True encodes a sequence value as one
    # key=value pair per item.
    options = urllib.urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = urllib2.urlopen(cgi, data=options)
    else:
        # HTTP GET
        handle = urllib2.urlopen(cgi + "?" + options)
    return _binary_to_string_handle(handle)


# Time of the most recent query, used by _open for rate limiting.
_open.previous = 0
def _test():
    """Run the module's doctests (PRIVATE).

    Prints a short message before and after calling doctest.testmod().
    """
    import doctest
    print("Running doctests...")
    doctest.testmod()
    print("Done")


if __name__ == "__main__":
    _test()