Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15  Variables: 
 16  email        Set the Entrez email parameter (default is not set). 
 17  tool         Set the Entrez tool parameter (default is  biopython). 
 18   
 19  Functions: 
 20  efetch       Retrieves records in the requested format from a list of one or 
 21               more primary IDs or from the user's environment 
 22  epost        Posts a file containing a list of primary IDs for future use in 
 23               the user's environment to use with subsequent search strategies 
 24  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 25               and ESummary) and term translations and optionally retains 
 26               results for future use in the user's environment. 
 27  elink        Checks for the existence of an external or Related Articles link 
 28               from a list of one or more primary IDs.  Retrieves primary IDs 
 29               and relevancy scores for links to Entrez databases or Related 
 30               Articles;  creates a hyperlink to the primary LinkOut provider 
 31               for a specific ID and database, or lists LinkOut URLs 
 32               and Attributes for multiple IDs. 
 33  einfo        Provides field index term counts, last update, and available 
 34               links for each database. 
 35  esummary     Retrieves document summaries from a list of primary IDs or from 
 36               the user's environment. 
 37  egquery      Provides Entrez database counts in XML for a single search 
 38               using Global Query. 
 39  espell       Retrieves spelling suggestions. 
 40   
 41  read         Parses the XML results returned by any of the above functions. 
 42               Typical usage is: 
 43   
 44               >>> from Bio import Entrez 
 45               >>> Entrez.email = "Your.Name.Here@example.org" 
 46               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 47               >>> record = Entrez.read(handle) 
 48               >>> handle.close() 
 49   
 50               where record is now a Python dictionary or list. 
 51   
 52  parse        Parses the XML results returned by those of the above functions 
 53               which can return multiple records - such as efetch, esummary 
 54               and elink. Typical usage is: 
 55   
 56               >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml") 
 57               >>> records = Entrez.parse(handle) 
 58               >>> for record in records: 
 59               ...     # each record is a Python dictionary or list. 
 60               ...     print(record['MedlineCitation']['Article']['ArticleTitle']) 
 61               Biopython: freely available Python tools for computational molecular biology and bioinformatics. 
 62               PDB file parser and structure class implemented in Python. 
 63               >>> handle.close() 
 64   
 65               This function is appropriate only if the XML file contains 
 66               multiple records, and is particularly useful for large files. 
 67   
 68  _open        Internally used function. 
 69   
 70  """ 
 71  from __future__ import print_function 
 72   
 73  import time 
 74  import warnings 
 75  import os.path 
 76   
 77  #Importing these functions with leading underscore as not intended for reuse 
 78  from Bio._py3k import urlopen as _urlopen 
 79  from Bio._py3k import urlencode as _urlencode 
 80  from Bio._py3k import HTTPError as _HTTPError 
 81   
 82  from Bio._py3k import _binary_to_string_handle, _as_bytes 
 83   
 84  email = None 
 85  tool = "biopython" 
 86   
 87   
 88  # XXX retmode? 
def epost(db, **keywds):
    """Upload a list of identifiers to Entrez for later use.

    EPost stores a list of UIs in the user's environment so that
    subsequent search strategies can refer to them.  The request is
    submitted to NCBI as an HTTP POST.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Returns a handle to the results.

    An IOError exception is raised if there is a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # 'db' is a named parameter, so it can never also appear in keywds.
    query = dict(db=db, **keywds)
    return _open(url, query, post=True)
106 107
def efetch(db, **keywords):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    The optional ``id`` keyword may be given as a comma separated string,
    as a single integer, or as a list/tuple of strings and/or integers;
    it is always converted to a comma separated string before the query
    is submitted.  An ``id`` of None is treated as "not supplied".

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print(handle.readline().strip())
    LOCUS       AY851612                 892 bp    DNA     linear   PLN 10-APR-2007
    >>> handle.close()

    Warning: The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywords)
    post = False
    ids = variables.get("id")
    if ids is not None:
        if isinstance(ids, (list, tuple)):
            # Accept numeric identifiers as well as strings; str.join()
            # alone would raise TypeError on a list of integers.
            ids = ",".join(str(uid) for uid in ids)
        elif isinstance(ids, int):
            # A lone numeric identifier has no .count() method.
            ids = str(ids)
        variables["id"] = ids
        if ids.count(",") >= 200:
            # NCBI prefers an HTTP POST instead of an HTTP GET if there are
            # more than about 200 IDs
            post = True
    return _open(cgi, variables, post)
150 151
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch looks up primary IDs (which can then be passed to EFetch,
    ELink and ESummary) together with term translations, and can
    optionally keep the results in the user's environment for later use.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Returns a handle to the results, which are always in XML format.

    An IOError exception is raised if there is a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # 'db' and 'term' are named parameters, so keywds cannot shadow them.
    query = dict(db=db, term=term, **keywds)
    return _open(url, query)
186 187 225 226
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo reports field names, index term counts, the last update and
    the available links for each Entrez database.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Returns a handle to the results, by default in XML format.

    An IOError exception is raised if there is a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
253 254
def esummary(**keywds):
    """Retrieve document summaries as a results handle.

    ESummary returns document summaries for a list of primary IDs, or
    for records held in the user's environment.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Returns a handle to the results, by default in XML format.

    An IOError exception is raised if there is a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print(record[0]["Id"])
    30367
    >>> print(record[0]["Title"])
    Computational biology and chemistry

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
285 286
def egquery(**keywds):
    """Return Entrez database counts for a global search.

    EGQuery reports, in XML, the number of hits in each Entrez database
    for a single search made with Global Query.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Returns a handle to the results in XML format.

    An IOError exception is raised if there is a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print(row["Count"] > 60)
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
319 320
def espell(**keywds):
    """Retrieve spelling suggestions as a results handle.

    ESpell returns spelling suggestions for a query, if any are available.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Returns a handle to the results, by default in XML format.

    An IOError exception is raised if there is a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print(record["Query"])
    biopythooon
    >>> print(record["CorrectedQuery"])
    biopython

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
348 349
def read(handle, validate=True):
    """Parse an NCBI Entrez Utilities XML file into Python objects.

    The whole XML document available from handle is parsed and returned
    as one multilevel data structure of Python lists and dictionaries.
    Most XML files produced by NCBI's Entrez Utilities can be handled,
    provided the matching DTD is available; Biopython ships the DTDs for
    the most commonly used Entrez Utilities.

    With validate set to True (the default) the parser checks the XML
    against the DTD and raises an error when the file contains tags that
    the DTD does not describe.  With validate set to False such tags are
    simply skipped.

    Although the result looks like it is built from generic Python lists,
    dictionaries, strings and so on, each item is really an instance of a
    class derived from the base type.  This makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from .Parser import DataHandler
    return DataHandler(validate).read(handle)
374 375
def parse(handle, validate=True):
    """Parse an NCBI Entrez Utilities XML file record by record.

    This is intended for XML files that (in Python) can be represented as
    a list of individual records.  Where 'read' consumes the complete
    file and returns a single Python list, 'parse' hands back the records
    one by one (as produced by DataHandler.parse), which makes it
    particularly useful for large files.

    Most XML files produced by NCBI's Entrez Utilities can be handled,
    provided the matching DTD is available; Biopython ships the DTDs for
    the most commonly used Entrez Utilities.

    With validate set to True (the default) the parser checks the XML
    against the DTD and raises an error when the file contains tags that
    the DTD does not describe.  With validate set to False such tags are
    simply skipped.

    Although each record looks like it is built from generic Python lists,
    dictionaries, strings and so on, each item is really an instance of a
    class derived from the base type.  This makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from .Parser import DataHandler
    return DataHandler(validate).parse(handle)
406 407
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it (None, the
    default, means no options).  The dictionary is never modified; a
    private copy is used to drop None values and to add the default
    "tool" and "email" entries.  If post is true the options are sent as
    the body of an HTTP POST, otherwise they are appended to the URL of an
    HTTP GET.

    Any exception raised by the underlying urlopen call (for example an
    HTTP error) propagates unchanged to the caller.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.  The time of the previous query is
    kept in the function attribute _open.previous.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Build a private copy without the None values.  Copying (instead of
    # deleting keys in place) keeps the caller's dictionary untouched and
    # avoids sharing state between calls through a mutable default.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)
    return _binary_to_string_handle(handle)


# Time of the most recent query; zero means no query has been made yet.
_open.previous = 0
def _test():
    """Execute the doctests embedded in this module (PRIVATE)."""
    from doctest import testmod
    print("Running doctests...")
    testmod()
    print("Done")


if __name__ == "__main__":
    _test()