Package Bio :: Package Entrez
[hide private]
[frames | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15  Variables: 
 16   
 17      - email        Set the Entrez email parameter (default is not set). 
 18      - tool         Set the Entrez tool parameter (default is  biopython). 
 19   
 20  Functions: 
 21   
 22      - efetch       Retrieves records in the requested format from a list of one or 
 23        more primary IDs or from the user's environment 
 24      - epost        Posts a file containing a list of primary IDs for future use in 
 25        the user's environment to use with subsequent search strategies 
 26      - esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 27        and ESummary) and term translations and optionally retains 
 28        results for future use in the user's environment. 
 29      - elink        Checks for the existence of an external or Related Articles link 
 30        from a list of one or more primary IDs.  Retrieves primary IDs 
 31        and relevancy scores for links to Entrez databases or Related 
 32        Articles;  creates a hyperlink to the primary LinkOut provider 
 33        for a specific ID and database, or lists LinkOut URLs 
 34        and Attributes for multiple IDs. 
 35      - einfo        Provides field index term counts, last update, and available 
 36        links for each database. 
 37      - esummary     Retrieves document summaries from a list of primary IDs or from 
 38        the user's environment. 
 39      - egquery      Provides Entrez database counts in XML for a single search 
 40        using Global Query. 
 41      - espell       Retrieves spelling suggestions. 
 42   
 43      - read         Parses the XML results returned by any of the above functions. 
 44        Typical usage is: 
 45   
 46            >>> from Bio import Entrez 
 47            >>> Entrez.email = "Your.Name.Here@example.org" 
 48            >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 49            >>> record = Entrez.read(handle) 
 50            >>> handle.close() 
 51   
 52         where record is now a Python dictionary or list. 
 53   
 54      - parse        Parses the XML results returned by those of the above functions 
 55        which can return multiple records - such as efetch, esummary 
 56        and elink. Typical usage is: 
 57   
 58            >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml") 
 59            >>> records = Entrez.parse(handle) 
 60            >>> for record in records: 
 61            ...     # each record is a Python dictionary or list. 
 62            ...     print(record['MedlineCitation']['Article']['ArticleTitle']) 
 63            Biopython: freely available Python tools for computational molecular biology and bioinformatics. 
 64            PDB file parser and structure class implemented in Python. 
 65            >>> handle.close() 
 66   
 67        This function is appropriate only if the XML file contains 
 68        multiple records, and is particularly useful for large files. 
 69   
 70      - _open        Internally used function. 
 71   
 72  """ 
 73  from __future__ import print_function 
 74   
 75  import time 
 76  import warnings 
 77  import os.path 
 78   
 79  # Importing these functions with leading underscore as not intended for reuse 
 80  from Bio._py3k import urlopen as _urlopen 
 81  from Bio._py3k import urlencode as _urlencode 
 82  from Bio._py3k import HTTPError as _HTTPError 
 83   
 84  from Bio._py3k import _binary_to_string_handle, _as_bytes 
 85   
 86  __docformat__ = "restructuredtext en" 
 87   
 88  email = None 
 89  tool = "biopython" 
 90   
 91   
 92  # XXX retmode? 
def epost(db, **keywds):
    """Post a file of identifiers for future use.

    Sends a list of UIs to Entrez so that they are kept in the user's
    environment and can be used with subsequent search strategies.
    The request is always made with HTTP POST.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # The database name comes first, followed by the caller's extra options.
    query = dict(db=db, **keywds)
    return _open(url, query, post=True)
110 111
def efetch(db, **keywords):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    The optional ``id`` keyword may be a comma separated string of UIs, a
    single integer UI, or a list of UIs (strings and/or integers); lists are
    joined with commas before the request is made.  An ``id`` of None is
    treated as "not given".

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print(handle.readline().strip())  # doctest: +NORMALIZE_WHITESPACE
    LOCUS AY851612 892 bp DNA linear PLN 10-APR-2007
    >>> handle.close()

    **Warning:** The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywords)
    post = False
    ids = variables.get("id")
    if ids is not None:
        if isinstance(ids, list):
            # str() each element so that lists of integer UIs work too.
            ids = ",".join(str(uid) for uid in ids)
            variables["id"] = ids
        elif isinstance(ids, int):
            # A single numeric UI; normalise it to text so it can be
            # inspected like any other ID string below.
            ids = str(ids)
            variables["id"] = ids
        if ids.count(",") >= 200:
            # NCBI prefers an HTTP POST instead of an HTTP GET if there are
            # more than about 200 IDs
            post = True
    return _open(cgi, variables, post)
154 155
def esearch(db, term, **keywds):
    """ESearch runs an Entrez search and returns a handle to the results.

    ESearch looks up primary IDs (which can then be fed to EFetch, ELink
    and ESummary) together with term translations, and can optionally
    keep the results in the user's environment for later use.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # Mandatory parameters first, then any additional options.
    query = dict(db=db, term=term, **keywds)
    return _open(url, query)
190 191 229 230
def einfo(**keywds):
    """EInfo returns a summary of the Entrez databases as a results handle.

    EInfo reports field names, index term counts, the last update and
    the available links for each Entrez database.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open its own dictionary built from the keyword arguments.
    return _open(url, dict(keywds))
257 258
def esummary(**keywds):
    """ESummary retrieves document summaries as a results handle.

    The summaries are looked up for a list of primary IDs, or taken
    from the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print(record[0]["Id"])
    30367
    >>> print(record[0]["Title"])
    Computational biology and chemistry

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open its own dictionary built from the keyword arguments.
    return _open(url, dict(keywds))
289 290
def egquery(**keywds):
    """EGQuery provides Entrez database counts for a global search.

    A single search is run with Global Query and the number of hits in
    each Entrez database is reported in XML.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print(row["Count"] > 60)
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open its own dictionary built from the keyword arguments.
    return _open(url, dict(keywds))
323 324
def espell(**keywds):
    """ESpell retrieves spelling suggestions, returned in a results handle.

    Spelling suggestions are returned when they are available.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print(record["Query"])
    biopythooon
    >>> print(record["CorrectedQuery"])
    biopython

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open its own dictionary built from the keyword arguments.
    return _open(url, dict(keywds))
352 353
def read(handle, validate=True):
    """Parses an XML file from the NCBI Entrez Utilities into python objects.

    The XML produced by NCBI's Entrez Utilities is converted into a
    multilevel data structure of Python lists and dictionaries.  Most
    such XML files can be handled, provided the DTD is available;
    Biopython includes the DTDs for most commonly used Entrez Utilities.

    With validate set to True (the default) the XML is validated against
    the DTD and an error is raised for tags that are not represented in
    the DTD.  With validate set to False such tags are simply skipped.

    Although the result looks like generic Python lists, dictionaries,
    strings, and so on, each object is really an instance of a class
    derived from the base type.  This makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from .Parser import DataHandler
    return DataHandler(validate).read(handle)
378 379
def parse(handle, validate=True):
    """Parses an XML file from the NCBI Entrez Utilities into python objects.

    The XML produced by NCBI's Entrez Utilities is converted into a
    multilevel data structure of Python lists and dictionaries.  Use this
    function for XML files that (in Python) can be represented as a list
    of individual records.  Where 'read' consumes the complete file and
    returns a single Python list, 'parse' hands back the records one by
    one, which makes it particularly useful for parsing large files.

    Most XML files returned by NCBI's Entrez Utilities can be handled,
    provided the DTD is available; Biopython includes the DTDs for most
    commonly used Entrez Utilities.

    With validate set to True (the default) the XML is validated against
    the DTD and an error is raised for tags that are not represented in
    the DTD.  With validate set to False such tags are simply skipped.

    Although the result looks like generic Python lists, dictionaries,
    strings, and so on, each object is really an instance of a class
    derived from the base type.  This makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from .Parser import DataHandler
    return DataHandler(validate).parse(handle)
410 411
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it (None, the
    default, means no options).  The dictionary passed in is not modified;
    entries whose value is None are left out of the request.  If post is
    true the options are sent with HTTP POST, otherwise they are appended
    to the URL and sent with HTTP GET.

    The module level ``tool`` and ``email`` values are added to the request
    unless the caller supplied them explicitly; a UserWarning is issued if
    no email address is available.

    Errors raised while opening the URL (documented for the public
    functions as IOError on network problems) propagate to the caller.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Build a private copy without the None values.  Copying avoids both
    # mutating the caller's dictionary and leaking tool/email between calls
    # through a shared default argument.
    params = dict((key, value) for key, value in (params or {}).items()
                  if value is not None)
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request. As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)


# Time of the most recent request, used by _open for rate limiting.
_open.previous = 0
def _test():
    """Run the module's doctests (PRIVATE).

    Prints a short message before and after running doctest.testmod().
    """
    import doctest
    print("Running doctests...")
    doctest.testmod()
    print("Done")
# Run the doctests when this file is executed as a script.
if __name__ == "__main__":
    _test()