Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15  Variables: 
 16  email        Set the Entrez email parameter (default is not set). 
 17  tool         Set the Entrez tool parameter (default is  biopython). 
 18   
 19  Functions: 
 20  efetch       Retrieves records in the requested format from a list of one or 
 21               more primary IDs or from the user's environment 
 22  epost        Posts a file containing a list of primary IDs for future use in 
 23               the user's environment to use with subsequent search strategies 
 24  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 25               and ESummary) and term translations and optionally retains 
 26               results for future use in the user's environment. 
 27  elink        Checks for the existence of an external or Related Articles link 
 28               from a list of one or more primary IDs.  Retrieves primary IDs 
 29               and relevancy scores for links to Entrez databases or Related 
 30               Articles;  creates a hyperlink to the primary LinkOut provider 
 31               for a specific ID and database, or lists LinkOut URLs 
 32               and Attributes for multiple IDs. 
 33  einfo        Provides field index term counts, last update, and available 
 34               links for each database. 
 35  esummary     Retrieves document summaries from a list of primary IDs or from 
 36               the user's environment. 
 37  egquery      Provides Entrez database counts in XML for a single search 
 38               using Global Query. 
 39  espell       Retrieves spelling suggestions. 
 40   
 41  read         Parses the XML results returned by any of the above functions. 
 42               Typical usage is: 
 43   
 44               >>> from Bio import Entrez 
 45               >>> Entrez.email = "Your.Name.Here@example.org" 
 46               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 47               >>> record = Entrez.read(handle) 
 48               >>> handle.close() 
 49   
 50               where record is now a Python dictionary or list. 
 51   
 52  parse        Parses the XML results returned by those of the above functions 
 53               which can return multiple records - such as efetch, esummary 
 54               and elink. Typical usage is: 
 55   
 56               >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml") 
 57               >>> records = Entrez.parse(handle) 
 58               >>> for record in records: 
 59               ...     # each record is a Python dictionary or list. 
 60               ...     print record['MedlineCitation']['Article']['ArticleTitle'] 
 61               Biopython: freely available Python tools for computational molecular biology and bioinformatics. 
 62               PDB file parser and structure class implemented in Python. 
 63               >>> handle.close() 
 64   
 65               This function is appropriate only if the XML file contains 
 66               multiple records, and is particularly useful for large files.  
 67   
 68  _open        Internally used function. 
 69   
 70  """ 
 71  import urllib, urllib2, time, warnings 
 72  import os.path 
 73   
 74  from Bio._py3k import _binary_to_string_handle 
 75   
 76  email = None  # Sent to NCBI as the "email" parameter by _open unless the caller supplies one; None means not set.
 77  tool = "biopython"  # Sent to NCBI as the "tool" parameter by _open unless the caller supplies one.
 78   
 79   
 80  # XXX retmode? 
def epost(db, **keywds):
    """Post a file of identifiers for future use.

    EPost uploads a list of UIs so that they are kept in the user's
    environment and can be used by subsequent search strategies.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    db is the Entrez database name; any further keyword arguments are
    passed on unchanged as request parameters.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # The request is sent with post=True, unlike the other E-utilities here.
    query = dict(db=db, **keywds)
    return _open(url, query, post=True)
98
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    db is the Entrez database name; any further keyword arguments are
    passed on as request parameters.  The "id" keyword may be a single
    string, or a list or tuple of IDs (strings or integers), in which case
    the IDs are sent as one comma separated value.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print handle.readline().strip() #doctest: +NORMALIZE_WHITESPACE
    LOCUS AY851612 892 bp DNA linear PLN 10-APR-2007
    >>> handle.close()

    Warning: The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    ids = variables.get("id")
    if isinstance(ids, (list, tuple)):
        # Fix for NCBI change (probably part of EFetch 2.0, Feb 2012) where
        # a sequence of IDs now gives HTTP Error 500: Internal server error.
        # It was turned into ...&id=22307645&id=22303114&... which used to
        # work, while now the NCBI appear to insist on
        # ...&id=22301129,22299544,...
        # "%s" % (uid,) leaves str/unicode IDs untouched and converts
        # integer IDs, so mixed sequences can be joined safely.
        variables["id"] = ",".join("%s" % (uid,) for uid in ids)
    return _open(cgi, variables)
136
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    db is the Entrez database name and term the query text; any further
    keyword arguments are passed on unchanged as request parameters.

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    query = dict(db=db, term=term, **keywds)
    return _open(url, query)
171 209
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    All keyword arguments are passed on unchanged as request parameters.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open a fresh dictionary holding the caller's parameters.
    return _open(url, dict(keywds))
236
def esummary(**keywds):
    """Retrieve document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    All keyword arguments are passed on unchanged as request parameters.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["Id"]
    30367
    >>> print record[0]["Title"]
    Computational biology and chemistry

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open a fresh dictionary holding the caller's parameters.
    return _open(url, dict(keywds))
267
def egquery(**keywds):
    """Provide Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    All keyword arguments are passed on unchanged as request parameters.

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print row["Count"] > 60
    True

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open a fresh dictionary holding the caller's parameters.
    return _open(url, dict(keywds))
300
def espell(**keywds):
    """Retrieve spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    All keyword arguments are passed on unchanged as request parameters.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print record["Query"]
    biopythooon
    >>> print record["CorrectedQuery"]
    biopython

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open a fresh dictionary holding the caller's parameters.
    return _open(url, dict(keywds))
328
def read(handle, validate=True):
    """Parse an XML file from the NCBI Entrez Utilities into python objects.

    This function parses an XML file created by NCBI's Entrez Utilities,
    returning a multilevel data structure of Python lists and dictionaries.
    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available. Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    If validate is True (default), the parser will validate the XML file
    against the DTD, and raise an error if the XML file contains tags that
    are not represented in the DTD. If validate is False, the parser will
    simply skip such tags.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type. This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).read(handle)
353
def parse(handle, validate=True):
    """Parse an XML file from the NCBI Entrez Utilities into python objects.

    This function parses an XML file created by NCBI's Entrez Utilities,
    returning a multilevel data structure of Python lists and dictionaries.
    This function is suitable for XML files that (in Python) can be
    represented as a list of individual records. Whereas 'read' reads the
    complete file and returns a single Python list, 'parse' returns the
    records one by one (it hands back whatever DataHandler.parse returns,
    which is expected to be a generator). This function is therefore
    particularly useful for parsing large files.

    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available. Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    If validate is True (default), the parser will validate the XML file
    against the DTD, and raise an error if the XML file contains tags that
    are not represented in the DTD. If validate is False, the parser will
    simply skip such tags.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type. This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    return DataHandler(validate).parse(handle)
384
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it (it is copied,
    never modified; entries whose value is None are left out of the
    request).  If post is True the options are sent with HTTP POST,
    otherwise they are appended to the URL and sent with HTTP GET.

    The module level "tool" and "email" settings are added to the request
    unless params already provides them; a UserWarning is issued if no
    email address is available.

    Errors raised by urllib2.urlopen (e.g. urllib2.HTTPError) are passed
    straight on to the caller.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.

    Returns the response wrapped by _binary_to_string_handle.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Work on a private copy without the None values, so that neither the
    # caller's dictionary nor a shared default argument is ever altered.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    # doseq=True so that sequence values become repeated parameters.
    options = urllib.urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = urllib2.urlopen(cgi, data=options)
    else:
        # HTTP GET
        handle = urllib2.urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)


# Time of the previous request, used by _open for rate limiting.
_open.previous = 0


def _test():
    """Run the module's doctests (PRIVATE).

    Prints a short message before and after calling doctest.testmod().
    """
    import doctest
    # A parenthesised single string prints identically as a Python 2
    # print statement.
    print("Runing doctests...")
    doctest.testmod()
    print("Done")


# Run the doctests when the module is executed as a script.
if __name__ == "__main__":
    _test()