1
2
3
4
5
6
7 """Provides code to access NCBI over the WWW.
8
9 The main Entrez web page is available at:
10 http://www.ncbi.nlm.nih.gov/Entrez/
11
12 A list of the Entrez utilities is available at:
13 http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
14
15 Variables:
16 email Set the Entrez email parameter (default is not set).
17 tool Set the Entrez tool parameter (default is biopython).
18
19 Functions:
20 efetch Retrieves records in the requested format from a list of one or
21 more primary IDs or from the user's environment
22 epost Posts a file containing a list of primary IDs for future use in
23 the user's environment to use with subsequent search strategies
24 esearch Searches and retrieves primary IDs (for use in EFetch, ELink,
25 and ESummary) and term translations and optionally retains
26 results for future use in the user's environment.
27 elink Checks for the existence of an external or Related Articles link
28 from a list of one or more primary IDs. Retrieves primary IDs
29 and relevancy scores for links to Entrez databases or Related
30 Articles; creates a hyperlink to the primary LinkOut provider
31 for a specific ID and database, or lists LinkOut URLs
32 and Attributes for multiple IDs.
33 einfo Provides field index term counts, last update, and available
34 links for each database.
35 esummary Retrieves document summaries from a list of primary IDs or from
36 the user's environment.
37 egquery Provides Entrez database counts in XML for a single search
38 using Global Query.
39 espell Retrieves spelling suggestions.
40
41 read Parses the XML results returned by any of the above functions.
42 Typical usage is:
43
44 >>> from Bio import Entrez
45 >>> Entrez.email = "Your.Name.Here@example.org"
46 >>> handle = Entrez.einfo() # or esearch, efetch, ...
47 >>> record = Entrez.read(handle)
48 >>> handle.close()
49
50 where record is now a Python dictionary or list.
51
52 parse Parses the XML results returned by those of the above functions
53 which can return multiple records - such as efetch, esummary
54 and elink. Typical usage is:
55
56 >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml")
57 >>> records = Entrez.parse(handle)
58 >>> for record in records:
59 ... # each record is a Python dictionary or list.
60 ... print record['MedlineCitation']['Article']['ArticleTitle']
61 Biopython: freely available Python tools for computational molecular biology and bioinformatics.
62 PDB file parser and structure class implemented in Python.
63 >>> handle.close()
64
65 This function is appropriate only if the XML file contains
66 multiple records, and is particularly useful for large files.
67
68 _open Internally used function.
69
70 """
71 import urllib
72 import urllib2
73 import time
74 import warnings
75 import os.path
76
77 from Bio._py3k import _binary_to_string_handle
78
# Address sent as the "email" request parameter when a call does not supply
# one itself; while it is None, _open() issues a UserWarning instead.
email = None
# Value sent as the "tool" request parameter when a call does not supply one.
tool = "biopython"
81
82
83
def epost(db, **keywds):
    """Post a file of identifiers for future use.

    Posts a file containing a list of UIs for future use in the user's
    environment to use with subsequent search strategies.

    db is sent as the "db" request parameter; every other keyword argument
    is forwarded as a request parameter as well.  The request is submitted
    with post=True.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    query = dict(db=db)
    query.update(keywds)
    return _open(endpoint, query, post=True)
101
102
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is inferred from the names the body uses -- confirm.
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    db is sent as the "db" request parameter; every other keyword argument
    is forwarded as a request parameter.  When "id" is a Python list it is
    sent as one comma-separated value, and its items may be strings or
    integers.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print handle.readline().strip()  # doctest: +NORMALIZE_WHITESPACE
    LOCUS AY851612 892 bp DNA linear PLN 10-APR-2007
    >>> handle.close()

    Warning: The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    ids = variables.get("id")
    if isinstance(ids, list):
        # Collapse a list of identifiers into the single comma-separated
        # value sent as "id".  Formatting each item with "%s" lets integer
        # identifiers through, where a plain str.join() raises TypeError.
        variables["id"] = ",".join("%s" % (item,) for item in ids)
    return _open(cgi, variables)
140
141
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is inferred from the names the body uses -- confirm.
def esearch(db, term, **keywds):
    """ESearch runs an Entrez search and returns a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    db and term are sent as the "db" and "term" request parameters; every
    other keyword argument is forwarded as a request parameter.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    variables = {'db': db,
                 'term': term}
    variables.update(keywds)
    return _open(cgi, variables)
176
177
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is inferred from the names the body uses -- confirm.
def elink(**keywds):
    """ELink checks for linked external articles and returns a handle.

    ELink checks for the existence of an external or Related Articles link
    from a list of one or more primary IDs;  retrieves IDs and relevancy
    scores for links to Entrez databases or Related Articles; creates a
    hyperlink to the primary LinkOut provider for a specific ID and
    database, or lists LinkOut URLs and attributes for multiple IDs.

    All keyword arguments are forwarded as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example finds articles related to the Biopython application
    note's entry in the PubMed database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> pmid = "19304878"
    >>> handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["LinkSetDb"][0]["LinkName"]
    pubmed_pubmed
    >>> linked = [link["Id"] for link in record[0]["LinkSetDb"][0]["Link"]]
    >>> "17121776" in linked
    True

    This is explained in much more detail in the Biopython Tutorial.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    # Hand _open() its own dictionary holding the caller's parameters.
    return _open(cgi, dict(keywds))
215
216
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is inferred from the names the body uses -- confirm.
def einfo(**keywds):
    """EInfo returns a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    All keyword arguments are forwarded as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open() its own dictionary holding the caller's parameters.
    return _open(cgi, dict(keywds))
243
244
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is inferred from the names the body uses -- confirm.
def esummary(**keywds):
    """ESummary retrieves document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    All keyword arguments are forwarded as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["Id"]
    30367
    >>> print record[0]["Title"]
    Computational biology and chemistry

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open() its own dictionary holding the caller's parameters.
    return _open(cgi, dict(keywds))
275
276
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is inferred from the names the body uses -- confirm.
def egquery(**keywds):
    """EGQuery provides Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    All keyword arguments are forwarded as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print row["Count"] > 60
    True

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open() its own dictionary holding the caller's parameters.
    return _open(cgi, dict(keywds))
309
310
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is inferred from the names the body uses -- confirm.
def espell(**keywds):
    """ESpell retrieves spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    All keyword arguments are forwarded as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print record["Query"]
    biopythooon
    >>> print record["CorrectedQuery"]
    biopython

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open() its own dictionary holding the caller's parameters.
    return _open(cgi, dict(keywds))
338
339
def read(handle, validate=True):
    """Parse an NCBI Entrez Utilities XML file into Python objects.

    The XML produced by NCBI's Entrez Utilities is turned into a
    multilevel data structure of Python lists and dictionaries.  Most XML
    files returned by the Entrez Utilities can be handled, provided the
    matching DTD is available; Biopython includes the DTDs for the most
    commonly used Entrez Utilities.

    With validate set to True (the default) the XML is validated against
    the DTD, and an error is raised for tags that the DTD does not
    describe.  With validate set to False such tags are simply skipped.

    Although the result looks like generic Python lists, dictionaries,
    strings and so on, each item is really an instance of a class derived
    from the base type.  That is what makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    from Parser import DataHandler
    # The handler does all the work; hand its result straight back.
    return DataHandler(validate).read(handle)
364
365
def parse(handle, validate=True):
    """Parse an NCBI Entrez Utilities XML file record by record.

    The XML produced by NCBI's Entrez Utilities is turned into a
    multilevel data structure of Python lists and dictionaries.  Use this
    function for XML files that (in Python) can be represented as a list
    of individual records: whereas 'read' reads the complete file and
    returns a single Python list, 'parse' is a generator function that
    returns the records one by one, which makes it particularly useful
    for parsing large files.

    Most XML files returned by the Entrez Utilities can be handled,
    provided the matching DTD is available; Biopython includes the DTDs
    for the most commonly used Entrez Utilities.

    With validate set to True (the default) the XML is validated against
    the DTD, and an error is raised for tags that the DTD does not
    describe.  With validate set to False such tags are simply skipped.

    Although the result looks like generic Python lists, dictionaries,
    strings and so on, each item is really an instance of a class derived
    from the base type.  That is what makes it possible to keep the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    from Parser import DataHandler
    # The handler does all the work; hand its result straight back.
    return DataHandler(validate).parse(handle)
396
397
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it; it is never
    modified, and entries whose value is None are left out of the request.
    If post is true the options are sent as the body of a POST request,
    otherwise they are appended to the URL after a "?".

    The module level tool and email values are added when the options do
    not already provide them.  If no email address is available at all, a
    UserWarning is issued.

    Return a handle to the results.

    Network errors, including urllib2.HTTPError, are not caught here and
    reach the caller as the IOError they derive from.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: at most three queries per second, so wait until a
    # third of a second has passed since the previous request was made.
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Build a private dictionary without the None-valued entries.  Working
    # on a copy keeps the caller's dictionary untouched and prevents the
    # tool/email values of one call from leaking into the next one.
    query = dict((key, value) for key, value in (params or {}).items()
                 if value is not None)

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default).
    if "tool" not in query:
        query["tool"] = tool

    # Tell Entrez who we are.
    if "email" not in query:
        if email is not None:
            query["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # doseq=True turns a sequence value into one key=value pair per item.
    options = urllib.urlencode(query, doseq=True)

    if post:
        # HTTP POST: the encoded options travel in the request body.
        handle = urllib2.urlopen(cgi, data=options)
    else:
        # HTTP GET: the encoded options become the query string.
        handle = urllib2.urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)

# Time of the previous request, read and updated by the rate limiter above.
_open.previous = 0
460
461
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# name and empty parameter list are inferred from the call below -- confirm.
def _test():
    """Run the module's doctests (PRIVATE)."""
    # A parenthesised single argument prints the same text under Python 2.
    print("Running doctests...")
    import doctest
    doctest.testmod()
    print("Done")


# Run the doctests when this file is executed as a script.
if __name__ == "__main__":
    _test()
471