1
2
3
4
5
6
7 """Provides code to access NCBI over the WWW.
8
9 The main Entrez web page is available at:
10 http://www.ncbi.nlm.nih.gov/Entrez/
11
12 A list of the Entrez utilities is available at:
13 http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
14
15 Variables:
16 email Set the Entrez email parameter (default is not set).
17 tool Set the Entrez tool parameter (default is biopython).
18
19 Functions:
20 efetch Retrieves records in the requested format from a list of one or
21 more primary IDs or from the user's environment
22 epost Posts a file containing a list of primary IDs for future use in
23 the user's environment to use with subsequent search strategies
24 esearch Searches and retrieves primary IDs (for use in EFetch, ELink,
25 and ESummary) and term translations and optionally retains
26 results for future use in the user's environment.
27 elink Checks for the existence of an external or Related Articles link
28 from a list of one or more primary IDs. Retrieves primary IDs
29 and relevancy scores for links to Entrez databases or Related
30 Articles; creates a hyperlink to the primary LinkOut provider
31 for a specific ID and database, or lists LinkOut URLs
32 and Attributes for multiple IDs.
33 einfo Provides field index term counts, last update, and available
34 links for each database.
35 esummary Retrieves document summaries from a list of primary IDs or from
36 the user's environment.
37 egquery Provides Entrez database counts in XML for a single search
38 using Global Query.
39 espell Retrieves spelling suggestions.
40
41 read Parses the XML results returned by any of the above functions.
42 Typical usage is:
43
44 >>> from Bio import Entrez
45 >>> Entrez.email = "Your.Name.Here@example.org"
46 >>> handle = Entrez.einfo() # or esearch, efetch, ...
47 >>> record = Entrez.read(handle)
48 >>> handle.close()
49
50 where record is now a Python dictionary or list.
51
52 parse Parses the XML results returned by those of the above functions
53 which can return multiple records - such as efetch, esummary
54 and elink. Typical usage is:
55
56 >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml")
57 >>> records = Entrez.parse(handle)
58 >>> for record in records:
59 ... # each record is a Python dictionary or list.
60 ... print record['MedlineCitation']['Article']['ArticleTitle']
61 Biopython: freely available Python tools for computational molecular biology and bioinformatics.
62 PDB file parser and structure class implemented in Python.
63 >>> handle.close()
64
65 This function is appropriate only if the XML file contains
66 multiple records, and is particularly useful for large files.
67
68 _open Internally used function.
69
70 """
71 import urllib, urllib2, time, warnings
72 import os.path
73
74 from Bio._py3k import _binary_to_string_handle
75
# Address sent to NCBI as the "email" request parameter when a call does not
# supply its own.  None means "not set"; _open() then issues a UserWarning.
email = None

# Name sent to NCBI as the "tool" request parameter when a call does not
# supply its own.
tool = "biopython"
78
79
80
def epost(db, **keywds):
    """Post a list of identifiers for later use and return a results handle.

    EPost sends a file containing a list of UIs so that it is kept in the
    user's environment and can be used with subsequent search strategies.

    db is the name of the database; any further keyword arguments are
    passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    query = dict(db=db)
    query.update(keywds)
    # The request is sent with post=True, unlike the other E-utility
    # wrappers in this module.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi',
                 query, post=True)
98
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    db is the name of the database.  The optional "id" keyword may be a
    single string, or a list or tuple of identifiers (strings or integers),
    which is sent to the server as one comma separated value.  All other
    keyword arguments are passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print handle.readline().strip()
    LOCUS       AY851612                 892 bp    DNA     linear   PLN 10-APR-2007
    >>> handle.close()

    Warning: The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    ids = variables.get("id")
    if isinstance(ids, (list, tuple)):
        # _open() encodes the parameters with doseq=True, which would turn a
        # sequence into one "id=..." pair per element.  Collapse it into a
        # single comma separated value instead; str() lets callers pass
        # integer identifiers as well as strings.
        variables["id"] = ",".join([str(uid) for uid in ids])
    return _open(cgi, variables)
136
def esearch(db, term, **keywds):
    """ESearch runs an Entrez search and returns a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    db is the name of the database and term is the search query; any
    further keyword arguments are passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    variables = {'db': db,
                 'term': term}
    variables.update(keywds)
    return _open(cgi, variables)
171
def elink(**keywds):
    """ELink checks for linked external articles and returns a handle.

    ELink checks for the existence of an external or Related Articles link
    from a list of one or more primary IDs;  retrieves IDs and relevancy
    scores for links to Entrez databases or Related Articles; creates a
    hyperlink to the primary LinkOut provider for a specific ID and
    database, or lists LinkOut URLs and attributes for multiple IDs.

    All keyword arguments are passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example finds articles related to the Biopython application
    note's entry in the PubMed database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> pmid = "19304878"
    >>> handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["LinkSetDb"][0]["LinkName"]
    pubmed_pubmed
    >>> linked = [link["Id"] for link in record[0]["LinkSetDb"][0]["Link"]]
    >>> "17121776" in linked
    True

    This is explained in much more detail in the Biopython Tutorial.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    variables = {}
    variables.update(keywds)
    return _open(cgi, variables)
209
def einfo(**keywds):
    """EInfo returns a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    All keyword arguments are passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    variables = {}
    variables.update(keywds)
    return _open(cgi, variables)
236
def esummary(**keywds):
    """ESummary retrieves document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    All keyword arguments are passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print record[0]["Id"]
    30367
    >>> print record[0]["Title"]
    Computational biology and chemistry

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    variables = {}
    variables.update(keywds)
    return _open(cgi, variables)
267
def egquery(**keywds):
    """EGQuery provides Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    All keyword arguments are passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print row["Count"] > 60
    True

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    variables = {}
    variables.update(keywds)
    return _open(cgi, variables)
300
def espell(**keywds):
    """ESpell retrieves spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    All keyword arguments are passed on unchanged as request parameters.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print record["Query"]
    biopythooon
    >>> print record["CorrectedQuery"]
    biopython

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    variables = {}
    variables.update(keywds)
    return _open(cgi, variables)
328
def read(handle, validate=True):
    """Parse one Entrez Utilities XML file into Python objects.

    The XML read from handle, as produced by NCBI's Entrez Utilities, is
    turned into a multilevel data structure of Python lists and
    dictionaries.  Most XML files returned by NCBI's Entrez Utilities can
    be parsed by this function, provided its DTD is available.  Biopython
    includes the DTDs for most commonly used Entrez Utilities.

    With validate set to True (the default) the parser validates the XML
    file against the DTD and raises an error if the file contains tags
    that are not represented in the DTD.  With validate set to False the
    parser simply skips such tags.

    Although the result looks like generic Python lists, dictionaries,
    strings, and so on, each of these is actually a class derived from the
    base type.  This makes it possible to store the attributes (if any) of
    each element in a dictionary my_element.attributes, and the tag name
    in my_element.tag.
    """
    from Parser import DataHandler
    return DataHandler(validate).read(handle)
353
def parse(handle, validate=True):
    """Parse a multi-record Entrez Utilities XML file record by record.

    The XML read from handle, as produced by NCBI's Entrez Utilities, is
    turned into a multilevel data structure of Python lists and
    dictionaries.  This function is meant for XML files that (in Python)
    can be represented as a list of individual records.  Whereas 'read'
    reads the complete file and returns a single Python list, 'parse'
    returns the records one by one, which makes it particularly useful
    for parsing large files.

    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available.  Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    With validate set to True (the default) the parser validates the XML
    file against the DTD and raises an error if the file contains tags
    that are not represented in the DTD.  With validate set to False the
    parser simply skips such tags.

    Although the result looks like generic Python lists, dictionaries,
    strings, and so on, each of these is actually a class derived from the
    base type.  This makes it possible to store the attributes (if any) of
    each element in a dictionary my_element.attributes, and the tag name
    in my_element.tag.
    """
    from Parser import DataHandler
    return DataHandler(validate).parse(handle)
384
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.

    Arguments:
     - cgi    - the URL for the cgi script to access.
     - params - a dictionary with the options to pass to it (default: no
                options).  Entries whose value is None are left out.  The
                dictionary passed in is not modified.
     - post   - if True the options are sent as the request data (HTTP
                POST), otherwise they are appended to the URL after a "?".

    The module level ``tool`` and ``email`` values are added to the options
    when the caller did not supply them; if no email address is available
    a UserWarning is issued instead.

    Returns the handle from urllib2.urlopen after passing it through
    _binary_to_string_handle.

    Does some simple error checking, and will raise an IOError if it
    encounters one (errors from urllib2.urlopen, such as urllib2.HTTPError,
    propagate to the caller unchanged).

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # Rate limiting: keep at least `delay` seconds between the times
    # recorded for consecutive requests, sleeping if we are too early.
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Work on a private copy without the None-valued entries.  This avoids
    # sharing state through a mutable default argument, leaves the caller's
    # dictionary untouched, and never deletes from a dictionary while
    # iterating over it.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default).
    if "tool" not in params:
        params["tool"] = tool

    # Tell Entrez who we are.
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
from Bio import Entrez
Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # doseq=True encodes a sequence value as one key=value pair per element.
    options = urllib.urlencode(params, doseq=True)

    # No try/except here: catching urllib2.HTTPError only to re-raise the
    # same exception object added nothing, so errors simply propagate.
    if post:
        handle = urllib2.urlopen(cgi, data=options)
    else:
        handle = urllib2.urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)


# Time recorded for the most recent request, used by the rate limiter above.
_open.previous = 0
447
448
def _test():
    """Run the module's doctests (PRIVATE)."""
    # print(...) with a single string argument gives the same output whether
    # it is parsed as a Python 2 print statement or a Python 3 function call.
    print("Runing doctests...")
    import doctest
    doctest.testmod()
    print("Done")


if __name__ == "__main__":
    _test()
458