1
2
3
4
5
6 """Parser for XML results returned by NCBI's Entrez Utilities. This
7 parser is used by the read() function in Bio.Entrez, and is not intended
8 to be used directly.
9 """
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 import os.path
39 import urlparse
40 import urllib
41 import warnings
42 from xml.parsers import expat
43
44
45
46
49 text = int.__repr__(self)
50 try:
51 attributes = self.attributes
52 except AttributeError:
53 return text
54 return "IntegerElement(%s, attributes=%s)" % (text, repr(attributes))
55
58 text = str.__repr__(self)
59 try:
60 attributes = self.attributes
61 except AttributeError:
62 return text
63 return "StringElement(%s, attributes=%s)" % (text, repr(attributes))
64
67 text = unicode.__repr__(self)
68 try:
69 attributes = self.attributes
70 except AttributeError:
71 return text
72 return "UnicodeElement(%s, attributes=%s)" % (text, repr(attributes))
73
76 text = list.__repr__(self)
77 try:
78 attributes = self.attributes
79 except AttributeError:
80 return text
81 return "ListElement(%s, attributes=%s)" % (text, repr(attributes))
82
85 text = dict.__repr__(self)
86 try:
87 attributes = self.attributes
88 except AttributeError:
89 return text
90 return "DictElement(%s, attributes=%s)" % (text, repr(attributes))
91
92
93
94
107 text = dict.__repr__(self)
108 try:
109 attributes = self.attributes
110 except AttributeError:
111 return text
112 return "DictElement(%s, attributes=%s)" % (text, repr(attributes))
113
114
119 return "Failed to parse the XML data (%s). Please make sure that the input data are in XML format." % self.msg
120
121
126 return "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted." % self.msg
127
128
130 """Validating parsers raise this error if the parser finds a tag in the XML that is not defined in the DTD. Non-validating parsers do not raise this error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating parsers by default (see those functions for more information)"""
134 return "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False." % self.name
135
136
138
139 home = os.path.expanduser('~')
140 local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs')
141 del home
142
143 from Bio import Entrez
144 global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
145 del Entrez
146
148 self.stack = []
149 self.errors = []
150 self.integers = []
151 self.strings = []
152 self.lists = []
153 self.dictionaries = []
154 self.structures = {}
155 self.items = []
156 self.dtd_urls = []
157 self.validating = validate
158 self.parser = expat.ParserCreate(namespace_separator=" ")
159 self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
160 self.parser.XmlDeclHandler = self.xmlDeclHandler
161
def read(self, handle):
    """Set up the parser and let it parse the XML results.

    Arguments:
     - handle - file-like object holding the XML data; it is handed
       unchanged to the expat parser's ParseFile method.

    Returns the object assembled during parsing (self.object).

    Raises:
     - IOError if the handle reports that it is already closed.
     - CorruptedXMLError if expat fails while a StartElementHandler is
       installed on the parser.
     - NotXMLError if expat fails while no StartElementHandler is
       installed, or if parsing ends without a result and without a
       StartElementHandler.
     - RuntimeError if parsing ends without a result even though a
       StartElementHandler is installed.
    """
    if hasattr(handle, "closed") and handle.closed:
        # Fail early with a clear message instead of letting expat report
        # an obscure error for a handle that can no longer be read.
        raise IOError("Can't parse a closed handle")
    try:
        self.parser.ParseFile(handle)
    except expat.ExpatError as e:
        if self.parser.StartElementHandler:
            # An element handler is installed, so parsing had started
            # properly (presumably the XML declaration was seen); the
            # failure therefore points at damaged XML.
            raise CorruptedXMLError(e)
        # No element handler was ever installed: the input never looked
        # like XML to begin with.
        raise NotXMLError(e)
    try:
        return self.object
    except AttributeError:
        # self.object was never set although expat reported no error.
        if self.parser.StartElementHandler:
            # Handlers were active yet no result was produced; this is
            # treated as an internal parser problem.
            raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
        raise NotXMLError("XML declaration not found")
192
def parse(self, handle):
    """Parse the XML data incrementally, yielding one record at a time.

    The handle is read in fixed-size blocks that are fed to the expat
    parser. After each block, every record of the top-level list except
    the last one is yielded. Once the handle is exhausted, the records
    in self.object are yielded, the parser is finalised and released.

    Arguments:
     - handle - file-like object with a read(size) method that returns
       an empty value at end of data.

    Raises:
     - CorruptedXMLError if expat fails while a StartElementHandler is
       installed, or if the data end while elements are still open.
     - NotXMLError if expat fails while no StartElementHandler is
       installed, or if no result exists at end of data and no
       StartElementHandler is installed.
     - RuntimeError if no result exists at end of data even though a
       StartElementHandler is installed.
     - ValueError if the top-level XML element is not a list.
    """
    BLOCK = 1024
    while True:
        text = handle.read(BLOCK)
        if not text:
            # End of the input data.
            if self.stack:
                # Elements are still open on the stack, so the data
                # stopped in the middle of the document.
                raise CorruptedXMLError("Premature end of XML stream")
            try:
                remaining = self.object
            except AttributeError:
                # No result object was ever created.
                if self.parser.StartElementHandler:
                    raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
                raise NotXMLError("XML declaration not found")
            for record in remaining:
                yield record
            # Tell expat that the document is complete, then drop the
            # parser so that it cannot be reused.
            self.parser.Parse("", True)
            self.parser = None
            return

        try:
            self.parser.Parse(text, False)
        except expat.ExpatError as e:
            if self.parser.StartElementHandler:
                # Parsing had started properly, so the XML is damaged.
                raise CorruptedXMLError(e)
            # No element handler installed: the input is not XML.
            raise NotXMLError(e)

        if not self.stack:
            # Nothing is open on the stack after this block, so there
            # are no partial records to hand out yet.
            continue

        records = self.stack[0]
        if not isinstance(records, list):
            raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
        # Keep the final entry back: presumably it is still being filled
        # in by the parser and only complete once a later record starts.
        while len(records) > 1:
            yield records.pop(0)
244
252
254 raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
255
257 self.content = ""
258 if name in self.lists:
259 object = ListElement()
260 elif name in self.dictionaries:
261 object = DictionaryElement()
262 elif name in self.structures:
263 object = StructureElement(self.structures[name])
264 elif name in self.items:
265 name = str(attrs["Name"])
266 del attrs["Name"]
267 itemtype = str(attrs["Type"])
268 del attrs["Type"]
269 if itemtype=="Structure":
270 object = DictionaryElement()
271 elif name in ("ArticleIds", "History"):
272 object = StructureElement(["pubmed", "medline"])
273 elif itemtype=="List":
274 object = ListElement()
275 else:
276 object = StringElement()
277 object.itemname = name
278 object.itemtype = itemtype
279 elif name in self.strings + self.errors + self.integers:
280 self.attributes = attrs
281 return
282 else:
283
284 if self.validating:
285 raise ValidationError(name)
286 else:
287
288 object = ""
289 if object!="":
290 object.tag = name
291 if attrs:
292 object.attributes = dict(attrs)
293 if len(self.stack)!=0:
294 current = self.stack[-1]
295 try:
296 current.append(object)
297 except AttributeError:
298 current[name] = object
299 self.stack.append(object)
300
342
344 self.content += content
345
347 """This callback function is called for each element declaration:
348 <!ELEMENT name (...)>
349 encountered in a DTD. The purpose of this function is to determine
350 whether this element should be regarded as a string, integer, list,
351 dictionary, structure, or error."""
352 if name.upper()=="ERROR":
353 self.errors.append(name)
354 return
355 if name=='Item' and model==(expat.model.XML_CTYPE_MIXED,
356 expat.model.XML_CQUANT_REP,
357 None, ((expat.model.XML_CTYPE_NAME,
358 expat.model.XML_CQUANT_NONE,
359 'Item',
360 ()
361 ),
362 )
363 ):
364
365
366 self.items.append(name)
367 return
368
369 while (model[0] in (expat.model.XML_CTYPE_SEQ,
370 expat.model.XML_CTYPE_CHOICE)
371 and model[1] in (expat.model.XML_CQUANT_NONE,
372 expat.model.XML_CQUANT_OPT)
373 and len(model[3])==1):
374 model = model[3][0]
375
376 if model[0] in (expat.model.XML_CTYPE_MIXED,
377 expat.model.XML_CTYPE_EMPTY):
378 self.strings.append(name)
379 return
380
381 if (model[0] in (expat.model.XML_CTYPE_CHOICE,
382 expat.model.XML_CTYPE_SEQ) and
383 model[1] in (expat.model.XML_CQUANT_PLUS,
384 expat.model.XML_CQUANT_REP)):
385 self.lists.append(name)
386 return
387
388
389
390
391
392
393
394 single = []
395 multiple = []
396
397
398
399 def count(model):
400 quantifier, name, children = model[1:]
401 if name==None:
402 if quantifier in (expat.model.XML_CQUANT_PLUS,
403 expat.model.XML_CQUANT_REP):
404 for child in children:
405 multiple.append(child[2])
406 else:
407 for child in children:
408 count(child)
409 elif name.upper()!="ERROR":
410 if quantifier in (expat.model.XML_CQUANT_NONE,
411 expat.model.XML_CQUANT_OPT):
412 single.append(name)
413 elif quantifier in (expat.model.XML_CQUANT_PLUS,
414 expat.model.XML_CQUANT_REP):
415 multiple.append(name)
416 count(model)
417 if len(single)==0 and len(multiple)==1:
418 self.lists.append(name)
419 elif len(multiple)==0:
420 self.dictionaries.append(name)
421 else:
422 self.structures.update({name: multiple})
423
440
442 """The purpose of this function is to load the DTD locally, instead
443 of downloading it from the URL specified in the XML. Using the local
444 DTD results in much faster parsing. If the DTD is not found locally,
445 we try to download it. If new DTDs become available from NCBI,
446 putting them in Bio/Entrez/DTDs will allow the parser to see them."""
447 urlinfo = urlparse.urlparse(systemId)
448
449
450 if urlinfo[0]=='http':
451
452 url = systemId
453 elif urlinfo[0]=='':
454
455
456 try:
457 url = self.dtd_urls[-1]
458 except IndexError:
459
460
461 source = "http://www.ncbi.nlm.nih.gov/dtd/"
462 else:
463 source = os.path.dirname(url)
464 url = os.path.join(source, systemId)
465 self.dtd_urls.append(url)
466
467 location, filename = os.path.split(systemId)
468 handle = self.open_dtd_file(filename)
469 if not handle:
470
471
472 message = """\
473 Unable to load DTD file %s.
474
475 Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
476 Though most of NCBI's DTD files are included in the Biopython distribution,
477 sometimes you may find that a particular DTD file is missing. While we can
478 access the DTD file through the internet, the parser is much faster if the
479 required DTD files are available locally.
480
481 For this purpose, please download %s from
482
483 %s
484
485 and save it either in directory
486
487 %s
488
489 or in directory
490
491 %s
492
493 in order for Bio.Entrez to find it.
494
495 Alternatively, you can save %s in the directory
496 Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.
497
498 Please also inform the Biopython developers about this missing DTD, by
499 reporting a bug on http://bugzilla.open-bio.org/ or sign up to our mailing
500 list and emailing us, so that we can include it with the next release of
501 Biopython.
502
503 Proceeding to access the DTD file through the internet...
504 """ % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename)
505 warnings.warn(message)
506 try:
507 handle = urllib.urlopen(url)
508 except IOError:
509 raise RuntimeException("Failed to access %s at %s" % (filename, url))
510
511 parser = self.parser.ExternalEntityParserCreate(context)
512 parser.ElementDeclHandler = self.elementDecl
513 parser.ParseFile(handle)
514 handle.close()
515 self.dtd_urls.pop()
516 return 1
517