1
2
3
4
5
6 """Parser for XML results returned by NCBI's Entrez Utilities. This
7 parser is used by the read() function in Bio.Entrez, and is not intended
8 to be used directly.
9 """
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 import os.path
39 import urlparse
40 import urllib
41 import warnings
42 from xml.parsers import expat
43
44
45
46
47
50 text = int.__repr__(self)
51 try:
52 attributes = self.attributes
53 except AttributeError:
54 return text
55 return "IntegerElement(%s, attributes=%s)" % (text, repr(attributes))
56
57
60 text = str.__repr__(self)
61 try:
62 attributes = self.attributes
63 except AttributeError:
64 return text
65 return "StringElement(%s, attributes=%s)" % (text, repr(attributes))
66
67
70 text = unicode.__repr__(self)
71 try:
72 attributes = self.attributes
73 except AttributeError:
74 return text
75 return "UnicodeElement(%s, attributes=%s)" % (text, repr(attributes))
76
77
80 text = list.__repr__(self)
81 try:
82 attributes = self.attributes
83 except AttributeError:
84 return text
85 return "ListElement(%s, attributes=%s)" % (text, repr(attributes))
86
87
90 text = dict.__repr__(self)
91 try:
92 attributes = self.attributes
93 except AttributeError:
94 return text
95 return "DictElement(%s, attributes=%s)" % (text, repr(attributes))
96
97
98
99
100
107
113
115 text = dict.__repr__(self)
116 try:
117 attributes = self.attributes
118 except AttributeError:
119 return text
120 return "DictElement(%s, attributes=%s)" % (text, repr(attributes))
121
122
126
128 return "Failed to parse the XML data (%s). Please make sure that the input data are in XML format." % self.msg
129
130
134
136 return "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted." % self.msg
137
138
140 """Validating parsers raise this error if the parser finds a tag in the XML that is not defined in the DTD. Non-validating parsers do not raise this error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating parsers by default (see those functions for more information)."""
143
145 return "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False." % self.name
146
147
149
150 home = os.path.expanduser('~')
151 local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs')
152 del home
153
154 from Bio import Entrez
155 global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
156 del Entrez
157
159 self.stack = []
160 self.errors = []
161 self.integers = []
162 self.strings = []
163 self.lists = []
164 self.dictionaries = []
165 self.structures = {}
166 self.items = []
167 self.dtd_urls = []
168 self.validating = validate
169 self.parser = expat.ParserCreate(namespace_separator=" ")
170 self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
171 self.parser.XmlDeclHandler = self.xmlDeclHandler
172
def read(self, handle):
    """Set up the parser and let it parse the XML results.

    The whole of ``handle`` is fed to ``self.parser`` in one go and the
    object stored in ``self.object`` is returned.

    Raises:
        IOError: if ``handle`` reports itself as closed.
        CorruptedXMLError: if expat fails while a StartElementHandler is
            installed on the parser.
        NotXMLError: if expat fails, or no result was produced, while no
            StartElementHandler is installed.
        RuntimeError: if parsing finished without error but no result
            object was produced although a StartElementHandler is
            installed.
    """
    # Handles wrapped in a class named 'EvilHandleHack' are unwrapped so
    # that expat reads from the underlying handle directly.
    if handle.__class__.__name__ == 'EvilHandleHack':
        handle = handle._handle
    if hasattr(handle, "closed") and handle.closed:
        # Fail early with a clear message rather than handing a closed
        # handle to expat.
        raise IOError("Can't parse a closed handle")
    try:
        self.parser.ParseFile(handle)
    except expat.ExpatError as e:
        # NOTE(review): StartElementHandler is presumably installed once
        # the XML declaration has been seen (the handler that does so is
        # not part of this block) -- confirm. If it is set, the input
        # looked like XML and is reported as corrupted; otherwise it is
        # reported as not being XML at all.
        if self.parser.StartElementHandler:
            raise CorruptedXMLError(e)
        else:
            raise NotXMLError(e)
    try:
        return self.object
    except AttributeError:
        # Parsing raised no error, yet no result object was stored.
        if self.parser.StartElementHandler:
            raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
        else:
            raise NotXMLError("XML declaration not found")
207
def parse(self, handle):
    """Parse ``handle`` incrementally, yielding records as they complete.

    The input is read in blocks of BLOCK bytes and fed to ``self.parser``.
    After each block, every record but the last is popped off the
    top-level list on ``self.stack`` and yielded. At end of input the
    contents of ``self.object`` are yielded and the parser is finalised
    and released.

    Raises:
        CorruptedXMLError: if expat fails while a StartElementHandler is
            installed, or if the input ends while ``self.stack`` is
            still non-empty.
        NotXMLError: if expat fails, or no result was produced, while no
            StartElementHandler is installed.
        RuntimeError: if the input ended cleanly but no result object
            was produced although a StartElementHandler is installed.
        ValueError: if the top-level object on the stack is not a list.
    """
    BLOCK = 1024
    while True:
        # Read in another block of the file...
        text = handle.read(BLOCK)
        if not text:
            # End of input.
            if self.stack:
                # Elements are still open, so the data stopped early.
                # The exception formats its message from the argument,
                # so one must always be supplied.
                raise CorruptedXMLError("Premature end of XML stream")
            # Only the attribute lookup is guarded, so that an
            # AttributeError thrown into the generator at the yield
            # below is not mistaken for a missing result object.
            try:
                remaining = self.object
            except AttributeError:
                if self.parser.StartElementHandler:
                    raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
                else:
                    raise NotXMLError("XML declaration not found")
            for record in remaining:
                yield record
            # Tell expat that the document is complete, then release it.
            self.parser.Parse("", True)
            self.parser = None
            return

        try:
            self.parser.Parse(text, False)
        except expat.ExpatError as e:
            # NOTE(review): StartElementHandler is presumably installed
            # once the XML declaration has been seen -- confirm.
            if self.parser.StartElementHandler:
                raise CorruptedXMLError(e)
            else:
                raise NotXMLError(e)

        if not self.stack:
            # Nothing has been opened yet; keep reading.
            continue

        records = self.stack[0]
        if not isinstance(records, list):
            raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
        # Always keep the last entry back: it may belong to an element
        # that is still being filled in by the parser.
        while len(records) > 1:
            record = records.pop(0)
            yield record
259
267
269 raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
270
272 self.content = ""
273 if name in self.lists:
274 object = ListElement()
275 elif name in self.dictionaries:
276 object = DictionaryElement()
277 elif name in self.structures:
278 object = StructureElement(self.structures[name])
279 elif name in self.items:
280 name = str(attrs["Name"])
281 del attrs["Name"]
282 itemtype = str(attrs["Type"])
283 del attrs["Type"]
284 if itemtype=="Structure":
285 object = DictionaryElement()
286 elif name in ("ArticleIds", "History"):
287 object = StructureElement(["pubmed", "medline"])
288 elif itemtype=="List":
289 object = ListElement()
290 else:
291 object = StringElement()
292 object.itemname = name
293 object.itemtype = itemtype
294 elif name in self.strings + self.errors + self.integers:
295 self.attributes = attrs
296 return
297 else:
298
299 if self.validating:
300 raise ValidationError(name)
301 else:
302
303 object = ""
304 if object!="":
305 object.tag = name
306 if attrs:
307 object.attributes = dict(attrs)
308 if len(self.stack)!=0:
309 current = self.stack[-1]
310 try:
311 current.append(object)
312 except AttributeError:
313 current[name] = object
314 self.stack.append(object)
315
357
359 self.content += content
360
362 """This callback function is called for each element declaration:
363 <!ELEMENT name (...)>
364 encountered in a DTD. The purpose of this function is to determine
365 whether this element should be regarded as a string, integer, list,
366 dictionary, structure, or error."""
367 if name.upper()=="ERROR":
368 self.errors.append(name)
369 return
370 if name=='Item' and model==(expat.model.XML_CTYPE_MIXED,
371 expat.model.XML_CQUANT_REP,
372 None, ((expat.model.XML_CTYPE_NAME,
373 expat.model.XML_CQUANT_NONE,
374 'Item',
375 ()
376 ),
377 )
378 ):
379
380
381 self.items.append(name)
382 return
383
384 while (model[0] in (expat.model.XML_CTYPE_SEQ,
385 expat.model.XML_CTYPE_CHOICE)
386 and model[1] in (expat.model.XML_CQUANT_NONE,
387 expat.model.XML_CQUANT_OPT)
388 and len(model[3])==1):
389 model = model[3][0]
390
391 if model[0] in (expat.model.XML_CTYPE_MIXED,
392 expat.model.XML_CTYPE_EMPTY):
393 self.strings.append(name)
394 return
395
396 if (model[0] in (expat.model.XML_CTYPE_CHOICE,
397 expat.model.XML_CTYPE_SEQ) and
398 model[1] in (expat.model.XML_CQUANT_PLUS,
399 expat.model.XML_CQUANT_REP)):
400 self.lists.append(name)
401 return
402
403
404
405
406
407
408
409 single = []
410 multiple = []
411
412
413
414
415 def count(model):
416 quantifier, name, children = model[1:]
417 if name is None:
418 if quantifier in (expat.model.XML_CQUANT_PLUS,
419 expat.model.XML_CQUANT_REP):
420 for child in children:
421 multiple.append(child[2])
422 else:
423 for child in children:
424 count(child)
425 elif name.upper()!="ERROR":
426 if quantifier in (expat.model.XML_CQUANT_NONE,
427 expat.model.XML_CQUANT_OPT):
428 single.append(name)
429 elif quantifier in (expat.model.XML_CQUANT_PLUS,
430 expat.model.XML_CQUANT_REP):
431 multiple.append(name)
432 count(model)
433 if len(single)==0 and len(multiple)==1:
434 self.lists.append(name)
435 elif len(multiple)==0:
436 self.dictionaries.append(name)
437 else:
438 self.structures.update({name: multiple})
439
456
458 """The purpose of this function is to load the DTD locally, instead
459 of downloading it from the URL specified in the XML. Using the local
460 DTD results in much faster parsing. If the DTD is not found locally,
461 we try to download it. If new DTDs become available from NCBI,
462 putting them in Bio/Entrez/DTDs will allow the parser to see them."""
463 urlinfo = urlparse.urlparse(systemId)
464
465
466 if urlinfo[0]=='http':
467
468 url = systemId
469 elif urlinfo[0]=='':
470
471
472 try:
473 url = self.dtd_urls[-1]
474 except IndexError:
475
476
477 source = "http://www.ncbi.nlm.nih.gov/dtd/"
478 else:
479 source = os.path.dirname(url)
480
481 url = source.rstrip("/") + "/" + systemId
482 self.dtd_urls.append(url)
483
484 location, filename = os.path.split(systemId)
485 handle = self.open_dtd_file(filename)
486 if not handle:
487
488
489 message = """\
490 Unable to load DTD file %s.
491
492 Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
493 Though most of NCBI's DTD files are included in the Biopython distribution,
494 sometimes you may find that a particular DTD file is missing. While we can
495 access the DTD file through the internet, the parser is much faster if the
496 required DTD files are available locally.
497
498 For this purpose, please download %s from
499
500 %s
501
502 and save it either in directory
503
504 %s
505
506 or in directory
507
508 %s
509
510 in order for Bio.Entrez to find it.
511
512 Alternatively, you can save %s in the directory
513 Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.
514
515 Please also inform the Biopython developers about this missing DTD, by
516 reporting a bug on http://bugzilla.open-bio.org/ or sign up to our mailing
517 list and emailing us, so that we can include it with the next release of
518 Biopython.
519
520 Proceeding to access the DTD file through the internet...
521 """ % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename)
522 warnings.warn(message)
523 try:
524 handle = urllib.urlopen(url)
525 except IOError:
526 raise RuntimeException("Failed to access %s at %s" % (filename, url))
527
528 parser = self.parser.ExternalEntityParserCreate(context)
529 parser.ElementDeclHandler = self.elementDecl
530 parser.ParseFile(handle)
531 handle.close()
532 self.dtd_urls.pop()
533 return 1
534