Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. This 
  7  parser is used by the read() function in Bio.Entrez, and is not intended 
  8  to be used directly. 
  9  """ 
 10   
 11  # The question is how to represent an XML file as Python objects. Some 
 12  # XML files returned by NCBI look like lists, others look like dictionaries, 
 13  # and others look like a mix of lists and dictionaries. 
 14  # 
 15  # My approach is to classify each possible element in the XML as a plain 
 16  # string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  # dictionary where the same key can occur multiple times; in Python, it is 
 18  # represented as a dictionary where that key occurs once, pointing to a list 
 19  # of values found in the XML file. 
 20  # 
 21  # The parser then goes through the XML and creates the appropriate Python 
 22  # object for each element. The different levels encountered in the XML are 
 23  # preserved on the Python side. So a subelement of a subelement of an element 
 24  # is a value in a dictionary that is stored in a list which is a value in 
 25  # some other dictionary (or a value in a list which itself belongs to a list 
 26  # which is a value in a dictionary, and so on). Attributes encountered in 
 27  # the XML are stored as a dictionary in a member .attributes of each element, 
 28  # and the tag name is saved in a member .tag. 
 29  # 
 30  # To decide which kind of Python object corresponds to each element in the 
 31  # XML, the parser analyzes the DTD referred at the top of (almost) every 
 32  # XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  # written solution, since the number of DTDs is rather large and their 
 34  # contents may change over time. About half the code in this parser deals 
 35  # with parsing the DTD, and the other half with the XML itself. 
 36   
 37   
 38  import os.path 
 39  import warnings 
 40  from xml.parsers import expat 
 41   
 42  #Importing these functions with leading underscore as not intended for reuse 
 43  from Bio._py3k import urlopen as _urlopen 
 44  from Bio._py3k import urlparse as _urlparse 
 45  from Bio._py3k import unicode 
 46   
 47  # The following five classes are used to add a member .attributes to integers, 
 48  # strings, unicode strings, lists, and dictionaries, respectively. 
 49   
 50   
class IntegerElement(int):
    """An ``int`` subclass whose instances may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain int repr, annotated with the attributes when set."""
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a normal int.
            return plain
        return "IntegerElement(%s, attributes=%r)" % (plain, self.attributes)
59 60
class StringElement(str):
    """A ``str`` subclass whose instances may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain str repr, annotated with the attributes when set."""
        plain = str.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # No attributes were attached: look exactly like a normal string.
            result = plain
        else:
            result = "StringElement(%s, attributes=%s)" % (plain, repr(extra))
        return result
69 70
class UnicodeElement(unicode):
    """A ``unicode`` subclass whose instances may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain unicode repr, annotated with the attributes when set."""
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look like a normal unicode string.
            return plain
        return "UnicodeElement(%s, attributes=%s)" % (plain, repr(self.attributes))
79 80
class ListElement(list):
    """A ``list`` subclass whose instances may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain list repr, annotated with the attributes when set."""
        plain = list.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # No attributes were attached: look exactly like a normal list.
            result = plain
        else:
            result = "ListElement(%s, attributes=%s)" % (plain, repr(extra))
        return result
89 90
class DictionaryElement(dict):
    """A ``dict`` subclass whose instances may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain dict repr, annotated with the attributes when set.

        The annotated form is labelled "DictElement".
        """
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a normal dict.
            return plain
        return "DictElement(%s, attributes=%s)" % (plain, repr(self.attributes))
99 100 101 # A StructureElement is like a dictionary, but some of its keys can have 102 # multiple values associated with it. These values are stored in a list 103 # under each key.
class StructureElement(dict):
    """A dictionary in which selected keys accumulate their values in a list.

    The keys passed to the constructor start out mapped to an empty list, and
    every later assignment to one of them appends to that list instead of
    replacing it. All other keys behave as in an ordinary dictionary.
    """

    def __init__(self, keys):
        """Create the structure, seeding each of *keys* with an empty list."""
        dict.__init__(self)
        self.listkeys = keys
        for list_key in keys:
            dict.__setitem__(self, list_key, [])

    def __setitem__(self, key, value):
        """Append *value* for a list key; store it directly for any other key."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the plain dict repr, annotated with the attributes when set.

        The annotated form is labelled "DictElement".
        """
        plain = dict.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # No attributes were attached: look exactly like a normal dict.
            result = plain
        else:
            result = "DictElement(%s, attributes=%s)" % (plain, repr(extra))
        return result
124 125
class NotXMLError(ValueError):
    """Error reporting that the input data do not appear to be XML."""

    def __init__(self, message):
        """Remember *message* (the underlying reason) in ``self.msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing description, embedding the stored reason."""
        template = (
            "Failed to parse the XML data (%s). "
            "Please make sure that the input data are in XML format."
        )
        return template % self.msg
132 133
class CorruptedXMLError(ValueError):
    """Error reporting that XML input could not be parsed, likely corrupted."""

    def __init__(self, message):
        """Remember *message* (the underlying reason) in ``self.msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing description, embedding the stored reason."""
        template = (
            "Failed to parse the XML data (%s). "
            "Please make sure that the input data are not corrupted."
        )
        return template % self.msg
140 141
class ValidationError(ValueError):
    """Raised by validating parsers for an XML tag that the DTD does not define.

    Non-validating parsers do not raise this error. The Bio.Entrez.read and
    Bio.Entrez.parse functions use validating parsers by default (see those
    functions for more information).
    """

    def __init__(self, name):
        """Remember the offending tag *name* in ``self.name``."""
        self.name = name

    def __str__(self):
        """Return the user-facing description, naming the unknown tag."""
        template = (
            "Failed to find tag '%s' in the DTD. "
            "To skip all tags that are not represented in the DTD, "
            "please call Bio.Entrez.read or Bio.Entrez.parse with validate=False."
        )
        return template % self.name
149 150
class DataHandler(object):
    """Expat callback handler that converts Entrez XML into Python objects.

    While the DTD referenced by the XML is parsed, elementDecl() classifies
    every declared element as an error, integer, string, list, dictionary,
    structure or (ESummary) item. While the XML itself is parsed, the
    start/end element handlers use that classification to build nested
    ListElement / DictionaryElement / StructureElement / StringElement /
    IntegerElement objects on ``self.stack``; the most recently completed
    container is kept in ``self.object``.
    """

    # Per-user directory searched first for DTD files.
    home = os.path.expanduser('~')
    local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs')
    del home

    # Directory of DTD files inside the installed Bio.Entrez package.
    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    del Entrez

    def __init__(self, validate):
        """Create the expat parser and the (initially empty) element tables.

        If *validate* is true, an XML tag that is not declared in the DTD
        raises ValidationError; otherwise such tags are skipped.
        """
        self.stack = []
        self.errors = []
        self.integers = []
        self.strings = []
        self.lists = []
        self.dictionaries = []
        self.structures = {}
        self.items = []
        self.dtd_urls = []
        self.validating = validate
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        # The remaining handlers are installed by xmlDeclHandler, so whether
        # StartElementHandler is set tells us if an XML declaration was seen.
        self.parser.XmlDeclHandler = self.xmlDeclHandler

    def _expat_failure(self, error):
        """Return the exception to raise for the expat error *error*."""
        if self.parser.StartElementHandler:
            # We saw the initial <!xml declaration, so we can be sure that
            # we are parsing XML data. Most likely, the XML file is
            # corrupted.
            return CorruptedXMLError(error)
        # We have not seen the initial <!xml declaration, so probably
        # the input data is not in XML format.
        return NotXMLError(error)

    def _missing_object_failure(self):
        """Return the exception to raise when parsing produced no object."""
        if self.parser.StartElementHandler:
            # We saw the initial <!xml declaration, and expat didn't notice
            # any errors, so self.object should be defined. If not, this is
            # a bug.
            return RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
        # We did not see the initial <!xml declaration, so probably
        # the input data is not in XML format.
        return NotXMLError("XML declaration not found")

    @staticmethod
    def _text_element(value):
        """Wrap *value* as a StringElement, or a UnicodeElement if needed."""
        # Convert Unicode strings to plain strings if possible
        try:
            return StringElement(value)
        except UnicodeEncodeError:
            return UnicodeElement(value)

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        Parses the whole of *handle* and returns the resulting object.
        Raises IOError for a closed handle, CorruptedXMLError or NotXMLError
        when expat reports an error, and RuntimeError or NotXMLError when no
        object was produced.
        """
        # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser
        # expects binary data
        if handle.__class__.__name__ == 'EvilHandleHack':
            handle = handle._handle
        if hasattr(handle, "closed") and handle.closed:
            # Should avoid a possible Segmentation Fault, see:
            # http://bugs.python.org/issue4877
            raise IOError("Can't parse a closed handle")
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as e:
            raise self._expat_failure(e)
        try:
            return self.object
        except AttributeError:
            raise self._missing_object_failure()

    def parse(self, handle):
        """Parse *handle* block by block, yielding records as they complete.

        The top-level XML element must correspond to a list; otherwise
        ValueError is raised. Raises CorruptedXMLError or NotXMLError on
        malformed input.
        """
        BLOCK = 1024
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file
                if self.stack:
                    # No more XML data, but there is still some unfinished
                    # business. CorruptedXMLError requires a message, so
                    # raising the bare class would fail with a TypeError.
                    raise CorruptedXMLError("Premature end of XML stream")
                try:
                    remaining = self.object
                except AttributeError:
                    raise self._missing_object_failure()
                for record in remaining:
                    yield record
                self.parser.Parse("", True)
                self.parser = None
                return

            try:
                self.parser.Parse(text, False)
            except expat.ExpatError as e:
                raise self._expat_failure(e)

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if not isinstance(records, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            while len(records) > 1:  # Then the top record is finished
                record = records.pop(0)
                yield record

    def xmlDeclHandler(self, version, encoding, standalone):
        """Install the content handlers once the XML declaration is seen."""
        # XML declaration found; set the handlers
        self.parser.StartElementHandler = self.startElementHandler
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    def startNamespaceDeclHandler(self, prefix, un):
        """Reject XML namespaces, which this parser does not support."""
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

    def startElementHandler(self, name, attrs):
        """Create the container for an opening tag and push it on the stack.

        String, integer and error elements create no container; their
        attributes are remembered until the matching end tag. Raises
        ValidationError for a tag unknown to the DTD when validating.
        """
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings
              or name in self.errors
              or name in self.integers):
            self.attributes = attrs
            return
        else:
            # Element not found in DTD
            if self.validating:
                raise ValidationError(name)
            # this will not be stored in the record
            element = ""
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if self.stack:
                current = self.stack[-1]
                # A parent equal to "" is a skipped element (or a string
                # item); it cannot hold children, so do not try to attach.
                if current != "":
                    try:
                        current.append(element)
                    except AttributeError:
                        current[name] = element
        self.stack.append(element)

    def endElementHandler(self, name):
        """Finish an element: pop its container or store its text value.

        Raises RuntimeError with the element text for a non-empty error
        element.
        """
        value = self.content
        if name in self.errors:
            if value == "":
                return
            raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            value = self._text_element(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                return
            elif self.object.itemtype == "Integer" and value:
                value = IntegerElement(value)
            else:
                value = self._text_element(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            return
        value.tag = name
        # self.attributes is only set by startElementHandler for string,
        # integer and error elements, and is deleted after use, so it may
        # be absent here (e.g. for an ESummary item).
        attributes = getattr(self, "attributes", None)
        if attributes:
            value.attributes = dict(attributes)
            del self.attributes
        current = self.stack[-1]
        if current != "":
            try:
                current.append(value)
            except AttributeError:
                current[name] = value

    def characterDataHandler(self, content):
        """Accumulate character data for the element currently being read."""
        self.content += content

    def elementDecl(self, name, model):
        """This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error."""
        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        if name == 'Item' and model == (expat.model.XML_CTYPE_MIXED,
                                        expat.model.XML_CQUANT_REP,
                                        None, ((expat.model.XML_CTYPE_NAME,
                                                expat.model.XML_CQUANT_NONE,
                                                'Item',
                                                ()
                                                ),
                                               )
                                        ):
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        # First, remove ignorable parentheses around declarations
        while (model[0] in (expat.model.XML_CTYPE_SEQ,
                            expat.model.XML_CTYPE_CHOICE)
               and model[1] in (expat.model.XML_CQUANT_NONE,
                                expat.model.XML_CQUANT_OPT)
               and len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (expat.model.XML_CTYPE_MIXED,
                        expat.model.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if (model[0] in (expat.model.XML_CTYPE_CHOICE,
                         expat.model.XML_CTYPE_SEQ) and
            model[1] in (expat.model.XML_CQUANT_PLUS,
                         expat.model.XML_CQUANT_REP)):
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []
        repeating = (expat.model.XML_CQUANT_PLUS, expat.model.XML_CQUANT_REP)
        once = (expat.model.XML_CQUANT_NONE, expat.model.XML_CQUANT_OPT)

        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.
        def count(submodel):
            quantifier, key, children = submodel[1:]
            if key is None:
                if quantifier in repeating:
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif key.upper() != "ERROR":
                if quantifier in once:
                    single.append(key)
                elif quantifier in repeating:
                    multiple.append(key)

        count(model)
        if len(single) == 0 and len(multiple) == 1:
            self.lists.append(name)
        elif len(multiple) == 0:
            self.dictionaries.append(name)
        else:
            self.structures[name] = multiple

    def open_dtd_file(self, filename):
        """Open *filename* from the local or global DTD directory.

        Returns a binary file handle from the first directory in which the
        file can be opened, or None if it can be opened in neither.
        """
        for directory in (DataHandler.local_dtd_dir,
                          DataHandler.global_dtd_dir):
            path = os.path.join(directory, filename)
            try:
                return open(path, "rb")
            except IOError:
                continue
        return None

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Returns 1 after the DTD has been parsed. Raises ValueError for a
        systemId with an unsupported URL scheme, and RuntimeError if the DTD
        is not available locally and cannot be downloaded."""
        urlinfo = _urlparse(systemId)
        scheme = urlinfo[0]
        if scheme in ('http', 'https', 'ftp'):
            # Then this is an absolute path to the DTD.
            url = systemId
        elif scheme == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                parent = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(parent)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            # Fail with a clear message instead of leaving url unbound.
            raise ValueError("Unexpected URL scheme %r in DTD reference %s"
                             % (scheme, systemId))
        self.dtd_urls.append(url)
        # First, try to load the local version of the DTD file
        filename = os.path.basename(systemId)
        handle = self.open_dtd_file(filename)
        if not handle:
            # DTD is not available as a local file. Try accessing it through
            # the internet instead.
            message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. While we can
access the DTD file through the internet, the parser is much faster if the
required DTD files are available locally.

For this purpose, please download %s from

%s

and save it either in directory

%s

or in directory

%s

in order for Bio.Entrez to find it.

Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers about this missing DTD, by
reporting a bug on https://github.com/biopython/biopython/issues or sign
up to our mailing list and emailing us, so that we can include it with the
next release of Biopython.

Proceeding to access the DTD file through the internet...
""" % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename)
            warnings.warn(message)
            try:
                handle = _urlopen(url)
            except IOError:
                raise RuntimeError("Failed to access %s at %s" % (filename, url))

        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        try:
            parser.ParseFile(handle)
        finally:
            # Release the DTD handle even if parsing the DTD fails.
            handle.close()
        self.dtd_urls.pop()
        return 1
537