Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. This 
  7  parser is used by the read() function in Bio.Entrez, and is not intended 
  8  to be used directly. 
  9  """ 
 10   
 11  # The question is how to represent an XML file as Python objects. Some 
 12  # XML files returned by NCBI look like lists, others look like dictionaries, 
 13  # and others look like a mix of lists and dictionaries. 
 14  # 
 15  # My approach is to classify each possible element in the XML as a plain 
 16  # string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  # dictionary where the same key can occur multiple times; in Python, it is 
 18  # represented as a dictionary where that key occurs once, pointing to a list 
 19  # of values found in the XML file. 
 20  # 
 21  # The parser then goes through the XML and creates the appropriate Python 
 22  # object for each element. The different levels encountered in the XML are 
 23  # preserved on the Python side. So a subelement of a subelement of an element 
 24  # is a value in a dictionary that is stored in a list which is a value in 
 25  # some other dictionary (or a value in a list which itself belongs to a list 
 26  # which is a value in a dictionary, and so on). Attributes encountered in 
 27  # the XML are stored as a dictionary in a member .attributes of each element, 
 28  # and the tag name is saved in a member .tag. 
 29  # 
 30  # To decide which kind of Python object corresponds to each element in the 
  31  # XML, the parser analyzes the DTD referred to at the top of (almost) every 
 32  # XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  # written solution, since the number of DTDs is rather large and their 
 34  # contents may change over time. About half the code in this parser deals 
  35  # with parsing the DTD, and the other half with the XML itself. 
 36   
 37   
 38  import os 
 39  import warnings 
 40  from xml.parsers import expat 
 41  from io import BytesIO 
 42   
 43  #Importing these functions with leading underscore as not intended for reuse 
 44  from Bio._py3k import urlopen as _urlopen 
 45  from Bio._py3k import urlparse as _urlparse 
 46  from Bio._py3k import unicode 
 47   
  48  # The following five classes are used to add a member .attributes to integers, 
  49  # strings, Unicode strings, lists, and dictionaries, respectively. 
 50   
 51   
class IntegerElement(int):
    """An int that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the int repr, extended with the attributes if any are set."""
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in int.
            return plain
        return "IntegerElement(%s, attributes=%s)" % (plain, repr(self.attributes))
class StringElement(str):
    """A str that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the str repr, extended with the attributes if any are set."""
        plain = str.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in str.
            return plain
        return "StringElement(%s, attributes=%s)" % (plain, repr(self.attributes))
class UnicodeElement(unicode):
    """A unicode string that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the unicode repr, extended with the attributes if any are set."""
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like the base type.
            return plain
        return "UnicodeElement(%s, attributes=%s)" % (plain, repr(self.attributes))
class ListElement(list):
    """A list that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the list repr, extended with the attributes if any are set."""
        plain = list.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in list.
            return plain
        return "ListElement(%s, attributes=%s)" % (plain, repr(self.attributes))
class DictionaryElement(dict):
    """A dict that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the dict repr, extended with the attributes if any are set."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in dict.
            return plain
        return "DictElement(%s, attributes=%s)" % (plain, repr(self.attributes))
# A StructureElement is like a dictionary, but some of its keys can have
# multiple values associated with it. These values are stored in a list
# under each key.
class StructureElement(dict):
    """A dict in which selected keys accumulate their values in a list."""

    def __init__(self, keys):
        """Create the structure with an empty list stored under each of *keys*.

        *keys* is remembered as ``self.listkeys``; assigning to one of those
        keys later appends to its list instead of replacing it.
        """
        # dict.__init__ stores the pairs directly, bypassing __setitem__ below.
        dict.__init__(self, ((key, []) for key in keys))
        self.listkeys = keys

    def __setitem__(self, key, value):
        """Append *value* for a list key; store it directly for any other key."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the dict repr, extended with the attributes if any are set."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "DictElement(%s, attributes=%s)" % (plain, repr(self.attributes))
class NotXMLError(ValueError):
    """Raised when the data given to the parser do not appear to be XML.

    The original message (or underlying exception) is kept in ``msg``.
    """

    def __init__(self, message):
        # Pass the message on to ValueError so that .args is populated.
        # Without this, copying or unpickling the exception would call the
        # constructor with no arguments and fail with a TypeError.
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        return "Failed to parse the XML data (%s). Please make sure that the input data are in XML format." % self.msg
class CorruptedXMLError(ValueError):
    """Raised when XML data were recognized but could not be parsed completely.

    The original message (or underlying exception) is kept in ``msg``.
    """

    def __init__(self, message):
        # Pass the message on to ValueError so that .args is populated.
        # Without this, copying or unpickling the exception would call the
        # constructor with no arguments and fail with a TypeError.
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        return "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted." % self.msg
class ValidationError(ValueError):
    """Validating parsers raise this error if the parser finds a tag in the
    XML that is not defined in the DTD. Non-validating parsers do not raise
    this error. The Bio.Entrez.read and Bio.Entrez.parse functions use
    validating parsers by default (see those functions for more information).

    The offending tag name is kept in ``name``.
    """

    def __init__(self, name):
        # Pass the tag name on to ValueError so that .args is populated.
        # Without this, copying or unpickling the exception would call the
        # constructor with no arguments and fail with a TypeError.
        ValueError.__init__(self, name)
        self.name = name

    def __str__(self):
        return "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False." % self.name
class DataHandler(object):
    """Expat-based handler that converts Entrez XML into Python objects.

    The DTD referenced by the XML is parsed first (see elementDecl) to
    classify every element as a string, list, dictionary, structure, item or
    error. The XML itself is then converted by the start/end element handlers
    into StringElement, ListElement, DictionaryElement and StructureElement
    objects, which are nested using ``self.stack``.

    Class attributes:
     - local_dtd_dir:  per-user directory in which downloaded DTDs are saved;
                       it is created when the class is defined.
     - global_dtd_dir: the "DTDs" directory inside the Bio.Entrez package.
    """

    import platform
    if platform.system() == 'Windows':
        directory = os.path.join(os.getenv("APPDATA"), "biopython")
    else:  # Unix/Linux/Mac
        home = os.path.expanduser('~')
        directory = os.path.join(home, '.config', 'biopython')
        del home
    local_dtd_dir = os.path.join(directory, 'Bio', 'Entrez', 'DTDs')
    # Remove the temporaries so they do not become class attributes.
    del directory
    del platform
    try:
        os.makedirs(local_dtd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError:
        # Check if local_dtd_dir already exists, and that it is a directory.
        # Trying os.makedirs first and then checking for os.path.isdir avoids
        # a race condition.
        if not os.path.isdir(local_dtd_dir):
            raise

    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    del Entrez

    def __init__(self, validate):
        """Create the expat parser and the (initially empty) element tables.

        If validate is true, a tag that is not declared in the DTD raises a
        ValidationError; otherwise such a tag is skipped.
        """
        self.stack = []
        # Element names, by category, as determined from the DTD.
        self.errors = []
        self.integers = []
        self.strings = []
        self.lists = []
        self.dictionaries = []
        self.structures = {}
        self.items = []
        # URLs of the DTDs currently being loaded (innermost last).
        self.dtd_urls = []
        self.validating = validate
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        # The remaining handlers are installed once the XML declaration has
        # been seen; see xmlDeclHandler.
        self.parser.XmlDeclHandler = self.xmlDeclHandler

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        Returns the object built from the top-level XML element. Raises
        IOError for a closed handle, CorruptedXMLError if expat fails after
        the XML declaration was seen, and NotXMLError if no XML declaration
        was seen.
        """
        # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser
        # expects binary data
        if handle.__class__.__name__ == 'EvilHandleHack':
            handle = handle._handle
        if hasattr(handle, "closed") and handle.closed:
            # Should avoid a possible Segmentation Fault, see:
            # http://bugs.python.org/issue4877
            raise IOError("Can't parse a closed handle")
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as e:
            if self.parser.StartElementHandler:
                # We saw the initial <!xml declaration, so we can be sure that
                # we are parsing XML data. Most likely, the XML file is
                # corrupted.
                raise CorruptedXMLError(e)
            else:
                # We have not seen the initial <!xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError(e)
        try:
            return self.object
        except AttributeError:
            if self.parser.StartElementHandler:
                # We saw the initial <!xml declaration, and expat didn't notice
                # any errors, so self.object should be defined. If not, this is
                # a bug.
                raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
            else:
                # We did not see the initial <!xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError("XML declaration not found")

    def parse(self, handle):
        """Parse the XML in blocks and yield the top-level records one by one.

        The top-level XML element must correspond to a list; otherwise a
        ValueError is raised. A record is yielded as soon as the next record
        has been started; the remaining records are yielded when the end of
        the input is reached. Raises CorruptedXMLError if the input ends while
        elements are still open or if expat fails after the XML declaration
        was seen, and NotXMLError if no XML declaration was seen.
        """
        BLOCK = 1024
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file
                if self.stack:
                    # No more XML data, but there is still some unfinished
                    # business. The exception needs a message: raising the
                    # bare class would fail with a TypeError instead.
                    raise CorruptedXMLError("Premature end of XML stream")
                try:
                    for record in self.object:
                        yield record
                except AttributeError:
                    if self.parser.StartElementHandler:
                        # We saw the initial <!xml declaration, and expat
                        # didn't notice any errors, so self.object should be
                        # defined. If not, this is a bug.
                        raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
                    else:
                        # We did not see the initial <!xml declaration, so
                        # probably the input data is not in XML format.
                        raise NotXMLError("XML declaration not found")
                self.parser.Parse("", True)
                self.parser = None
                return

            try:
                self.parser.Parse(text, False)
            except expat.ExpatError as e:
                if self.parser.StartElementHandler:
                    # We saw the initial <!xml declaration, so we can be sure
                    # that we are parsing XML data. Most likely, the XML file
                    # is corrupted.
                    raise CorruptedXMLError(e)
                else:
                    # We have not seen the initial <!xml declaration, so
                    # probably the input data is not in XML format.
                    raise NotXMLError(e)

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if not isinstance(records, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            while len(records) > 1:  # Then the top record is finished
                record = records.pop(0)
                yield record

    def xmlDeclHandler(self, version, encoding, standalone):
        """Install the remaining expat handlers once the XML declaration is seen."""
        # XML declaration found; set the handlers
        self.parser.StartElementHandler = self.startElementHandler
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    def startNamespaceDeclHandler(self, prefix, un):
        """Reject XML that declares a namespace; namespaces are not supported."""
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

    def startElementHandler(self, name, attrs):
        """Create the container for a starting element and push it on the stack.

        For string, error and integer elements nothing is pushed; only the
        attributes are remembered, and the value is created in
        endElementHandler. Raises ValidationError for a tag that is not in
        the DTD if the parser is validating.
        """
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                # Placeholder carrying the item name and type; the actual
                # value is created in endElementHandler.
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings or name in self.errors
              or name in self.integers):
            self.attributes = attrs
            return
        else:
            # Element not found in DTD
            if self.validating:
                raise ValidationError(name)
            else:
                # this will not be stored in the record
                element = ""
        # Compare against "" rather than testing truthiness: an empty list or
        # dictionary element must still be tagged and stored in its parent,
        # whereas "" and the empty StringElement placeholder must not.
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if len(self.stack) != 0:
                current = self.stack[-1]
                try:
                    current.append(element)
                except AttributeError:
                    # The parent is a dictionary or structure, not a list.
                    current[name] = element
        self.stack.append(element)

    def endElementHandler(self, name):
        """Finish an element and store its value in the enclosing container.

        For list, dictionary and structure elements, the container is popped
        from the stack and kept as self.object. For string, integer and
        scalar item elements, the value is built from the collected character
        data and added to the container on top of the stack. A non-empty
        error element raises RuntimeError with its text.
        """
        value = self.content
        if name in self.errors:
            if value == "":
                return
            else:
                raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            # Convert Unicode strings to plain strings if possible
            try:
                value = StringElement(value)
            except UnicodeEncodeError:
                value = UnicodeElement(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                return
            elif self.object.itemtype == "Integer" and value:
                value = IntegerElement(value)
            else:
                # Convert Unicode strings to plain strings if possible
                try:
                    value = StringElement(value)
                except UnicodeEncodeError:
                    value = UnicodeElement(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            return
        value.tag = name
        # startElementHandler does not set self.attributes for item elements,
        # and the attribute is deleted below after use, so it may not exist.
        attributes = getattr(self, "attributes", None)
        if attributes:
            value.attributes = dict(attributes)
            del self.attributes
        current = self.stack[-1]
        if current != "":
            try:
                current.append(value)
            except AttributeError:
                # The parent is a dictionary or structure, not a list.
                current[name] = value

    def characterDataHandler(self, content):
        """Collect the character data of the element currently being parsed."""
        self.content += content

    def elementDecl(self, name, model):
        """This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error.

        model is the content model tuple passed by expat:
        (type, quantifier, name, children).
        """
        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        if name == 'Item' and model == (expat.model.XML_CTYPE_MIXED,
                                        expat.model.XML_CQUANT_REP,
                                        None,
                                        ((expat.model.XML_CTYPE_NAME,
                                          expat.model.XML_CQUANT_NONE,
                                          'Item',
                                          ()
                                          ),
                                         )
                                        ):
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        # First, remove ignorable parentheses around declarations
        while (model[0] in (expat.model.XML_CTYPE_SEQ,
                            expat.model.XML_CTYPE_CHOICE)
               and model[1] in (expat.model.XML_CQUANT_NONE,
                                expat.model.XML_CQUANT_OPT)
               and len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (expat.model.XML_CTYPE_MIXED,
                        expat.model.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if (model[0] in (expat.model.XML_CTYPE_CHOICE,
                         expat.model.XML_CTYPE_SEQ) and
            model[1] in (expat.model.XML_CQUANT_PLUS,
                         expat.model.XML_CQUANT_REP)):
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []
        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.

        def count(submodel):
            quantifier, key, children = submodel[1:]
            if key is None:
                if quantifier in (expat.model.XML_CQUANT_PLUS,
                                  expat.model.XML_CQUANT_REP):
                    # A repeated group: each of its children can occur
                    # multiple times.
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif key.upper() != "ERROR":
                if quantifier in (expat.model.XML_CQUANT_NONE,
                                  expat.model.XML_CQUANT_OPT):
                    single.append(key)
                elif quantifier in (expat.model.XML_CQUANT_PLUS,
                                    expat.model.XML_CQUANT_REP):
                    multiple.append(key)

        count(model)
        if len(single) == 0 and len(multiple) == 1:
            self.lists.append(name)
        elif len(multiple) == 0:
            self.dictionaries.append(name)
        else:
            self.structures[name] = multiple

    def open_dtd_file(self, filename):
        """Open the DTD file named filename for reading in binary mode.

        The per-user directory (local_dtd_dir) is tried first, then the
        directory in the Bio.Entrez package (global_dtd_dir). Returns the
        open handle, or None if the file could not be opened in either place.
        """
        for directory in (DataHandler.local_dtd_dir,
                          DataHandler.global_dtd_dir):
            path = os.path.join(directory, filename)
            try:
                return open(path, "rb")
            except IOError:
                # Not available here; try the next location.
                continue
        return None

    def save_dtd_file(self, filename, text):
        """Save the DTD contents text as filename in local_dtd_dir.

        Saving is best-effort: if the file cannot be opened for writing, a
        warning is issued and parsing continues.
        """
        path = os.path.join(DataHandler.local_dtd_dir, filename)
        try:
            handle = open(path, "wb")
        except IOError:
            warnings.warn("Failed to save %s at %s" % (filename, path))
        else:
            # Close the file even if writing fails.
            with handle:
                handle.write(text)

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Returns 1 to tell expat that the entity was handled. Raises
        ValueError if systemId uses an unexpected URL scheme, and
        RuntimeError if the DTD is not available locally and cannot be
        downloaded.
        """
        urlinfo = _urlparse(systemId)
        # Following attribute requires Python 2.5+
        # if urlinfo.scheme=='http':
        if urlinfo[0] in ('http', 'https', 'ftp'):
            # Then this is an absolute path to the DTD.
            url = systemId
        elif urlinfo[0] == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                url = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(url)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            # Without this branch, url would be unbound below.
            raise ValueError("Unexpected URL scheme %r" % (urlinfo[0],))
        self.dtd_urls.append(url)
        try:
            # First, try to load the local version of the DTD file
            filename = os.path.basename(systemId)
            handle = self.open_dtd_file(filename)
            if not handle:
                # DTD is not available as a local file. Try accessing it
                # through the internet instead.
                try:
                    handle = _urlopen(url)
                except IOError:
                    raise RuntimeError("Failed to access %s at %s" % (filename, url))
                try:
                    text = handle.read()
                finally:
                    handle.close()
                self.save_dtd_file(filename, text)
                handle = BytesIO(text)

            try:
                parser = self.parser.ExternalEntityParserCreate(context)
                parser.ElementDeclHandler = self.elementDecl
                parser.ParseFile(handle)
            finally:
                handle.close()
        finally:
            # Keep the stack of DTD URLs balanced, also if loading failed.
            self.dtd_urls.pop()
        return 1