Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. This 
  7  parser is used by the read() function in Bio.Entrez, and is not intended 
  8  to be used directly. 
  9  """ 
 10   
 11  # The question is how to represent an XML file as Python objects. Some 
 12  # XML files returned by NCBI look like lists, others look like dictionaries, 
 13  # and others look like a mix of lists and dictionaries. 
 14  # 
 15  # My approach is to classify each possible element in the XML as a plain 
 16  # string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  # dictionary where the same key can occur multiple times; in Python, it is 
 18  # represented as a dictionary where that key occurs once, pointing to a list 
 19  # of values found in the XML file. 
 20  # 
 21  # The parser then goes through the XML and creates the appropriate Python 
 22  # object for each element. The different levels encountered in the XML are 
 23  # preserved on the Python side. So a subelement of a subelement of an element 
 24  # is a value in a dictionary that is stored in a list which is a value in 
 25  # some other dictionary (or a value in a list which itself belongs to a list 
 26  # which is a value in a dictionary, and so on). Attributes encountered in  
 27  # the XML are stored as a dictionary in a member .attributes of each element, 
 28  # and the tag name is saved in a member .tag. 
 29  # 
 30  # To decide which kind of Python object corresponds to each element in the 
 31  # XML, the parser analyzes the DTD referred to at the top of (almost) every 
 32  # XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  # written solution, since the number of DTDs is rather large and their 
 34  # contents may change over time. About half the code in this parser deals 
 35  # with parsing the DTD, and the other half with the XML itself. 
 36   
 37   
 38  import os.path 
 39  import urlparse 
 40  import urllib 
 41  import warnings 
 42  from xml.parsers import expat 
 43   
 44  # The following five classes are used to add a member .attributes to integers, 
 45  # strings, Unicode strings, lists, and dictionaries, respectively. 
 46   
class IntegerElement(int):
    """An int that can carry extra instance attributes.

    Being a subclass, instances accept attribute assignment; __repr__
    looks for an optional ``attributes`` member.
    """

    def __repr__(self):
        """Return the int repr, annotated with ``attributes`` when set."""
        plain = int.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached: look exactly like an ordinary int.
            return plain
        else:
            return "IntegerElement(%s, attributes=%r)" % (plain, extra)
55
class StringElement(str):
    """A str that can carry extra instance attributes.

    __repr__ looks for an optional ``attributes`` member.
    """

    def __repr__(self):
        """Return the str repr, annotated with ``attributes`` when set."""
        plain = str.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached: look exactly like an ordinary str.
            return plain
        else:
            return "StringElement(%s, attributes=%r)" % (plain, extra)
64
class UnicodeElement(unicode):
    """A unicode string that can carry extra instance attributes.

    __repr__ looks for an optional ``attributes`` member.
    """

    def __repr__(self):
        """Return the unicode repr, annotated with ``attributes`` when set."""
        plain = unicode.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached: look exactly like an ordinary unicode.
            return plain
        else:
            return "UnicodeElement(%s, attributes=%r)" % (plain, extra)
73
class ListElement(list):
    """A list that can carry extra instance attributes.

    __repr__ looks for an optional ``attributes`` member.
    """

    def __repr__(self):
        """Return the list repr, annotated with ``attributes`` when set."""
        plain = list.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached: look exactly like an ordinary list.
            return plain
        else:
            return "ListElement(%s, attributes=%r)" % (plain, extra)
82
class DictionaryElement(dict):
    """A dict that can carry extra instance attributes.

    __repr__ looks for an optional ``attributes`` member.
    """

    def __repr__(self):
        """Return the dict repr, annotated with ``attributes`` when set."""
        plain = dict.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached: look exactly like an ordinary dict.
            return plain
        else:
            # NOTE(review): the label reads "DictElement", not the class
            # name; kept verbatim because callers may compare this output.
            return "DictElement(%s, attributes=%r)" % (plain, extra)
# A StructureElement behaves like a dictionary in which selected keys may
# receive several values; every value assigned to such a key is collected
# in a list stored under that key.
class StructureElement(dict):
    """Dictionary whose ``listkeys`` accumulate values instead of replacing."""

    def __init__(self, keys):
        """Create the structure with an empty list under each of *keys*.

        *keys* is kept (not copied) as ``self.listkeys``.
        """
        # Passing pairs to dict.__init__ bypasses the overridden
        # __setitem__, so each key starts out with its own empty list.
        dict.__init__(self, [(key, []) for key in keys])
        self.listkeys = keys

    def __setitem__(self, key, value):
        """Append *value* for multi-valued keys; plain assignment otherwise."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the dict repr, annotated with ``attributes`` when set."""
        plain = dict.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            return plain
        else:
            return "DictElement(%s, attributes=%r)" % (plain, extra)
113 114
class NotXMLError(ValueError):
    """Error reported when the input does not look like XML at all."""

    def __init__(self, message):
        """Store the underlying reason as ``self.msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation including the stored reason."""
        template = ("Failed to parse the XML data (%s). Please make sure "
                    "that the input data are in XML format.")
        return template % self.msg
120 121
class CorruptedXMLError(ValueError):
    """Error reported when XML input was recognised but could not be parsed."""

    def __init__(self, message):
        """Store the underlying reason as ``self.msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation including the stored reason."""
        template = ("Failed to parse the XML data (%s). Please make sure "
                    "that the input data are not corrupted.")
        return template % self.msg
127 128
class ValidationError(ValueError):
    """Raised by validating parsers for a tag that the DTD does not define.

    Non-validating parsers do not raise this error. The Bio.Entrez.read
    and Bio.Entrez.parse functions use validating parsers by default (see
    those functions for more information).
    """

    def __init__(self, name):
        """Remember the offending tag name as ``self.name``."""
        self.name = name

    def __str__(self):
        """Return the user-facing explanation naming the unknown tag."""
        template = ("Failed to find tag '%s' in the DTD. To skip all tags "
                    "that are not represented in the DTD, please call "
                    "Bio.Entrez.read or Bio.Entrez.parse with "
                    "validate=False.")
        return template % self.name
135 136
137 -class DataHandler(object):
138 139 home = os.path.expanduser('~') 140 local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs') 141 del home 142 143 from Bio import Entrez 144 global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs") 145 del Entrez 146
147 - def __init__(self, validate):
148 self.stack = [] 149 self.errors = [] 150 self.integers = [] 151 self.strings = [] 152 self.lists = [] 153 self.dictionaries = [] 154 self.structures = {} 155 self.items = [] 156 self.dtd_urls = [] 157 self.validating = validate 158 self.parser = expat.ParserCreate(namespace_separator=" ") 159 self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) 160 self.parser.XmlDeclHandler = self.xmlDeclHandler
161
162 - def read(self, handle):
163 """Set up the parser and let it parse the XML results""" 164 if hasattr(handle, "closed") and handle.closed: 165 #Should avoid a possible Segmentation Fault, see: 166 #http://bugs.python.org/issue4877 167 raise IOError("Can't parse a closed handle") 168 try: 169 self.parser.ParseFile(handle) 170 except expat.ExpatError, e: 171 if self.parser.StartElementHandler: 172 # We saw the initial <!xml declaration, so we can be sure that 173 # we are parsing XML data. Most likely, the XML file is 174 # corrupted. 175 raise CorruptedXMLError(e) 176 else: 177 # We have not seen the initial <!xml declaration, so probably 178 # the input data is not in XML format. 179 raise NotXMLError(e) 180 try: 181 return self.object 182 except AttributeError: 183 if self.parser.StartElementHandler: 184 # We saw the initial <!xml declaration, and expat didn't notice 185 # any errors, so self.object should be defined. If not, this is 186 # a bug. 187 raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.") 188 else: 189 # We did not see the initial <!xml declaration, so probably 190 # the input data is not in XML format. 191 raise NotXMLError("XML declaration not found")
192
193 - def parse(self, handle):
194 BLOCK = 1024 195 while True: 196 #Read in another block of the file... 197 text = handle.read(BLOCK) 198 if not text: 199 # We have reached the end of the XML file 200 if self.stack: 201 # No more XML data, but there is still some unfinished 202 # business 203 raise CorruptedXMLError 204 try: 205 for record in self.object: 206 yield record 207 except AttributeError: 208 if self.parser.StartElementHandler: 209 # We saw the initial <!xml declaration, and expat 210 # didn't notice any errors, so self.object should be 211 # defined. If not, this is a bug. 212 raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.") 213 else: 214 # We did not see the initial <!xml declaration, so 215 # probably the input data is not in XML format. 216 raise NotXMLError("XML declaration not found") 217 self.parser.Parse("", True) 218 self.parser = None 219 return 220 221 try: 222 self.parser.Parse(text, False) 223 except expat.ExpatError, e: 224 if self.parser.StartElementHandler: 225 # We saw the initial <!xml declaration, so we can be sure 226 # that we are parsing XML data. Most likely, the XML file 227 # is corrupted. 228 raise CorruptedXMLError(e) 229 else: 230 # We have not seen the initial <!xml declaration, so 231 # probably the input data is not in XML format. 232 raise NotXMLError(e) 233 234 if not self.stack: 235 # Haven't read enough from the XML file yet 236 continue 237 238 records = self.stack[0] 239 if not isinstance(records, list): 240 raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse") 241 while len(records) > 1: # Then the top record is finished 242 record = records.pop(0) 243 yield record
244
245 - def xmlDeclHandler(self, version, encoding, standalone):
246 # XML declaration found; set the handlers 247 self.parser.StartElementHandler = self.startElementHandler 248 self.parser.EndElementHandler = self.endElementHandler 249 self.parser.CharacterDataHandler = self.characterDataHandler 250 self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler 251 self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler
252
253 - def startNamespaceDeclHandler(self, prefix, un):
254 raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
255
256 - def startElementHandler(self, name, attrs):
257 self.content = "" 258 if name in self.lists: 259 object = ListElement() 260 elif name in self.dictionaries: 261 object = DictionaryElement() 262 elif name in self.structures: 263 object = StructureElement(self.structures[name]) 264 elif name in self.items: # Only appears in ESummary 265 name = str(attrs["Name"]) # convert from Unicode 266 del attrs["Name"] 267 itemtype = str(attrs["Type"]) # convert from Unicode 268 del attrs["Type"] 269 if itemtype=="Structure": 270 object = DictionaryElement() 271 elif name in ("ArticleIds", "History"): 272 object = StructureElement(["pubmed", "medline"]) 273 elif itemtype=="List": 274 object = ListElement() 275 else: 276 object = StringElement() 277 object.itemname = name 278 object.itemtype = itemtype 279 elif name in self.strings + self.errors + self.integers: 280 self.attributes = attrs 281 return 282 else: 283 # Element not found in DTD 284 if self.validating: 285 raise ValidationError(name) 286 else: 287 # this will not be stored in the record 288 object = "" 289 if object!="": 290 object.tag = name 291 if attrs: 292 object.attributes = dict(attrs) 293 if len(self.stack)!=0: 294 current = self.stack[-1] 295 try: 296 current.append(object) 297 except AttributeError: 298 current[name] = object 299 self.stack.append(object)
300
301 - def endElementHandler(self, name):
302 value = self.content 303 if name in self.errors: 304 if value=="": 305 return 306 else: 307 raise RuntimeError(value) 308 elif name in self.integers: 309 value = IntegerElement(value) 310 elif name in self.strings: 311 # Convert Unicode strings to plain strings if possible 312 try: 313 value = StringElement(value) 314 except UnicodeEncodeError: 315 value = UnicodeElement(value) 316 elif name in self.items: 317 self.object = self.stack.pop() 318 if self.object.itemtype in ("List", "Structure"): 319 return 320 elif self.object.itemtype=="Integer" and value: 321 value = IntegerElement(value) 322 else: 323 # Convert Unicode strings to plain strings if possible 324 try: 325 value = StringElement(value) 326 except UnicodeEncodeError: 327 value = UnicodeElement(value) 328 name = self.object.itemname 329 else: 330 self.object = self.stack.pop() 331 return 332 value.tag = name 333 if self.attributes: 334 value.attributes = dict(self.attributes) 335 del self.attributes 336 current = self.stack[-1] 337 if current!="": 338 try: 339 current.append(value) 340 except AttributeError: 341 current[name] = value
342
343 - def characterDataHandler(self, content):
344 self.content += content
345
346 - def elementDecl(self, name, model):
347 """This callback function is called for each element declaration: 348 <!ELEMENT name (...)> 349 encountered in a DTD. The purpose of this function is to determine 350 whether this element should be regarded as a string, integer, list 351 dictionary, structure, or error.""" 352 if name.upper()=="ERROR": 353 self.errors.append(name) 354 return 355 if name=='Item' and model==(expat.model.XML_CTYPE_MIXED, 356 expat.model.XML_CQUANT_REP, 357 None, ((expat.model.XML_CTYPE_NAME, 358 expat.model.XML_CQUANT_NONE, 359 'Item', 360 () 361 ), 362 ) 363 ): 364 # Special case. As far as I can tell, this only occurs in the 365 # eSummary DTD. 366 self.items.append(name) 367 return 368 # First, remove ignorable parentheses around declarations 369 while (model[0] in (expat.model.XML_CTYPE_SEQ, 370 expat.model.XML_CTYPE_CHOICE) 371 and model[1] in (expat.model.XML_CQUANT_NONE, 372 expat.model.XML_CQUANT_OPT) 373 and len(model[3])==1): 374 model = model[3][0] 375 # PCDATA declarations correspond to strings 376 if model[0] in (expat.model.XML_CTYPE_MIXED, 377 expat.model.XML_CTYPE_EMPTY): 378 self.strings.append(name) 379 return 380 # List-type elements 381 if (model[0] in (expat.model.XML_CTYPE_CHOICE, 382 expat.model.XML_CTYPE_SEQ) and 383 model[1] in (expat.model.XML_CQUANT_PLUS, 384 expat.model.XML_CQUANT_REP)): 385 self.lists.append(name) 386 return 387 # This is the tricky case. Check which keys can occur multiple 388 # times. If only one key is possible, and it can occur multiple 389 # times, then this is a list. If more than one key is possible, 390 # but none of them can occur multiple times, then this is a 391 # dictionary. Otherwise, this is a structure. 392 # In 'single' and 'multiple', we keep track which keys can occur 393 # only once, and which can occur multiple times. 394 single = [] 395 multiple = [] 396 # The 'count' function is called recursively to make sure all the 397 # children in this model are counted. 
Error keys are ignored; 398 # they raise an exception in Python. 399 def count(model): 400 quantifier, name, children = model[1:] 401 if name==None: 402 if quantifier in (expat.model.XML_CQUANT_PLUS, 403 expat.model.XML_CQUANT_REP): 404 for child in children: 405 multiple.append(child[2]) 406 else: 407 for child in children: 408 count(child) 409 elif name.upper()!="ERROR": 410 if quantifier in (expat.model.XML_CQUANT_NONE, 411 expat.model.XML_CQUANT_OPT): 412 single.append(name) 413 elif quantifier in (expat.model.XML_CQUANT_PLUS, 414 expat.model.XML_CQUANT_REP): 415 multiple.append(name)
416 count(model) 417 if len(single)==0 and len(multiple)==1: 418 self.lists.append(name) 419 elif len(multiple)==0: 420 self.dictionaries.append(name) 421 else: 422 self.structures.update({name: multiple})
423
424 - def open_dtd_file(self, filename):
425 path = os.path.join(DataHandler.local_dtd_dir, filename) 426 try: 427 handle = open(path, "rb") 428 except IOError: 429 pass 430 else: 431 return handle 432 path = os.path.join(DataHandler.global_dtd_dir, filename) 433 try: 434 handle = open(path, "rb") 435 except IOError: 436 pass 437 else: 438 return handle 439 return None
440
441 - def externalEntityRefHandler(self, context, base, systemId, publicId):
442 """The purpose of this function is to load the DTD locally, instead 443 of downloading it from the URL specified in the XML. Using the local 444 DTD results in much faster parsing. If the DTD is not found locally, 445 we try to download it. If new DTDs become available from NCBI, 446 putting them in Bio/Entrez/DTDs will allow the parser to see them.""" 447 urlinfo = urlparse.urlparse(systemId) 448 #Following attribute requires Python 2.5+ 449 #if urlinfo.scheme=='http': 450 if urlinfo[0]=='http': 451 # Then this is an absolute path to the DTD. 452 url = systemId 453 elif urlinfo[0]=='': 454 # Then this is a relative path to the DTD. 455 # Look at the parent URL to find the full path. 456 try: 457 url = self.dtd_urls[-1] 458 except IndexError: 459 # Assume the default URL for DTDs if the top parent 460 # does not contain an absolute path 461 source = "http://www.ncbi.nlm.nih.gov/dtd/" 462 else: 463 source = os.path.dirname(url) 464 url = os.path.join(source, systemId) 465 self.dtd_urls.append(url) 466 # First, try to load the local version of the DTD file 467 location, filename = os.path.split(systemId) 468 handle = self.open_dtd_file(filename) 469 if not handle: 470 # DTD is not available as a local file. Try accessing it through 471 # the internet instead. 472 message = """\ 473 Unable to load DTD file %s. 474 475 Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez. 476 Though most of NCBI's DTD files are included in the Biopython distribution, 477 sometimes you may find that a particular DTD file is missing. While we can 478 access the DTD file through the internet, the parser is much faster if the 479 required DTD files are available locally. 480 481 For this purpose, please download %s from 482 483 %s 484 485 and save it either in directory 486 487 %s 488 489 or in directory 490 491 %s 492 493 in order for Bio.Entrez to find it. 
494 495 Alternatively, you can save %s in the directory 496 Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython. 497 498 Please also inform the Biopython developers about this missing DTD, by 499 reporting a bug on http://bugzilla.open-bio.org/ or sign up to our mailing 500 list and emailing us, so that we can include it with the next release of 501 Biopython. 502 503 Proceeding to access the DTD file through the internet... 504 """ % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename) 505 warnings.warn(message) 506 try: 507 handle = urllib.urlopen(url) 508 except IOError: 509 raise RuntimeException("Failed to access %s at %s" % (filename, url)) 510 511 parser = self.parser.ExternalEntityParserCreate(context) 512 parser.ElementDeclHandler = self.elementDecl 513 parser.ParseFile(handle) 514 handle.close() 515 self.dtd_urls.pop() 516 return 1
517