Package Bio :: Module ParserSupport
[hide private]
[frames | no frames]

Source Code for Module Bio.ParserSupport

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Code to support writing parsers (DEPRECATED). 
  7   
  8  Classes: 
  9   
 10      - AbstractParser         Base class for parsers. 
 11      - AbstractConsumer       Base class of all Consumers. 
 12      - TaggingConsumer        Consumer that tags output with its event.  For debugging 
 13      - EventGenerator         Generate Biopython Events from Martel XML output 
 14        (note that Martel has been removed) 
 15   
 16  Functions: 
 17   
 18      - safe_readline          Read a line from a handle, with check for EOF. 
 19      - safe_peekline          Peek at next line, with check for EOF. 
 20      - read_and_call          Read a line from a handle and pass it to a method. 
 21      - read_and_call_while    Read many lines, as long as a condition is met. 
 22      - read_and_call_until    Read many lines, until a condition is met. 
 23      - attempt_read_and_call  Like read_and_call, but forgiving of errors. 
 24      - is_blank_line          Test whether a line is blank. 
 25   
 26  """ 
 27   
 28  from Bio import BiopythonDeprecationWarning 
 29  import warnings 
 30  warnings.warn("Bio.ParserSupport is now deprecated will be removed in a " 
 31                "future release of Biopython.", BiopythonDeprecationWarning) 
 32   
 33  import sys 
 34  try: 
 35      from types import InstanceType 
 36  except ImportError: 
 37      # Python 3, see http://bugs.python.org/issue8206 
 38      InstanceType = object 
 39  from types import MethodType 
 40   
 41  from Bio._py3k import StringIO 
 42   
 43  from Bio import File 
 44   
 45  # XML from python 2.0 
 46  try: 
 47      from xml.sax import handler 
 48      xml_support = 1 
 49  except ImportError: 
 50      sys.stderr.write("Warning: Could not import SAX for dealing with XML.\n" + 
 51                       "This causes problems with some ParserSupport modules\n") 
 52      xml_support = 0 
 53   
 54  __docformat__ = "restructuredtext en" 
 55   
 56   
class AbstractParser(object):
    """Base class for other parsers.

    Derived classes supply ``parse``; ``parse_str`` and ``parse_file``
    are thin wrappers that build a handle and delegate to it.
    """

    def parse(self, handle):
        """Parse the data available from handle (must be overridden)."""
        raise NotImplementedError("Please implement in a derived class")

    def parse_str(self, string):
        """Wrap string in a StringIO handle and return self.parse() of it."""
        handle = StringIO(string)
        return self.parse(handle)

    def parse_file(self, filename):
        """Open filename, return self.parse() of it; the file is closed on exit."""
        with open(filename) as handle:
            return self.parse(handle)
71 72
class AbstractConsumer(object):
    """Base class for other Consumers.

    Derive Consumers from this class and implement appropriate
    methods for each event that you want to receive.

    Events that a subclass does not implement are silently ignored:
    looking up an unknown ``start_*`` / ``end_*`` attribute yields a
    no-argument no-op, and any other unknown attribute yields a
    one-argument no-op.
    """

    def _unhandled_section(self):
        """Ignore a section event (takes no data)."""
        pass

    def _unhandled(self, data):
        """Ignore a data event, discarding data."""
        pass

    def __getattr__(self, attr):
        """Return a no-op handler for any event not defined on the class.

        Raises AttributeError for double-underscore ("dunder") names.
        """
        # Special-method lookups made on the instance (e.g. __deepcopy__,
        # __setstate__, __getnewargs__ by copy/pickle) must report "missing";
        # answering them with a no-op handler makes copy.deepcopy() return
        # None and silently drops state on unpickling.
        if attr[:2] == '__' and attr[-2:] == '__':
            raise AttributeError(attr)
        if attr.startswith('start_') or attr.startswith('end_'):
            return self._unhandled_section
        return self._unhandled
92 93
class TaggingConsumer(AbstractConsumer):
    """A Consumer that tags the data stream with the event and
    prints it to a handle.  Useful for debugging.

    Every event name looked up on an instance is turned into a callable
    that writes one line to the handle: section events (``start_*`` /
    ``end_*``) write a row of asterisks followed by the event name, all
    other events write the event name and the (truncated) data.
    """

    def __init__(self, handle=None, colwidth=15, maxwidth=80):
        """Initialize the consumer.

        Arguments:
         - handle - object with a write() method; sys.stdout if None.
         - colwidth - width of the column holding the event name.
         - maxwidth - total width used to truncate each output line.
        """
        # I can't assign sys.stdout to handle in the argument list.
        # If I do that, handle will be assigned the value of sys.stdout
        # the first time this function is called.  This will fail if
        # the user has assigned sys.stdout to some other file, which may
        # be closed or invalid at a later time.
        if handle is None:
            handle = sys.stdout
        self._handle = handle
        self._colwidth = colwidth
        self._maxwidth = maxwidth

    def unhandled_section(self):
        """Print the literal event name 'unhandled_section'."""
        self._print_name('unhandled_section')

    def unhandled(self, data):
        """Print data tagged with the literal event name 'unhandled'."""
        self._print_name('unhandled', data)

    def _print_name(self, name, data=None):
        """Write one tagged line for event name to the handle."""
        if data is None:
            # Write the name of a section.
            self._handle.write("%s %s\n" % ("*" * self._colwidth, name))
        else:
            # Write the tag and line.
            self._handle.write("%-*s: %s\n" % (
                self._colwidth, name[:self._colwidth],
                data[:self._maxwidth - self._colwidth - 2].rstrip()))

    def __getattr__(self, attr):
        """Return a callable that prints the event called attr.

        Raises AttributeError for double-underscore ("dunder") names.
        """
        # Special-method lookups made on the instance by copy/pickle
        # (__deepcopy__, __setstate__, ...) must report "missing" instead
        # of being turned into printing callbacks.
        if attr[:2] == '__' and attr[-2:] == '__':
            raise AttributeError(attr)
        if attr[:6] == 'start_' or attr[:4] == 'end_':
            method = lambda a=attr, s=self: s._print_name(a)
        else:
            method = lambda x, a=attr, s=self: s._print_name(a, x)
        return method
# only use the Event Generator if XML handling is okay
if xml_support:
    class EventGenerator(handler.ContentHandler):
        """Handler to generate events associated with a Martel parsed file.

        This acts like a normal SAX handler, and accepts XML generated by
        Martel during parsing. These events are then converted into
        'Biopython events', which can then be caught by a standard
        biopython consumer.

        Note that Martel is now DEPRECATED.
        """

        def __init__(self, consumer, interest_tags, callback_finalizer=None,
                     exempt_tags=None):
            """Initialize to begin catching and firing off events.

            Arguments:
             - consumer - The consumer that we'll send Biopython events to.

             - interest_tags - A listing of all the tags we are interested in.

             - callback_finalizer - A function to deal with the collected
               information before passing it on to the consumer. By default
               the collected information is a list of all of the lines read
               for a particular tag -- if there are multiple tags in a row
               like:

                   <some_info>Spam</some_info>
                   <some_info>More Spam</some_info>

               In this case the list of information would be:

                   ['Spam', 'More Spam']

               This list of lines will be passed to the callback finalizer if
               it is present. Otherwise the consumer will be called with the
               list of content information.

             - exempt_tags - A listing of particular tags that are exempt from
               being processed by the callback_finalizer. This allows you to
               use a finalizer to deal with most tags, but leave those you
               don't want touched. Defaults to no exempt tags.
            """
            handler.ContentHandler.__init__(self)
            self._consumer = consumer
            self.interest_tags = interest_tags
            self._finalizer = callback_finalizer
            # A fresh list per instance; a literal [] default in the
            # signature would be one object shared by every instance.
            self._exempt_tags = [] if exempt_tags is None else exempt_tags

            # a dictionary of content for each tag of interest
            # the information for each tag is held as a list of the lines.
            # This allows us to collect information from multiple tags
            # in a row, and return it all at once.
            self.info = {}
            for tag in self.interest_tags:
                self.info[tag] = []

            # the previous tag we were collecting information for.
            # We set a delay in sending info to the consumer so that we can
            # collect a bunch of tags in a row and append all of the info
            # together.
            self._previous_tag = ''

            # the current character information for a tag
            self._cur_content = []
            # whether we should be collecting information
            self._collect_characters = False

        def startElement(self, name, attrs):
            """Determine if we should collect characters from this tag."""
            if name in self.interest_tags:
                self._collect_characters = True

        def characters(self, content):
            """Extract the information if we are interested in it."""
            if self._collect_characters:
                self._cur_content.append(content)

        def endElement(self, name):
            """Send the information to the consumer.

            Once we've got the end element we've collected up all of the
            character information we need, and we need to send this on to
            the consumer to do something with it.

            We have a delay of one tag on doing this, so that we can collect
            all of the info from multiple calls to the same element at once.
            """
            # only deal with the tag if it is something we are
            # interested in and potentially have information for
            if self._collect_characters:
                # NOTE(review): if an element that is not in interest_tags
                # closes while we are collecting (i.e. it is nested inside an
                # interest tag), this lookup raises KeyError -- confirm that
                # the XML fed to this handler never nests tags that way.
                # add all of the information collected inside this tag
                self.info[name].append("".join(self._cur_content))
                # reset our information and flags
                self._cur_content = []
                self._collect_characters = False

                # if we are at a new tag, pass on the info from the last tag
                if self._previous_tag and self._previous_tag != name:
                    self._make_callback(self._previous_tag)

                # set this tag as the next to be passed
                self._previous_tag = name

        def _make_callback(self, name):
            """Call the callback function with the info with the given name."""
            # look up the consumer method named after the tag
            callback_function = getattr(self._consumer, name)

            # --- pass back the information
            # if there is a finalizer, use that
            if self._finalizer is not None and name not in self._exempt_tags:
                info_to_pass = self._finalizer(self.info[name])
            # otherwise pass back the entire list of information
            else:
                info_to_pass = self.info[name]

            callback_function(info_to_pass)

            # reset the information for the tag
            self.info[name] = []

        def endDocument(self):
            """Make sure all of our information has been passed.

            This just flushes out any stored tags that need to be passed.
            """
            if self._previous_tag:
                self._make_callback(self._previous_tag)
267 268
def read_and_call(uhandle, method, **keywds):
    """read_and_call(uhandle, method[, start][, end][, contains][, blank][, has_re])

    Read one line from uhandle, validate it, and hand it to method.
    A ValueError is raised when the line fails a check (or when
    uhandle has no more lines).

    The optional keyword conditions are:

     - start, end - text the line must begin / end with (trailing
       whitespace is stripped before the end check).
     - contains - a substring that must occur in the line.
     - blank - if true the line must be blank; if false (but not None)
       it must be non-blank.
     - has_re - a compiled regular expression that must match somewhere
       in the line.

    """
    current = safe_readline(uhandle)
    problem = _fails_conditions(current, **keywds)
    if problem is not None:
        raise ValueError(problem)
    method(current)
289 290
def read_and_call_while(uhandle, method, **keywds):
    """read_and_call_while(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Keep reading lines from uhandle and passing them to method for as
    long as each line satisfies the conditions.  The first line that
    does not is pushed back with uhandle.saveline().  Returns how many
    lines were passed to method.

    See the docstring for read_and_call for a description of the parameters.

    """
    count = 0
    current = safe_readline(uhandle)
    while not _fails_conditions(current, **keywds):
        method(current)
        count += 1
        current = safe_readline(uhandle)
    # The line that ended the run belongs to whoever reads next.
    uhandle.saveline(current)
    return count
310 311
def read_and_call_until(uhandle, method, **keywds):
    """read_and_call_until(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Keep reading lines from uhandle and passing them to method until a
    line satisfies the conditions.  That line is pushed back with
    uhandle.saveline() rather than passed on.  Returns how many lines
    were passed to method.

    See the docstring for read_and_call for a description of the parameters.

    """
    count = 0
    current = safe_readline(uhandle)
    while _fails_conditions(current, **keywds):
        method(current)
        count += 1
        current = safe_readline(uhandle)
    # The matching line is left for the next reader.
    uhandle.saveline(current)
    return count
332 333
def attempt_read_and_call(uhandle, method, **keywds):
    """attempt_read_and_call(uhandle, method, **keywds) -> boolean

    Like read_and_call, but a line that fails the checks is pushed
    back with uhandle.saveline() instead of raising ValueError for the
    failed check.  Returns True if the line passed (and was given to
    method), False otherwise.

    See docs for read_and_call for a description of the function
    arguments.

    """
    current = safe_readline(uhandle)
    if _fails_conditions(current, **keywds):
        uhandle.saveline(current)
        return False
    method(current)
    return True
352 353
354 -def _fails_conditions(line, start=None, end=None, contains=None, blank=None, 355 has_re=None):
356 if start is not None: 357 if line[:len(start)] != start: 358 return "Line does not start with '%s':\n%s" % (start, line) 359 if end is not None: 360 if line.rstrip()[-len(end):] != end: 361 return "Line does not end with '%s':\n%s" % (end, line) 362 if contains is not None: 363 if contains not in line: 364 return "Line does not contain '%s':\n%s" % (contains, line) 365 if blank is not None: 366 if blank: 367 if not is_blank_line(line): 368 return "Expected blank line, but got:\n%s" % line 369 else: 370 if is_blank_line(line): 371 return "Expected non-blank line, but got a blank one" 372 if has_re is not None: 373 if has_re.search(line) is None: 374 return "Line does not match regex '%s':\n%s" % ( 375 has_re.pattern, line) 376 return None
377 378
def is_blank_line(line, allow_spaces=0):
    """is_blank_line(line, allow_spaces=0) -> boolean

    Return whether a line is blank.  An empty string is always blank.
    allow_spaces specifies whether to allow whitespaces in a blank
    line.  A true value signifies that a line containing whitespaces
    as well as end-of-line characters should be considered blank;
    otherwise the line is blank only if it begins with an end-of-line
    character.

    """
    if not line:
        # Always a real bool, so every branch returns the same type.
        return True
    if allow_spaces:
        return line.rstrip() == ''
    return line[0] in ('\n', '\r')
393 394
def safe_readline(handle):
    """safe_readline(handle) -> line

    Return the next line from handle.readline() (an UndoHandle in this
    module's usage).  A ValueError is raised when readline() returns
    nothing, i.e. there are no more lines to read.

    """
    next_line = handle.readline()
    if next_line:
        return next_line
    raise ValueError("Unexpected end of stream.")
406 407
def safe_peekline(handle):
    """safe_peekline(handle) -> line

    Return the result of handle.peekline() (an UndoHandle in this
    module's usage).  A ValueError is raised when peekline() returns
    nothing, i.e. there are no more lines to peek at.

    """
    upcoming = handle.peekline()
    if upcoming:
        return upcoming
    raise ValueError("Unexpected end of stream.")
419