Package Bio :: Module ParserSupport
[hide private]
[frames | no frames]

Source Code for Module Bio.ParserSupport

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Code to support writing parsers (DEPRECATED). 
  7   
  8  Classes: 
  9   
 10      - AbstractParser         Base class for parsers. 
 11      - AbstractConsumer       Base class of all Consumers. 
 12      - TaggingConsumer        Consumer that tags output with its event.  For debugging 
 13      - EventGenerator         Generate Biopython Events from Martel XML output 
 14        (note that Martel has been removed) 
 15   
 16  Functions: 
 17   
 18      - safe_readline          Read a line from a handle, with check for EOF. 
 19      - safe_peekline          Peek at next line, with check for EOF. 
 20      - read_and_call          Read a line from a handle and pass it to a method. 
 21      - read_and_call_while    Read many lines, as long as a condition is met. 
 22      - read_and_call_until    Read many lines, until a condition is met. 
 23      - attempt_read_and_call  Like read_and_call, but forgiving of errors. 
 24      - is_blank_line          Test whether a line is blank. 
 25   
 26  """ 
 27   
 28  from Bio import BiopythonDeprecationWarning 
 29  import warnings 
 30  warnings.warn("Bio.ParserSupport is now deprecated will be removed in a " 
 31                "future release of Biopython.", BiopythonDeprecationWarning) 
 32   
 33  import sys 
 34  try: 
 35      from types import InstanceType 
 36  except ImportError: 
 37      # Python 3, see http://bugs.python.org/issue8206 
 38      InstanceType = object 
 39  from types import MethodType 
 40   
 41  from Bio._py3k import StringIO 
 42   
 43  from Bio import File 
 44   
 45  # XML from python 2.0 
 46  try: 
 47      from xml.sax import handler 
 48      xml_support = 1 
 49  except ImportError: 
 50      sys.stderr.write("Warning: Could not import SAX for dealing with XML.\n" + 
 51                       "This causes problems with some ParserSupport modules\n") 
 52      xml_support = 0 
 53   
 54  __docformat__ = "restructuredtext en" 
 55   
class AbstractParser(object):
    """Common ancestor for parser classes.

    Subclasses provide ``parse``; the string and file helpers below only
    wrap their input in a handle and hand it to ``parse``.

    """

    def parse(self, handle):
        """Parse the data available from handle (must be overridden)."""
        raise NotImplementedError("Please implement in a derived class")

    def parse_str(self, string):
        """Wrap string in a StringIO handle and return the parse result."""
        buffer = StringIO(string)
        return self.parse(buffer)

    def parse_file(self, filename):
        """Open filename, parse its contents and return the parse result."""
        # The with-block closes the file whether or not parsing succeeds.
        with open(filename) as stream:
            return self.parse(stream)
70 71
class AbstractConsumer(object):
    """Common ancestor for Consumer classes.

    Subclass this and define one method per event you want to receive.
    Events without a matching method fall through to the do-nothing
    handlers below.

    """

    def _unhandled_section(self):
        """Ignore a section event (``start_*`` / ``end_*``)."""
        pass

    def _unhandled(self, data):
        """Ignore a data event; data is discarded."""
        pass

    def __getattr__(self, attr):
        """Return the do-nothing handler for an attribute that was not found."""
        # __getattr__ only runs after normal lookup has failed, so methods
        # defined on a subclass are never shadowed by this fallback.
        if attr.startswith(('start_', 'end_')):
            return self._unhandled_section
        return self._unhandled
91 92
class TaggingConsumer(AbstractConsumer):
    """Consumer that writes every event, labelled with its name, to a handle.

    Intended as a debugging aid.

    """

    def __init__(self, handle=None, colwidth=15, maxwidth=80):
        """TaggingConsumer(handle=sys.stdout, colwidth=15, maxwidth=80)"""
        # sys.stdout is resolved here rather than in the signature: a default
        # argument is evaluated once, so it would pin whatever sys.stdout was
        # at that moment, which may later be replaced, closed or invalid.
        self._handle = sys.stdout if handle is None else handle
        self._colwidth = colwidth
        self._maxwidth = maxwidth

    def unhandled_section(self):
        """Write a section marker named 'unhandled_section'."""
        self._print_name('unhandled_section')

    def unhandled(self, data):
        """Write data tagged as 'unhandled'."""
        self._print_name('unhandled', data)

    def _print_name(self, name, data=None):
        """Write one output line for the event called name.

        Without data a section marker (a row of asterisks and the name) is
        written.  With data, the name is truncated/padded to the column
        width and followed by the data, cut so the line stays within the
        maximum width and stripped of trailing whitespace.
        """
        width = self._colwidth
        if data is not None:
            # Two characters are reserved for the ": " separator.
            room = self._maxwidth - width - 2
            text = data[:room].rstrip()
            self._handle.write("%-*s: %s\n" % (width, name[:width], text))
            return
        self._handle.write("%s %s\n" % ("*" * width, name))

    def __getattr__(self, attr):
        """Build a handler on the fly for any event that has no method."""
        # The attribute name and the instance are captured through default
        # arguments of the lambdas.
        if attr.startswith(('start_', 'end_')):
            return lambda a=attr, s=self: s._print_name(a)
        return lambda x, a=attr, s=self: s._print_name(a, x)
# only use the Event Generator if XML handling is okay
if xml_support:
    class EventGenerator(handler.ContentHandler):
        """Handler to generate events associated with a Martel parsed file.

        This acts like a normal SAX handler, and accepts XML generated by
        Martel during parsing. These events are then converted into
        'Biopython events', which can then be caught by a standard
        biopython consumer.

        Note that Martel is now DEPRECATED.
        """

        def __init__(self, consumer, interest_tags, callback_finalizer=None,
                     exempt_tags=None):
            """Initialize to begin catching and firing off events.

            Arguments:
             - consumer - The consumer that we'll send Biopython events to.

             - interest_tags - A listing of all the tags we are interested in.

             - callback_finalizer - A function to deal with the collected
               information before passing it on to the consumer. By default
               the collected information is a list of all of the lines read
               for a particular tag -- if there are multiple tags in a row
               like:

                   <some_info>Spam</some_info>
                   <some_info>More Spam</some_info>

               In this case the list of information would be:

                   ['Spam', 'More Spam']

               This list of lines will be passed to the callback finalizer if
               it is present. Otherwise the consumer will be called with the
               list of content information.

             - exempt_tags - A listing of particular tags that are exempt from
               being processed by the callback_finalizer. This allows you to
               use a finalizer to deal with most tags, but leave those you
               don't want touched. Defaults to no exempt tags.
            """
            self._consumer = consumer
            self.interest_tags = interest_tags
            self._finalizer = callback_finalizer
            # A fresh list per instance; a literal [] default in the signature
            # would be one list object shared by every EventGenerator.
            self._exempt_tags = [] if exempt_tags is None else exempt_tags

            # a dictionary of content for each tag of interest
            # the information for each tag is held as a list of the lines.
            # This allows us to collect information from multiple tags
            # in a row, and return it all at once.
            self.info = {}
            for tag in self.interest_tags:
                self.info[tag] = []

            # the previous tag we were collecting information for.
            # We set a delay in sending info to the consumer so that we can
            # collect a bunch of tags in a row and append all of the info
            # together.
            self._previous_tag = ''

            # the current character information for a tag
            self._cur_content = []
            # whether we should be collecting information
            self._collect_characters = 0

        def startElement(self, name, attrs):
            """Determine if we should collect characters from this tag.

            attrs is accepted to satisfy the SAX interface but is not used.
            """
            if name in self.interest_tags:
                self._collect_characters = 1

        def characters(self, content):
            """Extract the information if we are interested in it."""
            if self._collect_characters:
                self._cur_content.append(content)

        def endElement(self, name):
            """Send the information to the consumer.

            Once we've got the end element we've collected up all of the
            character information we need, and we need to send this on to
            the consumer to do something with it.

            We have a delay of one tag on doing this, so that we can collect
            all of the info from multiple calls to the same element at once.
            """
            # only deal with the tag if it is something we are
            # interested in and potentially have information for
            if self._collect_characters:
                # add all of the information collected inside this tag
                # NOTE(review): self.info only has keys for interest_tags, so
                # a non-interest tag closing while collection is switched on
                # (i.e. nested inside an interest tag) raises KeyError here --
                # confirm that interest tags are always leaf elements.
                self.info[name].append("".join(self._cur_content))
                # reset our information and flags
                self._cur_content = []
                self._collect_characters = 0

                # if we are at a new tag, pass on the info from the last tag
                if self._previous_tag and self._previous_tag != name:
                    self._make_callback(self._previous_tag)

                # set this tag as the next to be passed
                self._previous_tag = name

        def _make_callback(self, name):
            """Call the callback function with the info with the given name."""
            # the consumer method to call is the one named after the tag
            callback_function = getattr(self._consumer, name)

            # --- pass back the information
            # if there is a finalizer, use that
            if self._finalizer is not None and name not in self._exempt_tags:
                info_to_pass = self._finalizer(self.info[name])
            # otherwise pass back the entire list of information
            else:
                info_to_pass = self.info[name]

            callback_function(info_to_pass)

            # reset the information for the tag
            self.info[name] = []

        def endDocument(self):
            """Make sure all of our information has been passed.

            This just flushes out any stored tags that need to be passed.
            """
            if self._previous_tag:
                self._make_callback(self._previous_tag)
266 267
def read_and_call(uhandle, method, **keywds):
    """read_and_call(uhandle, method[, start][, end][, contains][, blank][, has_re])

    Fetch one line from uhandle, validate it, and hand it to method.
    A ValueError is raised when the line fails validation.

    The optional keywords describe what the line has to look like:
    start and end give the text the line must begin or end with (EOL
    characters are not counted for end), contains gives a substring
    that must occur in the line, blank (when true) requires the line
    to be blank, and has_re is a compiled regular expression that must
    match somewhere in the line.

    """
    current = safe_readline(uhandle)
    problem = _fails_conditions(current, **keywds)
    if problem is not None:
        raise ValueError(problem)
    method(current)
288 289
def read_and_call_while(uhandle, method, **keywds):
    """read_and_call_while(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Keep reading lines from uhandle and passing them to method for as
    long as each line satisfies the conditions.  The first line that
    does not is pushed back onto uhandle.  Returns how many lines were
    passed to method.

    The keyword parameters are described in the docstring of
    read_and_call.

    """
    count = 0
    current = safe_readline(uhandle)
    while not _fails_conditions(current, **keywds):
        method(current)
        count += 1
        current = safe_readline(uhandle)
    # The line that ended the run belongs to whoever reads next.
    uhandle.saveline(current)
    return count
309 310
def read_and_call_until(uhandle, method, **keywds):
    """read_and_call_until(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Keep reading lines from uhandle and passing them to method until a
    line satisfies the conditions.  That line is pushed back onto
    uhandle.  Returns how many lines were passed to method.

    The keyword parameters are described in the docstring of
    read_and_call.

    """
    count = 0
    current = safe_readline(uhandle)
    while _fails_conditions(current, **keywds):
        method(current)
        count += 1
        current = safe_readline(uhandle)
    # The matching line is left for whoever reads next.
    uhandle.saveline(current)
    return count
331 332
def attempt_read_and_call(uhandle, method, **keywds):
    """attempt_read_and_call(uhandle, method, **keywds) -> boolean

    Like read_and_call, except that a line failing the checks does not
    raise: the line is pushed back onto uhandle and False is returned.
    When the line passes it is handed to method and True is returned.
    The line itself is still fetched with safe_readline.

    The keyword parameters are described in the docstring of
    read_and_call.

    """
    current = safe_readline(uhandle)
    if _fails_conditions(current, **keywds):
        uhandle.saveline(current)
        return False
    method(current)
    return True
351 352
353 -def _fails_conditions(line, start=None, end=None, contains=None, blank=None, 354 has_re=None):
355 if start is not None: 356 if line[:len(start)] != start: 357 return "Line does not start with '%s':\n%s" % (start, line) 358 if end is not None: 359 if line.rstrip()[-len(end):] != end: 360 return "Line does not end with '%s':\n%s" % (end, line) 361 if contains is not None: 362 if contains not in line: 363 return "Line does not contain '%s':\n%s" % (contains, line) 364 if blank is not None: 365 if blank: 366 if not is_blank_line(line): 367 return "Expected blank line, but got:\n%s" % line 368 else: 369 if is_blank_line(line): 370 return "Expected non-blank line, but got a blank one" 371 if has_re is not None: 372 if has_re.search(line) is None: 373 return "Line does not match regex '%s':\n%s" % ( 374 has_re.pattern, line) 375 return None
376 377
def is_blank_line(line, allow_spaces=0):
    """is_blank_line(line, allow_spaces=0) -> boolean

    Tell whether line is blank.  An empty line always counts as blank.
    With a true allow_spaces, a line made up only of whitespace and
    end-of-line characters counts as blank too; otherwise the line has
    to begin with an end-of-line character.

    """
    if line:
        if allow_spaces:
            return line.rstrip() == ''
        return line[0] in ('\n', '\r')
    return 1
392 393
def safe_readline(handle):
    """safe_readline(handle) -> line

    Return the next line read from an UndoHandle.  A ValueError is
    raised when the handle has no more lines to give.

    """
    next_line = handle.readline()
    if next_line:
        return next_line
    raise ValueError("Unexpected end of stream.")
405 406
def safe_peekline(handle):
    """safe_peekline(handle) -> line

    Return the result of peeking at the next line of an UndoHandle.
    A ValueError is raised when there is no line left to peek at.

    """
    upcoming = handle.peekline()
    if upcoming:
        return upcoming
    raise ValueError("Unexpected end of stream.")
418