Package Bio :: Module ParserSupport
[hide private]
[frames | no frames]

Source Code for Module Bio.ParserSupport

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Code to support writing parsers. 
  7   
  8   
  9   
 10  Classes: 
 11  AbstractParser         Base class for parsers. 
 12  AbstractConsumer       Base class of all Consumers. 
 13  TaggingConsumer        Consumer that tags output with its event.  For debugging 
 14  SGMLStrippingConsumer  Consumer that strips SGML tags from output. 
 15  EventGenerator         Generate Biopython Events from Martel XML output 
 16   
 17  Functions: 
 18  safe_readline          Read a line from a handle, with check for EOF. 
 19  safe_peekline          Peek at next line, with check for EOF. 
 20  read_and_call          Read a line from a handle and pass it to a method. 
 21  read_and_call_while    Read many lines, as long as a condition is met. 
 22  read_and_call_until    Read many lines, until a condition is met. 
 23  attempt_read_and_call  Like read_and_call, but forgiving of errors. 
 24  is_blank_line          Test whether a line is blank. 
 25   
 26  """ 
 27   
 28  import sys 
 29  import string 
 30  import traceback 
 31  from types import * 
 32   
 33  from Bio import File 
 34   
 35  # XML from python 2.0 
 36  try: 
 37      from xml.sax import handler 
 38      xml_support = 1 
 39  except ImportError: 
 40      sys.stderr.write("Warning: Could not import SAX for dealing with XML.\n" + 
 41                       "This causes problems with some ParserSupport modules\n") 
 42      xml_support = 0 
 43   
class AbstractParser:
    """Base class for other parsers.

    Derived classes implement parse(); parse_str() and parse_file() are
    convenience wrappers that build a handle and delegate to parse().

    """
    def parse(self, handle):
        """Parse the data available from handle.

        This base implementation always raises NotImplementedError;
        derived classes must override it.

        """
        # The call form of raise is valid on both Python 2 and Python 3,
        # unlike the old "raise Class, message" statement form.
        raise NotImplementedError("Please implement in a derived class")

    def parse_str(self, string):
        """Parse the data held in a string.

        The string is wrapped in a File.StringHandle, which is passed to
        parse().  Returns whatever parse() returns.

        """
        return self.parse(File.StringHandle(string))

    def parse_file(self, filename):
        """Open the named file and parse its contents.

        Returns whatever parse() returns.  The file is closed before
        returning, even if parse() raises an exception.

        """
        h = open(filename)
        try:
            return self.parse(h)
        finally:
            h.close()
61
class AbstractConsumer:
    """Root class for Consumers.

    Subclass this and define a method for each event you want to
    receive.  Any event that has no matching method is accepted and
    silently ignored.

    """
    def _unhandled_section(self):
        # Default no-op for section events (start_* / end_*), which are
        # called without data.
        pass

    def _unhandled(self, data):
        # Default no-op for every other event; the data is discarded.
        pass

    def __getattr__(self, attr):
        # Only reached when normal attribute lookup fails, i.e. when the
        # subclass did not define a handler for this event name.
        is_section_event = attr.startswith('start_') or attr.startswith('end_')
        if is_section_event:
            return self._unhandled_section
        return self._unhandled
79
class TaggingConsumer(AbstractConsumer):
    """A Consumer that tags the data stream with the event and
    prints it to a handle.  Useful for debugging.

    """
    def __init__(self, handle=None, colwidth=15, maxwidth=80):
        """TaggingConsumer(handle=sys.stdout, colwidth=15, maxwidth=80)

        handle is the object whose write() method receives the output,
        colwidth is the width of the event-name column, and maxwidth is
        the line width used to decide how much of the data to show.

        """
        # I can't assign sys.stdout to handle in the argument list.
        # If I do that, handle will be assigned the value of sys.stdout
        # the first time this function is called.  This will fail if
        # the user has assigned sys.stdout to some other file, which may
        # be closed or invalid at a later time.
        if handle is None:
            handle = sys.stdout
        self._handle = handle
        self._colwidth = colwidth
        self._maxwidth = maxwidth

    def unhandled_section(self):
        """Print the 'unhandled_section' event as a section marker."""
        self._print_name('unhandled_section')

    def unhandled(self, data):
        """Print the 'unhandled' event together with its data."""
        self._print_name('unhandled', data)

    def _print_name(self, name, data=None):
        """Write one line for the event called name to the handle.

        Without data, a section marker (colwidth asterisks followed by
        the name) is written.  With data, the name is truncated and
        padded to colwidth, followed by the data truncated to
        maxwidth - colwidth - 2 characters with trailing whitespace
        removed.

        """
        if data is None:
            # Write the name of a section.
            self._handle.write("%s %s\n" % ("*"*self._colwidth, name))
        else:
            # Write the tag and line.  The str method replaces the
            # deprecated string.rstrip() module function.
            shown = data[:self._maxwidth-self._colwidth-2].rstrip()
            self._handle.write("%-*s: %s\n" % (
                self._colwidth, name[:self._colwidth], shown))

    def __getattr__(self, attr):
        # Any event without an explicit method is printed under its own
        # name.  The event name and the instance are bound through
        # default arguments of the lambda.
        if attr[:6] == 'start_' or attr[:4] == 'end_':
            # Section events are called without data.
            method = lambda a=attr, s=self: s._print_name(a)
        else:
            # Every other event is called with one data argument.
            method = lambda x, a=attr, s=self: s._print_name(a, x)
        return method
120
class SGMLStrippingConsumer:
    """A consumer that strips off SGML tags.

    This is meant to be used as a decorator for other consumers: wrap
    an existing consumer and use the wrapper in its place.  Section
    events (start_* / end_*) and non-method attributes are passed
    through untouched; the data of every other event is run through a
    File.SGMLStripper before it reaches the wrapped consumer.

    """
    def __init__(self, consumer):
        """Wrap consumer, which must be a class instance.

        Raises ValueError if consumer is not of type InstanceType.

        """
        if type(consumer) is not InstanceType:
            raise ValueError("consumer should be an instance")
        self._consumer = consumer
        # The most recently looked-up info-event handler of the wrapped
        # consumer; used by _apply_clean_data.
        self._prev_attr = None
        self._stripper = File.SGMLStripper()

    def _apply_clean_data(self, data):
        """Strip SGML from data and pass it to the last looked-up handler."""
        clean = self._stripper.strip(data)
        self._prev_attr(clean)

    def __getattr__(self, name):
        """Return the wrapped consumer's attribute, adding SGML stripping
        to info-event handlers.

        """
        # __getattr__ only runs after normal lookup has already failed,
        # so if one of our own attributes ends up here it really is
        # missing.  Asking getattr(self, ...) again (or touching
        # self._consumer below) would recurse without end.
        if name in ('_consumer', '_prev_attr', '_stripper'):
            raise AttributeError(name)
        attr = getattr(self._consumer, name)
        # If this is not a method, then return it as is.
        if type(attr) is not MethodType:
            return attr
        # If it's a section method, then return it.
        if name[:6] == 'start_' or name[:4] == 'end_':
            return attr
        # Otherwise, it's an info event.  Remember it for
        # _apply_clean_data, but hand back a wrapper bound to this
        # particular handler, so that a handler fetched earlier is not
        # redirected to whichever handler was looked up last.
        self._prev_attr = attr
        stripper = self._stripper

        def _strip_and_forward(data):
            attr(stripper.strip(data))
        return _strip_and_forward
# only use the Event Generator if XML handling is okay
if xml_support:
    class EventGenerator(handler.ContentHandler):
        """Handler to generate events associated with a Martel parsed file.

        This acts like a normal SAX handler, and accepts XML generated by
        Martel during parsing. These events are then converted into
        'Biopython events', which can then be caught by a standard
        biopython consumer
        """
        def __init__(self, consumer, interest_tags, callback_finalizer=None,
                     exempt_tags=None):
            """Initialize to begin catching and firing off events.

            Arguments:
            o consumer - The consumer that we'll send Biopython events to.

            o interest_tags - A listing of all the tags we are interested in.

            o callback_finalizer - A function to deal with the collected
            information before passing it on to the consumer. By default
            the collected information is a list of all of the lines read
            for a particular tag -- if there are multiple tags in a row
            like:

            <some_info>Spam</some_info>
            <some_info>More Spam</some_info>

            In this case the list of information would be:

            ['Spam', 'More Spam']

            This list of lines will be passed to the callback finalizer if
            it is present. Otherwise the consumer will be called with the
            list of content information.

            o exempt_tags - A listing of particular tags that are exempt from
            being processed by the callback_finalizer. This allows you to
            use a finalizer to deal with most tags, but leave those you don't
            want touched. Defaults to no exempt tags.
            """
            handler.ContentHandler.__init__(self)

            # A fresh list per instance, so the default is never shared
            # between generators.
            if exempt_tags is None:
                exempt_tags = []

            self._consumer = consumer
            self.interest_tags = interest_tags
            self._finalizer = callback_finalizer
            self._exempt_tags = exempt_tags

            # a dictionary of content for each tag of interest
            # the information for each tag is held as a list of the lines.
            # This allows us to collect information from multiple tags
            # in a row, and return it all at once.
            self.info = {}
            for tag in self.interest_tags:
                self.info[tag] = []

            # the previous tag we were collecting information for.
            # We set a delay in sending info to the consumer so that we can
            # collect a bunch of tags in a row and append all of the info
            # together.
            self._previous_tag = ''

            # the current character information for a tag
            self._cur_content = []
            # whether we should be collecting information
            self._collect_characters = 0

        def startElement(self, name, attrs):
            """Determine if we should collect characters from this tag.

            The attributes of the tag are not used.
            """
            if name in self.interest_tags:
                self._collect_characters = 1

        def characters(self, content):
            """Extract the information if we are interested in it.
            """
            if self._collect_characters:
                self._cur_content.append(content)

        def endElement(self, name):
            """Send the information to the consumer.

            Once we've got the end element we've collected up all of the
            character information we need, and we need to send this on to
            the consumer to do something with it.

            We have a delay of one tag on doing this, so that we can collect
            all of the info from multiple calls to the same element at once.
            """
            # only deal with the tag if it is something we are
            # interested in and potentially have information for.
            # A tag we are not interested in that closes while we are
            # collecting has no slot in self.info, so it is skipped and
            # collection carries on until the interesting tag closes.
            if self._collect_characters and name in self.interest_tags:
                # add all of the information collected inside this tag
                self.info[name].append("".join(self._cur_content))
                # reset our information and flags
                self._cur_content = []
                self._collect_characters = 0

                # if we are at a new tag, pass on the info from the last tag
                if self._previous_tag and self._previous_tag != name:
                    self._make_callback(self._previous_tag)

                # set this tag as the next to be passed
                self._previous_tag = name

        def _make_callback(self, name):
            """Call the callback function with the info with the given name.
            """
            # the consumer method has the same name as the tag
            callback_function = getattr(self._consumer, name)

            # --- pass back the information
            # if there is a finalizer, use that
            if self._finalizer is not None and name not in self._exempt_tags:
                info_to_pass = self._finalizer(self.info[name])
            # otherwise pass back the entire list of information
            else:
                info_to_pass = self.info[name]

            callback_function(info_to_pass)

            # reset the information for the tag
            self.info[name] = []

        def endDocument(self):
            """Make sure all of our information has been passed.

            This just flushes out any stored tags that need to be passed.
            """
            if self._previous_tag:
                self._make_callback(self._previous_tag)
                # Nothing is pending any more; without this reset a
                # second document (or a second endDocument call) would
                # fire another callback for this tag with an empty list.
                self._previous_tag = ''
281
def read_and_call(uhandle, method, **keywds):
    """read_and_call(uhandle, method[, start][, end][, contains][, blank][, has_re])

    Read a line from uhandle, check it, and pass it to the method.
    Raises a ValueError if the line does not pass the checks, or if
    there are no more lines to read.

    start, end, contains, blank, and has_re specify optional conditions
    that the line must pass.  start and end specifies what the line must
    begin or end with (trailing whitespace is ignored for end).
    contains specifies a substring that must be found in the line.  If
    blank is a true value, then the line must be blank; if it is a false
    value other than None, the line must not be blank.  has_re should be
    a regular expression object with a pattern that the line must match
    somewhere.

    """
    line = safe_readline(uhandle)
    errmsg = _fails_conditions(line, **keywds)
    if errmsg is not None:
        # The call form of raise is valid on both Python 2 and Python 3.
        raise ValueError(errmsg)
    method(line)
302
def read_and_call_while(uhandle, method, **keywds):
    """read_and_call_while(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Keep reading lines from uhandle and handing them to the method for
    as long as they satisfy the conditions.  The first line that does
    not satisfy them is pushed back onto uhandle with saveline().
    Returns the number of lines that were passed to the method.

    See the docstring for read_and_call for a description of the parameters.

    """
    count = 0
    line = safe_readline(uhandle)
    while not _fails_conditions(line, **keywds):
        method(line)
        count += 1
        line = safe_readline(uhandle)
    # This line failed the conditions; leave it for the next reader.
    uhandle.saveline(line)
    return count
322
def read_and_call_until(uhandle, method, **keywds):
    """read_and_call_until(uhandle, method,
    start=None, end=None, contains=None, blank=None) -> number of lines

    Keep reading lines from uhandle and handing them to the method
    until a line satisfies the conditions.  That line is pushed back
    onto uhandle with saveline() instead of being passed on.  Returns
    the number of lines that were passed to the method.

    See the docstring for read_and_call for a description of the parameters.

    """
    count = 0
    line = safe_readline(uhandle)
    while _fails_conditions(line, **keywds):
        method(line)
        count += 1
        line = safe_readline(uhandle)
    # This line met the conditions; leave it for the next reader.
    uhandle.saveline(line)
    return count
343
def attempt_read_and_call(uhandle, method, **keywds):
    """attempt_read_and_call(uhandle, method, **keywds) -> boolean

    Like read_and_call, except that a line failing the checks does not
    raise an exception: it is pushed back onto uhandle with saveline()
    and a false value is returned.  A line passing the checks is handed
    to the method and a true value is returned.

    See docs for read_and_call for a description of the function
    arguments.

    """
    line = safe_readline(uhandle)
    failure = _fails_conditions(line, **keywds)
    if failure:
        uhandle.saveline(line)
    else:
        method(line)
    return not failure
362
def _fails_conditions(line, start=None, end=None, contains=None, blank=None,
                      has_re=None):
    """_fails_conditions(line[, start][, end][, contains][, blank][, has_re]) -> message or None

    Check line against every condition that is not None and return a
    message describing the first one that fails, or None if they all
    pass.

    start     The line must begin with this string.
    end       The line, with trailing whitespace removed, must end with
              this string.
    contains  The line must contain this substring.
    blank     If true, the line must be blank; if false (but not None),
              the line must not be blank.
    has_re    Regular expression object that must match somewhere in
              the line.

    """
    if start is not None:
        if not line.startswith(start):
            return "Line does not start with '%s':\n%s" % (start, line)
    if end is not None:
        # endswith() also copes with an empty end string, which the
        # slice line[-len(end):] does not (line[-0:] is the whole line).
        if not line.rstrip().endswith(end):
            return "Line does not end with '%s':\n%s" % (end, line)
    if contains is not None:
        if contains not in line:
            return "Line does not contain '%s':\n%s" % (contains, line)
    if blank is not None:
        if blank:
            if not is_blank_line(line):
                return "Expected blank line, but got:\n%s" % line
        else:
            if is_blank_line(line):
                return "Expected non-blank line, but got a blank one"
    if has_re is not None:
        if has_re.search(line) is None:
            return "Line does not match regex '%s':\n%s" % (
                has_re.pattern, line)
    return None
386
def is_blank_line(line, allow_spaces=0):
    """is_blank_line(line, allow_spaces=0) -> boolean

    Return whether a line is blank.  allow_spaces specifies whether to
    allow whitespaces in a blank line.  A true value signifies that a
    line containing whitespaces as well as end-of-line characters
    should be considered blank.

    An empty string is always considered blank.  When allow_spaces is
    false, a line is blank only if its first character is an
    end-of-line character.

    """
    if not line:
        return True
    if allow_spaces:
        # str method instead of the deprecated string.rstrip() function.
        return line.rstrip() == ''
    return line[0] in ('\n', '\r')
401
def safe_readline(handle):
    """safe_readline(handle) -> line

    Read a line from an UndoHandle and return it.  If there are no more
    lines to read, I will raise a ValueError.

    """
    line = handle.readline()
    if not line:
        # The call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Unexpected end of stream.")
    return line
413
def safe_peekline(handle):
    """safe_peekline(handle) -> line

    Peek at the next line in an UndoHandle and return it.  If there are no
    more lines to peek, I will raise a ValueError.

    """
    line = handle.peekline()
    if not line:
        # The call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Unexpected end of stream.")
    return line
425