Package Martel :: Module Iterator
[hide private]
[frames] | no frames]

Source Code for Module Martel.Iterator

  1  # Copyright 2000-2001, Dalke Scientific Software, LLC 
  2  # Distributed under the Biopython License Agreement (see the LICENSE file). 
  3   
  4  """Iterate over records of an XML parse tree.
  5   
  6  The standard parser is callback based over all the elements of a file. 
  7  If the file contains records, many people would like to be able to 
  8  iterate over each record and only use the callback parser to analyze 
  9  the record. 
 10   
 11  If the expression is a 'ParseRecords', then the code to do this is 
 12  easy; use its make_reader to grab records and its record_expression to 
 13  parse them.  However, this isn't general enough.  The use of a 
 14  ParseRecords in the format definition should be strictly an
 15  implementation decision for better memory use.  So there needs to be 
 16  an API which allows both full and record oriented parsers. 
 17   
 18  Here's an example use of the API: 
 19  >>> import sys 
 20  >>> import swissprot38  # one is in Martel/test/testformats 
 21  >>> from xml.dom import pulldom 
 22  >>> iterator = swissprot38.format.make_iterator("swissprot38_record") 
 23  >>> text = open("sample.swissprot").read() 
 24  >>> for record in iterator.iterateString(text, pulldom.SAX2DOM()): 
 25  ...     print "Read a record with the following AC numbers:"
 26  ...     for acc in record.document.getElementsByTagName("ac_number"): 
 27  ...         acc.writexml(sys.stdout) 
 28  ...         sys.stdout.write("\n") 
 29  ... 
 30   
 31   
 32  There are two parts to the API.  One is the EventStream.  This
 33  contains a single method called "next()" which returns a list of SAX
 34  events, each a 2-tuple (event_name, args).  It is called multiple
 35  times to return successive event lists and returns None if no events
 36  are available.
 37
 38  The other is the Iterator.  Its iterateString, iterateFile and iterate
 39  methods each return an object whose "next()" method returns the
 40  content handler for the next record, or None when no records remain.
 41   
 42  Sean McGrath has a RAX parser (Record API for XML) which uses a 
 43  concept similar to this. 
 44  """ 
 45   
 46   
 47  import sys, urllib, traceback 
 48  from xml.sax import saxutils 
 49  import Parser 
 50  try: 
 51      from cStringIO import StringIO 
 52  except ImportError: 
 53      from StringIO import StringIO 
 54   
class StoreEvents:
    """Handler that records parser callbacks instead of acting on them.

    Every recorded callback is appended to ``events`` as a 2-tuple
    ``(event_name, payload)``.  For "startElement", "endElement",
    "error" and "fatalError" the payload is the tuple of positional
    arguments the callback received.  For "characters" the payload is
    the text itself, NOT a tuple.  ``startDocument`` and
    ``endDocument`` are accepted but not recorded.

    ``has_error`` is 0 until ``error`` or ``fatalError`` is called,
    after which it is 1.
    """

    def __init__(self):
        self.has_error = 0
        self.events = []

        # 'characters' is installed per instance with the list's bound
        # append captured as a default argument (presumably to keep the
        # most frequent callback cheap).
        # Note: this doesn't store the args as a tuple!
        def characters(ch, append=self.events.append):
            append(("characters", ch))
        self.characters = characters

    def _record(self, name, args):
        # Append one (event_name, args) pair to the event list.
        self.events.append((name, args))

    def startDocument(self):
        pass

    def endDocument(self):
        pass

    def startElement(self, *args):
        self._record("startElement", args)

    def endElement(self, *args):
        self._record("endElement", args)

    def error(self, *args):
        self.has_error = 1
        self._record("error", args)

    def fatalError(self, *args):
        self.has_error = 1
        self._record("fatalError", args)
81
class EventStream:
    """Event stream that hands out one pre-built event list exactly once."""

    def __init__(self, event_list):
        self.events = event_list

    def next(self):
        """Return the stored event list on the first call, then None.

        An empty (or already consumed) list is reported as None.
        """
        pending = self.events
        if not pending:
            return None
        # Drop our reference so later calls report exhaustion.
        self.events = None
        return pending
91
class Iterator:
    """Create record iterators from a parser that handles the whole input.

    The complete input is parsed up front, all resulting events are kept
    in memory, and an ``Iterate`` object then walks them record by
    record, where a record is an element named ``tag``.
    """

    def __init__(self, parser, tag):
        self.parser = parser
        self.tag = tag

    def iterateString(self, s, cont_handler = None):
        """create an iterator over a string"""
        store = StoreEvents()
        parser = self.parser
        # The same object collects both content and error callbacks.
        parser.setContentHandler(store)
        parser.setErrorHandler(store)
        parser.parseString(s)
        stream = EventStream(store.events)
        return Iterate(self, stream, self.tag, cont_handler)

    def iterateFile(self, fileobj, cont_handler = None):
        """create an iterator over the full contents of a file object"""
        text = fileobj.read()
        return self.iterateString(text, cont_handler)

    def iterate(self, source, cont_handler = None):
        """parse using the URL or file handle"""
        source = saxutils.prepare_input_source(source)
        # Prefer the character stream; fall back to the byte stream.
        stream = source.getCharacterStream()
        if not stream:
            stream = source.getByteStream()
        return self.iterateFile(stream, cont_handler)
114
class RecordEventStream:
    """Event stream that parses one record of text per call.

    ``reader.next()`` supplies the text of the next record (None when
    there are no more) and ``parser`` turns that text into events.
    """

    def __init__(self, reader, parser):
        self.reader = reader
        self.parser = parser

    def next(self):
        """Return the event list for the next record, or None at the end."""
        record_text = self.reader.next()
        if record_text is None:
            return None
        store = StoreEvents()
        parser = self.parser
        # The same object collects both content and error callbacks.
        parser.setContentHandler(store)
        parser.setErrorHandler(store)
        parser.parseString(record_text)
        return store.events
128
class IteratorRecords:
    """Create record iterators that read and parse one record at a time.

    ``make_reader(fileobj, *reader_args)`` builds the record reader and
    ``record_parser`` parses the text of each record it yields.
    ``marker_tag`` names the element that delimits a record in the
    resulting events.
    """

    def __init__(self, record_parser, make_reader, reader_args, marker_tag):
        self.marker_tag = marker_tag
        self.reader_args = reader_args
        self.make_reader = make_reader
        self.record_parser = record_parser

    def copy(self):
        # This is (so far) treated as an immutable object, so the same
        # instance is handed back.
        # NOTE(review): Iterate stores start_position/end_position on
        # its parent, so "copies" share those attributes.
        return self

    def iterateString(self, s, cont_handler = None):
        """create an iterator over a string"""
        return self.iterateFile(StringIO(s), cont_handler)

    def iterateFile(self, fileobj, cont_handler = None):
        """create an iterator over the records of a file object"""
        reader_args = (fileobj,) + self.reader_args
        record_reader = self.make_reader(*reader_args)
        stream = RecordEventStream(record_reader, self.record_parser)
        return Iterate(self, stream, self.marker_tag, cont_handler)

    def iterate(self, source, cont_handler = None):
        """parse using the URL or file handle"""
        source = saxutils.prepare_input_source(source)
        # Prefer the character stream; fall back to the byte stream.
        stream = source.getCharacterStream()
        if not stream:
            stream = source.getByteStream()
        return self.iterateFile(stream, cont_handler)
154
155 -def _get_next_text(reader):
156 try: 157 return reader.next(), None 158 except (KeyboardInterrupt, SystemExit): 159 raise 160 except: 161 # Something unusual happened (couldn't find a record?) 162 # so call it a fatal error and stop 163 outfile = StringIO() 164 traceback.print_exc(file=outfile) 165 exc = Parser.ParserRecordException( 166 outfile.getvalue(), sys.exc_info()[1]) 167 events = [ ("fatalError", (exc,)) ] 168 return None, events
169 170
class HeaderFooterEventStream:
    """Event stream for input laid out as header, records, then footer.

    Each call to ``next()`` returns the event list for the next piece of
    the input, or None when the input is exhausted.  Internally the
    stream moves through the states "header" -> "record" -> "footer" ->
    "end".

    For each section a reader is built with
    ``make_X_reader(fileobj, *X_args, lookahead=...)``.  A reader must
    provide ``next()`` (text of the next chunk, or None) and
    ``remainder()`` (a ``(fileobj, lookahead)`` pair describing the
    unread input).  A header or footer parser of None means that section
    is absent.
    """

    def __init__(self, fileobj,
                 header_parser, make_header_reader, header_args,
                 record_parser, make_record_reader, record_args,
                 footer_parser, make_footer_reader, footer_args):
        self.fileobj = fileobj

        self.header_parser = header_parser
        self.make_header_reader = make_header_reader
        self.header_args = header_args

        self.record_parser = record_parser
        self.make_record_reader = make_record_reader
        self.record_args = record_args

        self.footer_parser = footer_parser
        self.make_footer_reader = make_footer_reader
        self.footer_args = footer_args

        self._state = "header"
        self._reader = None      # active record reader, if any
        self._lookahead = ""     # text read from fileobj but not yet consumed

    def next(self):
        """Return the next list of events, or None when nothing is left.

        If unconsumed text remains once every section has been tried, a
        single "fatalError" event carrying
        ``Parser.ParserIncompleteException(0)`` is returned instead.
        """
        if self._state == "header":
            x = self._header_next()
            self._state = "record"
            if x is not None:
                return x

        if self._state == "record":
            x = self._record_next()
            if x is not None:
                return x
            self._state = "footer"

        if self._state == "footer":
            x = self._footer_next()
            self._state = "end"
            if x is not None:
                return x

        if self._state == "end":
            if self._lookahead:
                return [ ("fatalError",
                          (Parser.ParserIncompleteException(0),)) ]
            return None

        raise AssertionError("Should not get here")

    def _parse(self, parser, text):
        # Run `parser` over `text` and return the StoreEvents collector.
        events = StoreEvents()
        parser.setContentHandler(events)
        parser.setErrorHandler(events)
        parser.parseString(text)
        return events

    def _read_section(self, parser, make_reader, reader_args):
        # Read and parse a one-shot section (header or footer).  Returns
        # None if the section has no parser, the error events if its
        # text could not be read, else the parsed events.
        assert self._reader is None
        if parser is None:
            return None
        reader = make_reader(
            *(self.fileobj,) + reader_args,
            **{"lookahead": self._lookahead})
        text, errors = _get_next_text(reader)
        # Whatever the reader did not consume goes to the next section.
        self.fileobj, self._lookahead = reader.remainder()
        if text is None:
            return errors
        return self._parse(parser, text).events

    def _header_next(self):
        """Return the header's events, or None if there is no header."""
        return self._read_section(self.header_parser,
                                  self.make_header_reader,
                                  self.header_args)

    def _footer_next(self):
        """Return the footer's events, or None if there is no footer.

        NOTE(review): next() calls this method but its definition was
        missing from the class; it is reconstructed here by symmetry
        with _header_next -- confirm against the upstream source.
        """
        return self._read_section(self.footer_parser,
                                  self.make_footer_reader,
                                  self.footer_args)

    def _record_next(self):
        """Return the next record's events, or None when records end."""
        if self._reader is None:
            assert self.record_parser is not None
            reader = self.make_record_reader(
                *(self.fileobj,) + self.record_args,
                **{"lookahead": self._lookahead})
            # The reader now owns the lookahead text.
            self._lookahead = None
            self._reader = reader
        else:
            reader = self._reader
        text, errors = _get_next_text(reader)
        if text is None:
            # No more records (errors is None) or the read failed.
            self.fileobj, self._lookahead = reader.remainder()
            self._reader = None
            return errors

        events = self._parse(self.record_parser, text)

        if events.has_error:
            # Couldn't parse the record.
            if self.footer_parser is not None:
                # perhaps there's a footer here?
                # We'll need to try reading that, so push the text back
                # in front of whatever the reader left unread.
                self.fileobj, self._lookahead = reader.remainder()
                self._lookahead = text + self._lookahead
                self._reader = None
                return None
            # If no footer is possible, go on and pass
            # back the error as normal

        return events.events
272
289 290 291
class IteratorHeaderFooter:
    """Create record iterators for input with a header and/or footer.

    The nine parser/reader/argument values are kept, in order, in
    ``args`` and passed after the file object to
    ``HeaderFooterEventStream``.  ``marker_tag`` names the element that
    delimits a record.
    """

    def __init__(self,
                 header_parser, make_header_reader, header_args,
                 record_parser, make_record_reader, record_args,
                 footer_parser, make_footer_reader, footer_args,
                 marker_tag):
        self.marker_tag = marker_tag
        self.args = (
            header_parser, make_header_reader, header_args,
            record_parser, make_record_reader, record_args,
            footer_parser, make_footer_reader, footer_args,
        )

    def iterateString(self, s, cont_handler = None):
        """create an iterator over a string"""
        return self.iterateFile(StringIO(s), cont_handler)

    def iterateFile(self, fileobj, cont_handler = None):
        """create an iterator over the records of a file object"""
        stream = HeaderFooterEventStream(fileobj, *self.args)
        return Iterate(self, stream, self.marker_tag, cont_handler)

    def iterate(self, source, cont_handler = None):
        """parse using the URL or file handle"""
        source = saxutils.prepare_input_source(source)
        # Prefer the character stream; fall back to the byte stream.
        stream = source.getCharacterStream()
        if not stream:
            stream = source.getByteStream()
        return self.iterateFile(stream, cont_handler)
317 318
class Iterate:
    """Forward-only iterator over the records found in an event stream.

    ``event_stream.next()`` must return a list of ``(name, args)``
    events or None when exhausted.  A record is the run of events from
    a "startElement" whose first argument equals ``tag`` up to the
    matching "endElement".  Each record is replayed into
    ``cont_handler`` bracketed by startDocument()/endDocument().

    ``current_position`` counts the characters seen so far.  The offsets
    of the most recent record are stored on ``parent`` as
    ``start_position`` and ``end_position``.
    """

    def __init__(self, parent, event_stream, tag, cont_handler = None):
        self.parent = parent
        if cont_handler is None:
            # Imported here so the module is only needed for the default.
            import LAX
            cont_handler = LAX.LAX()
        self.event_stream = event_stream
        self.events = None       # events fetched but not yet consumed
        self.tag = tag
        self.cont_handler = cont_handler
        self._n = 0              # number of records returned so far
        self.parent.start_position = 0
        self.parent.end_position = 0
        self.current_position = 0

    def _rebase(self, exc, offset):
        # Make a position-carrying parser exception report `offset`, and
        # return the exception to raise.  Other exceptions are returned
        # untouched.  Relies on ParserPositionException supporting
        # in-place addition of an offset (defined in Parser).
        if isinstance(exc, Parser.ParserPositionException):
            exc.pos = 0
            exc += offset
        return exc

    def next(self):
        """Return the content handler loaded with the next record.

        Returns None when the stream holds no further records.  Raises
        the exception carried by any "error" or "fatalError" event that
        is encountered.
        """
        events = self.events
        if not events:
            events = self.event_stream.next()
            if events is None:
                return None
            self.events = events

        i = 0
        n = len(events)
        # Look for the start of the next record
        while 1:
            if i == n:
                new_events = self.event_stream.next()
                if new_events is None:
                    break
                events.extend(new_events)
                n = len(events)
                if i == n:
                    # The stream produced an empty batch; ask again
                    # instead of indexing past the end of the list.
                    continue

            name, args = events[i]
            if name == "error" or name == "fatalError":
                # at this level the error is unrecoverable
                self.events = None
                raise self._rebase(args[0], self.current_position)

            if name == "startElement" and args[0] == self.tag:
                return self._emit_record(events, i)

            if name == "characters":
                # Recall, args is the text itself for this event
                self.current_position += len(args)
            i = i + 1

        # Went through the document and no more records were found
        self.events = None
        return None

    def _emit_record(self, events, i):
        # Replay the record whose startElement is at events[i] into the
        # content handler and return the handler.  Consumed events are
        # removed from self.events (the same list object as `events`).
        n = len(events)
        self.parent.start_position = self.current_position
        cont_handler = self.cont_handler
        cont_handler.startDocument()
        while i < n:
            name, args = events[i]
            if name == "characters":
                # This is the most common case.
                # Recall, args is not a tuple
                cont_handler.characters(args)
                self.current_position += len(args)
            elif name == "error":
                # in theory this is recoverable, so scan forward
                # until there's an endElement
                exc = args[0]
                for j in range(i, n):
                    name, args = events[j]
                    if name == "endElement" and args[0] == self.tag:
                        # Skip the broken record; later records can
                        # still be read after the caller handles this.
                        del self.events[:j+1]
                        raise self._rebase(exc, self.current_position)
                    if name == "characters":
                        self.current_position += len(args)
                # no end found, so not recoverable
                self.events = None
                raise self._rebase(exc, self.parent.start_position)
            elif name == "fatalError":
                # not recoverable
                self.events = None
                raise self._rebase(args[0], self.parent.start_position)
            else:
                getattr(cont_handler, name)(*args)
                if name == "endElement" and args[0] == self.tag:
                    self.parent.end_position = self.current_position
                    del self.events[:i+1]
                    cont_handler.endDocument()
                    self._n = self._n + 1
                    return cont_handler
            i = i + 1

        # Got here without an endElement?  Not supposed to happen!
        raise AssertionError("no endElement(%s) and no errors?" %
                             repr(self.tag))

    def __getitem__(self, n):
        """Support ``for record in iterator`` via the sequence protocol.

        Only the next index in sequence may be requested; IndexError
        signals the end of the records.
        """
        assert n == self._n, "forward iteration only"
        x = self.next()
        if x is None:
            raise IndexError(n)
        return x

    def __iter__(self):
        # Call next() repeatedly until it returns the None sentinel.
        return iter(self.next, None)
435