1
2
3
4
5
6 import string
7
8 try:
9 from mx import TextTools as TT
10 except ImportError:
11 import TextTools as TT
12
13 import msre_parse
14 import Expression, convert_re
15
16 import Parser
17
18
19 supports_lookahead = hasattr(TT, "LookAhead")
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47 _generate_count = 0
48
def __init__(self, groupref_names, debug_level):
    """Set up the per-generation state.

    groupref_names -- mapping of group names whose matched text is
        wanted later (see _find_wanted_groupref_names)
    debug_level -- debug level consulted by the generate_* functions
    """
    # Starts empty; returned to the caller by generate() as the third
    # element of its result.
    self.lookup = {}
    self.debug_level = debug_level
    self.groupref_names = groupref_names
54
61
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
90
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Alt) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_alt(expression, genstate):
    """Build the tag table for an alternation (a|b|...).

    Each alternative is compiled on its own and wrapped in a Table entry.
    The fourth field (+1) moves on to the next alternative; the fifth
    field (n - i + 1) points one entry past the trailing Fail entry.
    Presumably these are mxTextTools' failure/success jump offsets.
    """
    tables = [_generate(expr, genstate) for expr in expression.expressions]
    n = len(tables)

    result = [(">ignore", TT.Table, tuple(table), +1, n - i + 1)
              for i, table in enumerate(tables)]

    # Reached only when every alternative has been tried without success.
    result.append((None, TT.Fail, TT.Here))
    return result
107
108
109
110
111
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Seq) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_seq(expression, genstate):
    """Build the tag table for a sequence by concatenating the sub-tables.

    At debug level 1 a track_position call is added after every
    sub-expression; at debug level 2 a print_info call is added as well.
    Any other non-zero debug level produces an empty table, as before.

    Fix: the print_info entry used to be appended to the sub-table after
    it had already been copied into the result, so it never reached the
    returned table. It is now appended to the result itself.
    """
    level = genstate.debug_level
    result = []
    if level not in (0, 1, 2):
        # Preserved behaviour: unknown levels generate nothing here.
        return result

    for exp in expression.expressions:
        result.extend(_generate(exp, genstate))
        if level >= 1:
            result.append((None, TT.Call, track_position, +1, +1))
        if level == 2:
            result.append((None, TT.Call, print_info(exp), +1, +1))

    return result
131
132
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Literal) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_literal(expression, genstate):
    """Build the tag table for a single literal character."""
    if not expression.invert:
        return [(None, TT.Is, expression.char)]

    # Inverted literal: match against the set produced by
    # convert_re.invert (presumably every character except this one).
    allowed = convert_re.invert(expression.char)
    return [(None, TT.IsIn, allowed)]
140
141
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Str) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_str(expression, genstate):
    """Build the tag table for a fixed string: one TT.Word entry."""
    word_entry = (None, TT.Word, expression.string)
    return [word_entry]
144
145
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Any) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_any(expression, genstate):
    """Build the tag table for a character class ([abc] / [^abc])."""
    chars = expression.chars
    if expression.invert:
        # Negated class: let convert_re.invert compute the set to accept.
        chars = convert_re.invert(chars)
    return [(None, TT.IsIn, chars)]
153
154
155
159 - def __call__(self, taglist, text, l, r, subtags):
162
163
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Group) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_group(expression, genstate):
    """Build the tag table for a (possibly named) group.

    Unnamed groups contribute only their inner table. Named groups wrap
    the inner table in a Table entry. When the name is listed in
    genstate.groupref_names with value 1, the entry is tagged with a
    SetGroupValue callable and TT.CallTag is added to the command.
    """
    tagtable = _generate(expression.expression, genstate)

    name = expression.name
    if name is None:
        assert not expression.attrs, "unnamed group can't have attrs!"
        return tagtable

    # Decide on the original name, before add_group may replace it.
    is_wanted = genstate.groupref_names.get(name) == 1

    if expression.attrs:
        # Groups with attributes are registered and use the returned tag.
        name = genstate.add_group(expression)

    subtable = tuple(tagtable)
    if is_wanted:
        return [(SetGroupValue(name), TT.Table + TT.CallTag, subtable)]
    return [(name, TT.Table, subtable)]
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# NOTE(review): the class line and the "_get_ranges" def line are missing
# from this listing. The class name comes from the
# HandleRepeatCount(...) call in generate_named_max_repeat and the
# method name from the self._get_ranges() call in call() -- confirm.
class HandleRepeatCount:
    """Match a sub-table a number of times that is only known at parse time.

    min_count / max_count are either numbers or strings; a string is
    used as a key into Parser._match_group and the value found there is
    converted to an integer.
    """

    def __init__(self, tagtable, min_count, max_count):
        self.tagtable = tagtable
        self.min_count = min_count
        self.max_count = max_count

        # Tag list of the most recent successful call(), else None.
        self.taglist = None

    def _resolve_count(self, count):
        """Return count as a number, looking up named counts in Parser."""
        if type(count) == type(""):
            # int() replaces the long-deprecated string.atoi().
            return int(Parser._match_group[count])
        return count

    def _get_ranges(self):
        """Return the (min_count, max_count) pair with names resolved."""
        min_count = self._resolve_count(self.min_count)
        max_count = self._resolve_count(self.max_count)
        return min_count, max_count

    def call(self, text, x, end):
        """Try to match the sub-table exactly min_count times from x.

        Returns pos + 1 on success (the tag table built by
        generate_named_max_repeat follows this with a Skip of -1) and x
        unchanged on failure. The resulting tag list is kept for
        calltag().
        """
        min_count, max_count = self._get_ranges()
        assert min_count == max_count, \
               "cannot have different sizes: %s %s" % (min_count, max_count)

        repeated = self.tagtable * min_count
        result, taglist, pos = TT.tag(text, repeated, x, end)
        if result == 1:
            self.taglist = taglist
            return pos + 1

        self.taglist = None
        return x

    def calltag(self, taglist, text, l, r, subtags):
        """Append the tags saved by call() to the caller's taglist."""
        assert not subtags, repr(subtags)
        # r - 1 undoes the extra +1 that call() added to the end position.
        taglist.append((">ignore", l, r - 1, self.taglist))
243
244
245
246
247
251 - def __call__(self, taglist, text, l, r, subtags):
253
259
# NOTE(review): the def line is missing from this listing; name and
# signature come from the generate_named_max_repeat(expression, genstate)
# call in generate_max_repeat -- confirm.
def generate_named_max_repeat(expression, genstate):
    """Build the tag table for a repeat whose count is a group name.

    Only the form where min_count and max_count are the same string is
    supported; anything else raises NotImplementedError.
    """
    low = expression.min_count
    high = expression.max_count

    both_named = type(low) == type("") and type(high) == type("")
    if not both_named:
        raise NotImplementedError("Cannot mix numeric and named repeat counts")
    if low != high:
        raise NotImplementedError("Only a single named repeat count allowed")

    tagtable = _generate(expression.expression, genstate)
    counter = HandleRepeatCount(tuple(tagtable), low, high)

    # The Skip of -1 compensates for the +1 that HandleRepeatCount.call
    # adds to its return value on success.
    call_entry = (_call_calltag(counter), TT.Call + TT.CallTag,
                  _call_call(counter), TT.MatchFail)
    skip_entry = (None, TT.Skip, -1, TT.MatchFail)
    return [call_entry, skip_entry]
278
279
280
281
282
283
284
285
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.MaxRepeat) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_max_repeat(expression, genstate):
    """Build the tag table for a bounded or unbounded repeat.

    Layout of the result:
      * min_count mandatory copies of the sub-table;
      * if max_count is msre_parse.MAXREPEAT, one more entry whose fifth
        field is 0, i.e. it points back at itself;
      * otherwise (max_count - min_count) optional copies whose fourth
        field skips over the remaining optional copies.

    Named (string) counts are delegated to generate_named_max_repeat.

    Fixes: the named-count test compared type(max_count) with "" (always
    false) instead of type(""), and the assertion message contained a
    stray "%sd".
    """
    expr = expression.expression
    min_count = expression.min_count
    max_count = expression.max_count

    # A string count names a group whose text holds the count.
    if type(min_count) == type("") or type(max_count) == type(""):
        return generate_named_max_repeat(expression, genstate)

    assert 0 <= min_count <= max_count, "bad ranges (%d, %d)" % \
           (min_count, max_count)

    subtable = tuple(_generate(expr, genstate))

    # Mandatory part.
    result = [(None, TT.SubTable, subtable) for _ in range(min_count)]

    if max_count == msre_parse.MAXREPEAT:
        # Unbounded: offset 0 in the fifth field keeps re-running this entry.
        result.append((None, TT.SubTable, subtable, +1, 0))
    else:
        # Optional part: the fourth field counts down from the number of
        # optional copies to 1 (no entries at all when min == max).
        for remaining in range(max_count - min_count, 0, -1):
            result.append((">ignore", TT.Table, subtable, +remaining, +1))

    return result
321
322
325
327 """Print debug information"""
331 print "Martel:", self.msg
332 return x
333
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Debug) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_debug(expression, genstate):
    """Build a one-entry table that calls print_debug(expression.msg).

    Both jump offsets are +1, so the entry always moves on to the next one.
    """
    printer = print_debug(expression.msg)
    return [(None, TT.Call, printer, +1, +1)]
336
337
338
# NOTE(review): the def line is missing from this listing; the name comes
# from its use in generate_at_beginning. The (text, x, end) signature is
# assumed from the other TT.Call callables in this file -- confirm.
def check_at_beginning(text, x, end):
    """Return x if x is at the start of the text or of a line, else x + 1."""
    if x == 0 or text[x - 1] == "\n":
        return x
    return x + 1
345
346
347
348
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.AtBeginning) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_at_beginning(expression, genstate):
    """Build the tag table for the start-of-line anchor.

    check_at_beginning returns its position unchanged when the anchor
    holds, which is why the "continue" offset sits in the fourth field
    and the failure path in the fifth (presumably mxTextTools treats a
    non-advancing Call as a non-match -- confirm).
    """
    if not supports_lookahead:
        # Without LookAhead the Call consumes a character when the anchor
        # does not hold; step back one before failing.
        return [
            (None, TT.Call, check_at_beginning, +2, +1),
            (None, TT.Skip, -1, TT.MatchFail, TT.MatchFail),
        ]

    return [
        (None, TT.Call + TT.LookAhead, check_at_beginning, +1, TT.MatchFail),
    ]
357
358
359
360
361
362
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.AtEnd) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_at_end(expression, genstate):
    """Build the tag table for the end anchor: a single TT.EOF entry."""
    eof_entry = (None, TT.EOF, TT.Here)
    return [eof_entry]
365
366
367
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Dot) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_dot(expression, genstate):
    """Build the tag table for ".": any character in TT.invset('\\n')."""
    not_newline = TT.invset('\n')
    return [(None, TT.IsInSet, not_newline)]
370
371
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.AnyEol) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_eol(expression, genstate):
    """Build the tag table for an end-of-line marker.

    The three entries test for '\\n', then '\\r', then an optional '\\n'
    after the '\\r'.
    """
    newline = (None, TT.Is, '\n', +1, +3)
    carriage_return = (None, TT.Is, '\r', TT.MatchFail, +1)
    # Both offsets are +1: the trailing '\n' is optional.
    optional_newline = (None, TT.Is, '\n', +1, +1)
    return [newline, carriage_return, optional_newline]
377
378
379
381 result, taglist, pos = TT.tag(text, tagtable, x, end)
382 if result:
383
384 return x
385
386 return x + 1
387
388
391 self.tag_words = tag_words
395
396
397
398
400 result, taglist, pos = TT.tag(text, tag_words, x, end)
401 if result:
402
403 return x+1
404
405 return x
406
407
410 self.tag_words = tag_words
414
415
416
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.Assert) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_assert(expression, genstate):
    """Build the tag table for an assertion.

    The inner expression is compiled and handed to CheckAssert, or to
    CheckAssertNot when expression.invert is set. Without LookAhead
    support an extra Skip of -1 entry follows the Call.
    """
    tagtable = _generate(expression.expression, genstate)

    checker_class = CheckAssert
    if expression.invert:
        checker_class = CheckAssertNot
    checker = checker_class(tuple(tagtable))

    if supports_lookahead:
        return [(None, TT.Call + TT.LookAhead, checker, TT.MatchFail)]

    return [
        (None, TT.Call, checker, TT.MatchFail),
        (None, TT.Skip, -1, TT.MatchFail),
    ]
434
435
436
437
446
447
448
450
451
# NOTE(review): the def line is missing from this listing; the name comes
# from generate_table (Expression.GroupRef) and the signature from the
# func(expression, genstate) call in _generate -- confirm.
def generate_groupref(expression, genstate):
    """Build the tag table for a reference to a named group.

    A CheckGroupRef callable for the group name does the test; the Skip
    of -1 follows the same Call/Skip pairing used by the other
    Call-based generators in this module.
    """
    checker = CheckGroupRef(expression.name)
    call_entry = (None, TT.Call, checker, TT.MatchFail)
    skip_entry = (None, TT.Skip, -1, TT.MatchFail)
    return [call_entry, skip_entry]
456
457
458
# NOTE(review): the def line is missing from this listing. The name is
# inferred: _generate falls back to generate_pass_through for
# Expression.PassThrough, the only remaining type that carries an
# ".expression" child -- confirm.
def generate_pass_through(expression, genstate):
    """Build the tag table for a wrapper by compiling the wrapped expression."""
    wrapped = expression.expression
    return _generate(wrapped, genstate)
461
462
# Dispatch table used by _generate: maps an Expression class to the
# function that builds its tag table. Subclasses of
# Expression.PassThrough are not listed; _generate handles them with an
# isinstance fallback.
generate_table = {
    Expression.Alt: generate_alt,
    Expression.Any: generate_any,
    Expression.AnyEol: generate_eol,
    Expression.Assert: generate_assert,
    Expression.AtBeginning: generate_at_beginning,
    Expression.AtEnd: generate_at_end,
    Expression.Debug: generate_debug,
    Expression.Dot: generate_dot,
    Expression.Group: generate_group,
    Expression.GroupRef: generate_groupref,
    Expression.Literal: generate_literal,
    Expression.MaxRepeat: generate_max_repeat,
    Expression.NullOp: generate_null_op,
    Expression.Seq: generate_seq,
    Expression.Str: generate_str,
}
480
# Farthest position recorded so far by track_position; -1 means none yet.
_position = -1

# NOTE(review): the def line is missing from this listing; the name comes
# from its use as a TT.Call target. The (text, x, end) signature is
# assumed from the other TT.Call callables in this file -- confirm.
def track_position(text, x, end):
    """Record the farthest position reached by any successful match.

    mxTextTools by default only reports the last region tagged at the
    top level; this keeps the farthest position tagged anywhere, which
    is more useful for error reporting.

    The value lives in the module-level _position variable, so this is
    SINGLE THREADED.
    """
    global _position
    if x > _position:
        _position = x
    return x
497
# NOTE(review): the class line and both def lines are missing from this
# listing. The class name and the __init__ argument come from the
# print_info(expression) call in _generate; the __call__ signature is
# assumed from the other TT.Call callables in this file -- confirm.
class print_info:
    """Print information after each expression match"""

    def __init__(self, expression):
        description = str(expression)
        if len(description) > 40:
            # Keep the output short: first and last 17 characters only.
            description = description[:17] + " ... " + description[-17:]
        self.msg = description

    def __call__(self, text, x, end):
        # Show up to 8 characters on either side of the current position.
        window = text[max(0, x - 8):x + 8]
        print("Match %s (x=%d): %s" % (repr(window), x, repr(self.msg)))
        return x
509
510
# NOTE(review): the def line is missing from this listing; name and
# signature come from the many _generate(expr, genstate) calls -- confirm.
def _generate(expression, genstate):
    """Build the tag table for one Expression node.

    The generator is chosen by exact class through generate_table, with
    an isinstance fallback for Expression.PassThrough. When debugging is
    enabled and the table is non-empty, tracking (level 1) or tracking
    plus printing (level 2) entries are appended to it.
    """
    func = generate_table.get(expression.__class__)
    if func is None:
        if not isinstance(expression, Expression.PassThrough):
            raise AssertionError(
                "Unknown Expression object: %s" % repr(expression))
        func = generate_pass_through

    table = func(expression, genstate)

    level = genstate.debug_level
    if level != 0 and table:
        if level == 1:
            table.append((None, TT.Call, track_position, +1, +1))
        elif level == 2:
            table.append((None, TT.Call, track_position, +1, +1))
            table.append((None, TT.Call, print_info(expression), +1, +1))
        else:
            raise AssertionError("Unknown debug level: %s" % level)

    return table
533
534
535
def generate(expression, debug_level = 0):
    """expression -> (tagtable, want_groupref_names, attribute lookup)

    Compiles the Expression tree and returns a 3-tuple:
      * the tag table as a tuple;
      * 1 if any group names are wanted by the tree, else 0;
      * the lookup dictionary collected in the GeneratorState.
    """
    groupref_names = _find_wanted_groupref_names(expression)
    genstate = GeneratorState(groupref_names, debug_level)
    tagtable = tuple(_generate(expression, genstate))

    # Flag is 1 when the mapping is non-empty, 0 otherwise.
    want_groupref_names = int(bool(groupref_names))
    return tagtable, want_groupref_names, genstate.lookup
546
547
548
550 tagtable, want_groupref_names, attrlookup = generate(expression)
551 return Parser.Parser(tagtable, (want_groupref_names, debug_level,
552 attrlookup))
553
554
555
556
557
558
559
560
561
562
563
564
565
# NOTE(review): the def line is missing from this listing; name and
# signature come from the call in generate() and the recursive calls in
# the body -- confirm.
def _find_wanted_groupref_names(expression):
    """expression -> dict of group names wanted by elements of the tree

    The dict is used during tagtable generation to specify which groups
    need to save their match text. There's match-time overhead for doing
    that, and the code isn't thread safe, so the intent is to save only
    those groups that are needed.

    The dict value is 1 if the group name is needed, else there is no
    entry in the dict.

    XXX need to make this a method!
    """
    multi_child = (Expression.Alt, Expression.Seq)
    single_child = (Expression.Group, Expression.Assert,
                    Expression.PassThrough)
    leaves = (Expression.Literal, Expression.Str, Expression.Any,
              Expression.AtBeginning, Expression.AtEnd, Expression.Dot,
              Expression.AnyEol, Expression.Debug, Expression.NullOp)

    want_names = {}

    if isinstance(expression, multi_child):
        for child in expression.expressions:
            want_names.update(_find_wanted_groupref_names(child))

    elif isinstance(expression, single_child):
        want_names.update(_find_wanted_groupref_names(expression.expression))

    elif isinstance(expression, Expression.MaxRepeat):
        # A string repeat count is the name of the group holding the count.
        for count in (expression.min_count, expression.max_count):
            if type(count) == type(""):
                want_names[count] = 1
        want_names.update(_find_wanted_groupref_names(expression.expression))

    elif isinstance(expression, Expression.GroupRef):
        want_names[expression.name] = 1

    elif isinstance(expression, leaves):
        # Leaf nodes never reference a group.
        pass

    else:
        raise NotImplementedError("What is a %s?" % repr(expression))

    return want_names
615