1
2
3
4
5
6
7
8
9
10
11 """Internal support module for sre"""
12
13
14
15 import sys
16
17 from sre_constants import *
18
20 s = {}
21 for elem in seq:
22 s[elem] = 1
23 return s
24
# Characters the item parser does not treat as plain literals, and the
# subset of them that starts a repeat operator.
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# Digit classes consulted when decoding numeric escapes and repeat counts.
OCTDIGITS = set("01234567")
DIGITS = set("0123456789")
HEXDIGITS = set("0123456789abcdefABCDEF")

# Characters the parser skips when the verbose flag is set.
WHITESPACE = set(" \t\n\r\v\f")
34
# Backslash escapes that denote one literal character; each value is a
# (LITERAL, character code) pair.
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\")),
}

# Backslash escapes that map to a position test (AT) or to a
# one-category character set (IN).
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING),
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING),
}

# Flag letters accepted inside a "(?...)" group, mapped to their flag bits.
FLAGS = {
    # first group of flag letters
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # second group of flag letters
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
70
72
74 self.flags = 0
75 self.open = []
76 self.groups = 1
77 self.groupdict = {}
79 gid = self.groups
80 self.groups = gid + 1
81 if name is not None:
82 ogid = self.groupdict.get(name, None)
83 if ogid is not None:
84 raise error, ("redefinition of group name %s as group %d; "
85 "was group %d" % (repr(name), gid, ogid))
86 self.groupdict[name] = gid
87 self.open.append(gid)
88 return gid
92 return gid < self.groups and gid not in self.open
93
95
102 - def dump(self, level=0):
103 nl = 1
104 seqtypes = type(()), type([])
105 for op, av in self.data:
106 print level*" " + op,; nl = 0
107 if op == "in":
108
109 print; nl = 1
110 for op, a in av:
111 print (level+1)*" " + op, a
112 elif op == "branch":
113 print; nl = 1
114 i = 0
115 for a in av[1]:
116 if i > 0:
117 print level*" " + "or"
118 a.dump(level+1); nl = 1
119 i = i + 1
120 elif type(av) in seqtypes:
121 for a in av:
122 if isinstance(a, SubPattern):
123 if not nl: print
124 a.dump(level+1); nl = 1
125 else:
126 print a, ; nl = 0
127 else:
128 print av, ; nl = 0
129 if not nl: print
133 return len(self.data)
142 - def insert(self, index, code):
147
148 if self.width:
149 return self.width
150 lo = hi = 0L
151 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
152 REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
153 for op, av in self.data:
154 if op is BRANCH:
155 i = sys.maxint
156 j = 0
157 for av in av[1]:
158 l, h = av.getwidth()
159 i = min(i, l)
160 j = max(j, h)
161 lo = lo + i
162 hi = hi + j
163 elif op is CALL:
164 i, j = av.getwidth()
165 lo = lo + i
166 hi = hi + j
167 elif op is SUBPATTERN:
168 i, j = av[1].getwidth()
169 lo = lo + i
170 hi = hi + j
171 elif op in REPEATCODES:
172 i, j = av[2].getwidth()
173 lo = lo + long(i) * av[0]
174 hi = hi + long(j) * av[1]
175 elif op in UNITCODES:
176 lo = lo + 1
177 hi = hi + 1
178 elif op == SUCCESS:
179 break
180 self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
181 return self.width
182
189 if self.index >= len(self.string):
190 self.next = None
191 return
192 char = self.string[self.index]
193 if char[0] == "\\":
194 try:
195 c = self.string[self.index + 1]
196 except IndexError:
197 raise error, "bogus escape (end of line)"
198 char = char + c
199 self.index = self.index + len(char)
200 self.next = char
def match(self, char, skip=1):
    """Test whether the pending token (self.next) equals char.

    Returns 1 on a match and 0 otherwise.  On a match, and only when
    skip is true, the tokenizer is advanced past the token.
    """
    if char != self.next:
        return 0
    if skip:
        self.__next()
    return 1
208 this = self.next
209 self.__next()
210 return this
213 - def seek(self, index):
215
217 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
218
220 return "0" <= char <= "9"
221
223
224 if not isident(name[0]):
225 return False
226 for char in name[1:]:
227 if not isident(char) and not isdigit(char):
228 return False
229 return True
230
232
233 code = ESCAPES.get(escape)
234 if code:
235 return code
236 code = CATEGORIES.get(escape)
237 if code:
238 return code
239 try:
240 c = escape[1:2]
241 if c == "x":
242
243 while source.next in HEXDIGITS and len(escape) < 4:
244 escape = escape + source.get()
245 escape = escape[2:]
246 if len(escape) != 2:
247 raise error, "bogus escape: %s" % repr("\\" + escape)
248 return LITERAL, int(escape, 16) & 0xff
249 elif c in OCTDIGITS:
250
251 while source.next in OCTDIGITS and len(escape) < 4:
252 escape = escape + source.get()
253 escape = escape[1:]
254 return LITERAL, int(escape, 8) & 0xff
255 elif c in DIGITS:
256 raise error, "bogus escape: %s" % repr(escape)
257 if len(escape) == 2:
258 return LITERAL, ord(escape[1])
259 except ValueError:
260 pass
261 raise error, "bogus escape: %s" % repr(escape)
262
def _escape(source, escape, state):
    """Translate an escape sequence found in an expression.

    source -- tokenizer; its .next attribute is peeked at and .get() is
              called to consume further digits of the escape
    escape -- the escape token read so far, starting with a backslash
    state  -- parser state; .groups and .checkgroup() are used to
              validate numeric group references

    Returns a (opcode, argument) pair: an entry from CATEGORIES or
    ESCAPES, (LITERAL, code) for hexadecimal, octal and other two
    character escapes, or (GROUPREF, group) for a group reference.
    Raises error for a reference to a still-open group and for any
    escape that cannot be decoded.
    """
    # category escapes are looked up before plain character escapes
    for table in (CATEGORIES, ESCAPES):
        code = table.get(escape)
        if code:
            return code
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape: exactly two hex digits must follow
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif kind == "0":
            # octal escape with a leading zero
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif kind in DIGITS:
            # either a three digit octal escape or a decimal group reference
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS
                        and source.next in OCTDIGITS):
                    # a third octal digit follows: octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # otherwise the digits name a group
            group = int(escape[1:])
            if group >= state.groups:
                # no such group; reported as a bogus escape below
                raise ValueError
            if not state.checkgroup(group):
                raise error("cannot refer to open group")
            return GROUPREF, group
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
306
308
309
310 items = []
311 itemsappend = items.append
312 sourcematch = source.match
313 while 1:
314 itemsappend(_parse(source, state))
315 if sourcematch("|"):
316 continue
317 if not nested:
318 break
319 if not source.next or sourcematch(")", 0):
320 break
321 else:
322 raise error, "pattern not properly closed"
323
324 if len(items) == 1:
325 return items[0]
326
327 subpattern = SubPattern(state)
328 subpatternappend = subpattern.append
329
330
331 while 1:
332 prefix = None
333 for item in items:
334 if not item:
335 break
336 if prefix is None:
337 prefix = item[0]
338 elif item[0] != prefix:
339 break
340 else:
341
342
343 for item in items:
344 del item[0]
345 subpatternappend(prefix)
346 continue
347 break
348
349
350 for item in items:
351 if len(item) != 1 or item[0][0] != LITERAL:
352 break
353 else:
354
355
356 set = []
357 setappend = set.append
358 for item in items:
359 setappend(item[0])
360 subpatternappend((IN, set))
361 return subpattern
362
363 subpattern.append((BRANCH, (None, items)))
364 return subpattern
365
367 item_yes = _parse(source, state)
368 if source.match("|"):
369 item_no = _parse(source, state)
370 if source.match("|"):
371 raise error, "conditional backref with more than two branches"
372 else:
373 item_no = None
374 if source.next and not source.match(")", 0):
375 raise error, "pattern not properly closed"
376 subpattern = SubPattern(state)
377 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
378 return subpattern
379
# Lookup sets for the item parser.

# tokens on which the item loop stops
_PATTERNENDERS = set("|)")

# characters after "(?" that select the assertion branch
_ASSERTCHARS = set("=!<")

# characters that may follow "(?<"
_LOOKBEHINDASSERTCHARS = set("=!")

# opcodes that may not be repeated again ("multiple repeat")
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
384
386
387 subpattern = SubPattern(state)
388
389
390 subpatternappend = subpattern.append
391 sourceget = source.get
392 sourcematch = source.match
393 _len = len
394 PATTERNENDERS = _PATTERNENDERS
395 ASSERTCHARS = _ASSERTCHARS
396 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
397 REPEATCODES = _REPEATCODES
398
399 while 1:
400
401 if source.next in PATTERNENDERS:
402 break
403 this = sourceget()
404 if this is None:
405 break
406
407 if state.flags & SRE_FLAG_VERBOSE:
408
409 if this in WHITESPACE:
410 continue
411 if this == "#":
412 while 1:
413 this = sourceget()
414 if this in (None, "\n"):
415 break
416 continue
417
418 if this and this[0] not in SPECIAL_CHARS:
419 subpatternappend((LITERAL, ord(this)))
420
421 elif this == "[":
422
423 set = []
424 setappend = set.append
425
426
427 if sourcematch("^"):
428 setappend((NEGATE, None))
429
430 start = set[:]
431 while 1:
432 this = sourceget()
433 if this == "]" and set != start:
434 break
435 elif this and this[0] == "\\":
436 code1 = _class_escape(source, this)
437 elif this:
438 code1 = LITERAL, ord(this)
439 else:
440 raise error, "unexpected end of regular expression"
441 if sourcematch("-"):
442
443 this = sourceget()
444 if this == "]":
445 if code1[0] is IN:
446 code1 = code1[1][0]
447 setappend(code1)
448 setappend((LITERAL, ord("-")))
449 break
450 elif this:
451 if this[0] == "\\":
452 code2 = _class_escape(source, this)
453 else:
454 code2 = LITERAL, ord(this)
455 if code1[0] != LITERAL or code2[0] != LITERAL:
456 raise error, "bad character range"
457 lo = code1[1]
458 hi = code2[1]
459 if hi < lo:
460 raise error, "bad character range"
461 setappend((RANGE, (lo, hi)))
462 else:
463 raise error, "unexpected end of regular expression"
464 else:
465 if code1[0] is IN:
466 code1 = code1[1][0]
467 setappend(code1)
468
469
470 if _len(set)==1 and set[0][0] is LITERAL:
471 subpatternappend(set[0])
472 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
473 subpatternappend((NOT_LITERAL, set[1][1]))
474 else:
475
476 subpatternappend((IN, set))
477
478 elif this and this[0] in REPEAT_CHARS:
479
480 if this == "?":
481 min, max = 0, 1
482 elif this == "*":
483 min, max = 0, MAXREPEAT
484
485 elif this == "+":
486 min, max = 1, MAXREPEAT
487 elif this == "{":
488 if source.next == "}":
489 subpatternappend((LITERAL, ord(this)))
490 continue
491 here = source.tell()
492 min, max = 0, MAXREPEAT
493 lo = hi = ""
494 while source.next in DIGITS:
495 lo = lo + source.get()
496 if sourcematch(","):
497 while source.next in DIGITS:
498 hi = hi + sourceget()
499 else:
500 hi = lo
501 if not sourcematch("}"):
502 subpatternappend((LITERAL, ord(this)))
503 source.seek(here)
504 continue
505 if lo:
506 min = int(lo)
507 if hi:
508 max = int(hi)
509 if max < min:
510 raise error, "bad repeat interval"
511 else:
512 raise error, "not supported"
513
514 if subpattern:
515 item = subpattern[-1:]
516 else:
517 item = None
518 if not item or (_len(item) == 1 and item[0][0] == AT):
519 raise error, "nothing to repeat"
520 if item[0][0] in REPEATCODES:
521 raise error, "multiple repeat"
522 if sourcematch("?"):
523 subpattern[-1] = (MIN_REPEAT, (min, max, item))
524 else:
525 subpattern[-1] = (MAX_REPEAT, (min, max, item))
526
527 elif this == ".":
528 subpatternappend((ANY, None))
529
530 elif this == "(":
531 group = 1
532 name = None
533 condgroup = None
534 if sourcematch("?"):
535 group = 0
536
537 if sourcematch("P"):
538
539 if sourcematch("<"):
540
541 name = ""
542 while 1:
543 char = sourceget()
544 if char is None:
545 raise error, "unterminated name"
546 if char == ">":
547 break
548 name = name + char
549 group = 1
550 if not isname(name):
551 raise error, "bad character in group name"
552 elif sourcematch("="):
553
554 name = ""
555 while 1:
556 char = sourceget()
557 if char is None:
558 raise error, "unterminated name"
559 if char == ")":
560 break
561 name = name + char
562 if not isname(name):
563 raise error, "bad character in group name"
564 gid = state.groupdict.get(name)
565 if gid is None:
566 raise error, "unknown group name"
567 subpatternappend((GROUPREF, gid))
568 continue
569 else:
570 char = sourceget()
571 if char is None:
572 raise error, "unexpected end of pattern"
573 raise error, "unknown specifier: ?P%s" % char
574 elif sourcematch(":"):
575
576 group = 2
577 elif sourcematch("#"):
578
579 while 1:
580 if source.next is None or source.next == ")":
581 break
582 sourceget()
583 if not sourcematch(")"):
584 raise error, "unbalanced parenthesis"
585 continue
586 elif source.next in ASSERTCHARS:
587
588 char = sourceget()
589 dir = 1
590 if char == "<":
591 if source.next not in LOOKBEHINDASSERTCHARS:
592 raise error, "syntax error"
593 dir = -1
594 char = sourceget()
595 p = _parse_sub(source, state)
596 if not sourcematch(")"):
597 raise error, "unbalanced parenthesis"
598 if char == "=":
599 subpatternappend((ASSERT, (dir, p)))
600 else:
601 subpatternappend((ASSERT_NOT, (dir, p)))
602 continue
603 elif sourcematch("("):
604
605 condname = ""
606 while 1:
607 char = sourceget()
608 if char is None:
609 raise error, "unterminated name"
610 if char == ")":
611 break
612 condname = condname + char
613 group = 2
614 if isname(condname):
615 condgroup = state.groupdict.get(condname)
616 if condgroup is None:
617 raise error, "unknown group name"
618 else:
619 try:
620 condgroup = int(condname)
621 except ValueError:
622 raise error, "bad character in group name"
623 else:
624
625 if not source.next in FLAGS:
626 raise error, "unexpected end of pattern"
627 while source.next in FLAGS:
628 state.flags = state.flags | FLAGS[sourceget()]
629 if group:
630
631 if group == 2:
632
633 group = None
634 else:
635 group = state.opengroup(name)
636 if condgroup:
637 p = _parse_sub_cond(source, state, condgroup)
638 else:
639 p = _parse_sub(source, state)
640 if not sourcematch(")"):
641 raise error, "unbalanced parenthesis"
642 if group is not None:
643 state.closegroup(group)
644 subpatternappend((SUBPATTERN, (group, p)))
645 else:
646 while 1:
647 char = sourceget()
648 if char is None:
649 raise error, "unexpected end of pattern"
650 if char == ")":
651 break
652 raise error, "unknown extension"
653
654 elif this == "^":
655 subpatternappend((AT, AT_BEGINNING))
656
657 elif this == "$":
658 subpattern.append((AT, AT_END))
659
660 elif this and this[0] == "\\":
661 code = _escape(source, this, state)
662 subpatternappend(code)
663
664 else:
665 raise error, "parser error"
666
667 return subpattern
668
669 -def parse(str, flags=0, pattern=None):
696
698
699
700 s = Tokenizer(source)
701 sget = s.get
702 p = []
703 a = p.append
704 def literal(literal, p=p, pappend=a):
705 if p and p[-1][0] is LITERAL:
706 p[-1] = LITERAL, p[-1][1] + literal
707 else:
708 pappend((LITERAL, literal))
709 sep = source[:0]
710 if type(sep) is type(""):
711 makechar = chr
712 else:
713 makechar = unichr
714 while 1:
715 this = sget()
716 if this is None:
717 break
718 if this and this[0] == "\\":
719
720 c = this[1:2]
721 if c == "g":
722 name = ""
723 if s.match("<"):
724 while 1:
725 char = sget()
726 if char is None:
727 raise error, "unterminated group name"
728 if char == ">":
729 break
730 name = name + char
731 if not name:
732 raise error, "bad group name"
733 try:
734 index = int(name)
735 if index < 0:
736 raise error, "negative group number"
737 except ValueError:
738 if not isname(name):
739 raise error, "bad character in group name"
740 try:
741 index = pattern.groupindex[name]
742 except KeyError:
743 raise IndexError, "unknown group name"
744 a((MARK, index))
745 elif c == "0":
746 if s.next in OCTDIGITS:
747 this = this + sget()
748 if s.next in OCTDIGITS:
749 this = this + sget()
750 literal(makechar(int(this[1:], 8) & 0xff))
751 elif c in DIGITS:
752 isoctal = False
753 if s.next in DIGITS:
754 this = this + sget()
755 if (c in OCTDIGITS and this[2] in OCTDIGITS and
756 s.next in OCTDIGITS):
757 this = this + sget()
758 isoctal = True
759 literal(makechar(int(this[1:], 8) & 0xff))
760 if not isoctal:
761 a((MARK, int(this[1:])))
762 else:
763 try:
764 this = makechar(ESCAPES[this][1])
765 except KeyError:
766 pass
767 literal(this)
768 else:
769 literal(this)
770
771 i = 0
772 groups = []
773 groupsappend = groups.append
774 literals = [None] * len(p)
775 for c, s in p:
776 if c is MARK:
777 groupsappend((i, s))
778
779 else:
780 literals[i] = s
781 i = i + 1
782 return groups, literals
783
785 g = match.group
786 sep = match.string[:0]
787 groups, literals = template
788 literals = literals[:]
789 try:
790 for index, group in groups:
791 literals[index] = s = g(group)
792 if s is None:
793 raise error, "unmatched group"
794 except IndexError:
795 raise error, "invalid group reference"
796 return sep.join(literals)
797