]>
git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.10/Lib/sre_parse.py
#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
17 from sre_constants
import *
# Characters with a special meaning in a pattern.  The parser emits a plain
# LITERAL for any character whose first char is not in this string.
SPECIAL_CHARS = ".\\[{()*+?^$|"

# Decimal digits: consumed when reading repeat counts and group references.
DIGITS = set("0123456789")

# Digit sets consumed when reading octal and hexadecimal escapes.
OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")

# Characters tested as whitespace when SRE_FLAG_VERBOSE is set.
WHITESPACE = set(" \t\n\r\v\f")
30 r
"\a": (LITERAL
, ord("\a")),
31 r
"\b": (LITERAL
, ord("\b")),
32 r
"\f": (LITERAL
, ord("\f")),
33 r
"\n": (LITERAL
, ord("\n")),
34 r
"\r": (LITERAL
, ord("\r")),
35 r
"\t": (LITERAL
, ord("\t")),
36 r
"\v": (LITERAL
, ord("\v")),
37 r
"\\": (LITERAL
, ord("\\"))
41 r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
42 r
"\b": (AT
, AT_BOUNDARY
),
43 r
"\B": (AT
, AT_NON_BOUNDARY
),
44 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
45 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
46 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
47 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
48 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
49 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
50 r
"\Z": (AT
, AT_END_STRING
), # end of string
55 "i": SRE_FLAG_IGNORECASE
,
57 "m": SRE_FLAG_MULTILINE
,
59 "x": SRE_FLAG_VERBOSE
,
61 "t": SRE_FLAG_TEMPLATE
,
62 "u": SRE_FLAG_UNICODE
,
66 # master pattern object. keeps track of global attributes
74 def opengroup(self
, name
=None):
78 ogid
= self
.groupdict
.get(name
, None)
80 raise error
, ("redefinition of group name %s as group %d; "
81 "was group %d" % (repr(name
), gid
, ogid
))
82 self
.groupdict
[name
] = gid
85 def closegroup(self
, gid
):
def checkgroup(self, gid):
    """Return True if group id *gid* may be referenced.

    The result is true only when *gid* is smaller than ``self.groups``
    and *gid* is not currently listed in ``self.open``.
    """
    return gid < self.groups and gid not in self.open
91 # a subpattern, in intermediate form
92 def __init__(self
, pattern
, data
=None):
93 self
.pattern
= pattern
98 def dump(self
, level
=0):
99 seqtypes
= (tuple, list)
100 for op
, av
in self
.data
:
101 print level
*" " + op
,
106 print (level
+1)*" " + op
, a
109 for i
, a
in enumerate(av
[1]):
111 print level
*" " + "or"
113 elif op
== GROUPREF_EXISTS
:
114 condgroup
, item_yes
, item_no
= av
116 item_yes
.dump(level
+1)
118 print level
*" " + "else"
119 item_no
.dump(level
+1)
120 elif isinstance(av
, seqtypes
):
123 if isinstance(a
, SubPattern
):
136 return repr(self
.data
)
138 return len(self
.data
)
139 def __delitem__(self
, index
):
def __getitem__(self, index):
    """Return the item(s) of ``self.data`` selected by *index*.

    A slice yields a new SubPattern that shares ``self.pattern`` and wraps
    the sliced data; any other index returns the stored element itself.
    """
    if isinstance(index, slice):
        return SubPattern(self.pattern, self.data[index])
    return self.data[index]
def __setitem__(self, index, code):
    """Store *code* at position *index* of ``self.data``."""
    self.data[index] = code
def insert(self, index, code):
    """Insert *code* into ``self.data`` before position *index*."""
    self.data.insert(index, code)
def append(self, code):
    """Append *code* to the end of ``self.data``."""
    self.data.append(code)
152 # determine the width (min, max) for this subpattern
156 UNITCODES
= (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
)
157 REPEATCODES
= (MIN_REPEAT
, MAX_REPEAT
)
158 for op
, av
in self
.data
:
172 elif op
is SUBPATTERN
:
173 i
, j
= av
[1].getwidth()
176 elif op
in REPEATCODES
:
177 i
, j
= av
[2].getwidth()
180 elif op
in UNITCODES
:
185 self
.width
= min(lo
, MAXREPEAT
- 1), min(hi
, MAXREPEAT
)
189 def __init__(self
, string
):
194 if self
.index
>= len(self
.string
):
197 char
= self
.string
[self
.index
]
200 c
= self
.string
[self
.index
+ 1]
202 raise error
, "bogus escape (end of line)"
204 self
.index
= self
.index
+ len(char
)
206 def match(self
, char
, skip
=1):
207 if char
== self
.next
:
217 return self
.index
, self
.next
def seek(self, index):
    """Restore a saved position.

    *index* must be a two-item sequence; it is unpacked into
    ``self.index`` and ``self.next`` in that order.
    """
    self.index, self.next = index
222 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
225 return "0" <= char
<= "9"
228 # check that group name is a valid string
229 if not isident(name
[0]):
231 for char
in name
[1:]:
232 if not isident(char
) and not isdigit(char
):
236 def _class_escape(source
, escape
):
237 # handle escape code inside character class
238 code
= ESCAPES
.get(escape
)
241 code
= CATEGORIES
.get(escape
)
242 if code
and code
[0] == IN
:
247 # hexadecimal escape (exactly two digits)
248 while source
.next
in HEXDIGITS
and len(escape
) < 4:
249 escape
= escape
+ source
.get()
252 raise error
, "bogus escape: %s" % repr("\\" + escape
)
253 return LITERAL
, int(escape
, 16) & 0xff
255 # octal escape (up to three digits)
256 while source
.next
in OCTDIGITS
and len(escape
) < 4:
257 escape
= escape
+ source
.get()
259 return LITERAL
, int(escape
, 8) & 0xff
261 raise error
, "bogus escape: %s" % repr(escape
)
263 return LITERAL
, ord(escape
[1])
266 raise error
, "bogus escape: %s" % repr(escape
)
268 def _escape(source
, escape
, state
):
269 # handle escape code in expression
270 code
= CATEGORIES
.get(escape
)
273 code
= ESCAPES
.get(escape
)
280 while source
.next
in HEXDIGITS
and len(escape
) < 4:
281 escape
= escape
+ source
.get()
284 return LITERAL
, int(escape
[2:], 16) & 0xff
287 while source
.next
in OCTDIGITS
and len(escape
) < 4:
288 escape
= escape
+ source
.get()
289 return LITERAL
, int(escape
[1:], 8) & 0xff
291 # octal escape *or* decimal group reference (sigh)
292 if source
.next
in DIGITS
:
293 escape
= escape
+ source
.get()
294 if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
295 source
.next
in OCTDIGITS
):
296 # got three octal digits; this is an octal escape
297 escape
= escape
+ source
.get()
298 return LITERAL
, int(escape
[1:], 8) & 0xff
299 # not an octal escape, so this is a group reference
300 group
= int(escape
[1:])
301 if group
< state
.groups
:
302 if not state
.checkgroup(group
):
303 raise error
, "cannot refer to open group"
306 warnings
.warn('group references in lookbehind '
307 'assertions are not supported',
309 return GROUPREF
, group
312 return LITERAL
, ord(escape
[1])
315 raise error
, "bogus escape: %s" % repr(escape
)
317 def _parse_sub(source
, state
, nested
=1):
318 # parse an alternation: a|b|c
321 itemsappend
= items
.append
322 sourcematch
= source
.match
324 itemsappend(_parse(source
, state
))
329 if not source
.next
or sourcematch(")", 0):
332 raise error
, "pattern not properly closed"
337 subpattern
= SubPattern(state
)
338 subpatternappend
= subpattern
.append
340 # check if all items share a common prefix
348 elif item
[0] != prefix
:
351 # all subitems start with a common "prefix".
352 # move it out of the branch
355 subpatternappend(prefix
)
356 continue # check next one
359 # check if the branch can be replaced by a character set
361 if len(item
) != 1 or item
[0][0] != LITERAL
:
364 # we can store this as a character set instead of a
365 # branch (the compiler may optimize this even more)
367 setappend
= set.append
370 subpatternappend((IN
, set))
373 subpattern
.append((BRANCH
, (None, items
)))
376 def _parse_sub_cond(source
, state
, condgroup
):
377 item_yes
= _parse(source
, state
)
378 if source
.match("|"):
379 item_no
= _parse(source
, state
)
380 if source
.match("|"):
381 raise error
, "conditional backref with more than two branches"
384 if source
.next
and not source
.match(")", 0):
385 raise error
, "pattern not properly closed"
386 subpattern
= SubPattern(state
)
387 subpattern
.append((GROUPREF_EXISTS
, (condgroup
, item_yes
, item_no
)))
# Lookup sets consulted by _parse(); it binds each to a local name.
# Characters that end the current subpattern.
_PATTERNENDERS = set("|)")
# Characters that introduce an assertion group.
_ASSERTCHARS = set("=!<")
# Characters that _parse() accepts for a lookbehind assertion.
_LOOKBEHINDASSERTCHARS = set("=!")
# Repeat opcodes; an item already using one cannot be repeated again.
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
395 def _parse(source
, state
):
396 # parse a simple pattern
397 subpattern
= SubPattern(state
)
399 # precompute constants into local variables
400 subpatternappend
= subpattern
.append
401 sourceget
= source
.get
402 sourcematch
= source
.match
404 PATTERNENDERS
= _PATTERNENDERS
405 ASSERTCHARS
= _ASSERTCHARS
406 LOOKBEHINDASSERTCHARS
= _LOOKBEHINDASSERTCHARS
407 REPEATCODES
= _REPEATCODES
411 if source
.next
in PATTERNENDERS
:
412 break # end of subpattern
415 break # end of pattern
417 if state
.flags
& SRE_FLAG_VERBOSE
:
418 # skip whitespace and comments
419 if this
in WHITESPACE
:
424 if this
in (None, "\n"):
428 if this
and this
[0] not in SPECIAL_CHARS
:
429 subpatternappend((LITERAL
, ord(this
)))
434 setappend
= set.append
435 ## if sourcematch(":"):
436 ## pass # handle character classes
438 setappend((NEGATE
, None))
439 # check remaining characters
443 if this
== "]" and set != start
:
445 elif this
and this
[0] == "\\":
446 code1
= _class_escape(source
, this
)
448 code1
= LITERAL
, ord(this
)
450 raise error
, "unexpected end of regular expression"
458 setappend((LITERAL
, ord("-")))
462 code2
= _class_escape(source
, this
)
464 code2
= LITERAL
, ord(this
)
465 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
466 raise error
, "bad character range"
470 raise error
, "bad character range"
471 setappend((RANGE
, (lo
, hi
)))
473 raise error
, "unexpected end of regular expression"
479 # XXX: <fl> should move set optimization to compiler!
480 if _len(set)==1 and set[0][0] is LITERAL
:
481 subpatternappend(set[0]) # optimization
482 elif _len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
483 subpatternappend((NOT_LITERAL
, set[1][1])) # optimization
485 # XXX: <fl> should add charmap optimization here
486 subpatternappend((IN
, set))
488 elif this
and this
[0] in REPEAT_CHARS
:
489 # repeat previous item
493 min, max = 0, MAXREPEAT
496 min, max = 1, MAXREPEAT
498 if source
.next
== "}":
499 subpatternappend((LITERAL
, ord(this
)))
502 min, max = 0, MAXREPEAT
504 while source
.next
in DIGITS
:
505 lo
= lo
+ source
.get()
507 while source
.next
in DIGITS
:
508 hi
= hi
+ sourceget()
511 if not sourcematch("}"):
512 subpatternappend((LITERAL
, ord(this
)))
518 raise OverflowError("the repetition number is too large")
522 raise OverflowError("the repetition number is too large")
524 raise error("bad repeat interval")
526 raise error
, "not supported"
527 # figure out which item to repeat
529 item
= subpattern
[-1:]
532 if not item
or (_len(item
) == 1 and item
[0][0] == AT
):
533 raise error
, "nothing to repeat"
534 if item
[0][0] in REPEATCODES
:
535 raise error
, "multiple repeat"
537 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
539 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
542 subpatternappend((ANY
, None))
554 # named group: skip forward to end of name
559 raise error
, "unterminated name"
565 raise error("missing group name")
567 raise error("bad character in group name %r" %
569 elif sourcematch("="):
570 # named backreference
575 raise error
, "unterminated name"
580 raise error("missing group name")
582 raise error("bad character in backref group name "
584 gid
= state
.groupdict
.get(name
)
586 msg
= "unknown group name: {0!r}".format(name
)
590 warnings
.warn('group references in lookbehind '
591 'assertions are not supported',
593 subpatternappend((GROUPREF
, gid
))
598 raise error
, "unexpected end of pattern"
599 raise error
, "unknown specifier: ?P%s" % char
600 elif sourcematch(":"):
601 # non-capturing group
603 elif sourcematch("#"):
606 if source
.next
is None or source
.next
== ")":
609 if not sourcematch(")"):
610 raise error
, "unbalanced parenthesis"
612 elif source
.next
in ASSERTCHARS
:
613 # lookahead assertions
617 if source
.next
not in LOOKBEHINDASSERTCHARS
:
618 raise error
, "syntax error"
619 dir = -1 # lookbehind
621 state
.lookbehind
+= 1
622 p
= _parse_sub(source
, state
)
624 state
.lookbehind
-= 1
625 if not sourcematch(")"):
626 raise error
, "unbalanced parenthesis"
628 subpatternappend((ASSERT
, (dir, p
)))
630 subpatternappend((ASSERT_NOT
, (dir, p
)))
632 elif sourcematch("("):
633 # conditional backreference group
638 raise error
, "unterminated name"
641 condname
= condname
+ char
644 raise error("missing group name")
646 condgroup
= state
.groupdict
.get(condname
)
647 if condgroup
is None:
648 msg
= "unknown group name: {0!r}".format(condname
)
652 condgroup
= int(condname
)
654 raise error
, "bad character in group name"
657 warnings
.warn('group references in lookbehind '
658 'assertions are not supported',
662 if not source
.next
in FLAGS
:
663 raise error
, "unexpected end of pattern"
664 while source
.next
in FLAGS
:
665 state
.flags
= state
.flags | FLAGS
[sourceget()]
667 # parse group contents
672 group
= state
.opengroup(name
)
674 p
= _parse_sub_cond(source
, state
, condgroup
)
676 p
= _parse_sub(source
, state
)
677 if not sourcematch(")"):
678 raise error
, "unbalanced parenthesis"
679 if group
is not None:
680 state
.closegroup(group
)
681 subpatternappend((SUBPATTERN
, (group
, p
)))
686 raise error
, "unexpected end of pattern"
689 raise error
, "unknown extension"
692 subpatternappend((AT
, AT_BEGINNING
))
695 subpattern
.append((AT
, AT_END
))
697 elif this
and this
[0] == "\\":
698 code
= _escape(source
, this
, state
)
699 subpatternappend(code
)
702 raise error
, "parser error"
706 def parse(str, flags
=0, pattern
=None):
707 # parse 're' pattern into list of (opcode, argument) tuples
709 source
= Tokenizer(str)
713 pattern
.flags
= flags
716 p
= _parse_sub(source
, pattern
, 0)
720 raise error
, "unbalanced parenthesis"
722 raise error
, "bogus characters at end of regular expression"
724 if flags
& SRE_FLAG_DEBUG
:
727 if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
728 # the VERBOSE flag was switched on inside the pattern. to be
729 # on the safe side, we'll parse the whole thing again...
730 return parse(str, p
.pattern
.flags
)
734 def parse_template(source
, pattern
):
735 # parse 're' replacement string into list of literals and
737 s
= Tokenizer(source
)
741 def literal(literal
, p
=p
, pappend
=a
):
742 if p
and p
[-1][0] is LITERAL
:
743 p
[-1] = LITERAL
, p
[-1][1] + literal
745 pappend((LITERAL
, literal
))
747 if type(sep
) is type(""):
754 break # end of replacement string
755 if this
and this
[0] == "\\":
764 raise error
, "unterminated group name"
769 raise error
, "missing group name"
773 raise error
, "negative group number"
776 raise error
, "bad character in group name"
778 index
= pattern
.groupindex
[name
]
780 msg
= "unknown group name: {0!r}".format(name
)
781 raise IndexError(msg
)
784 if s
.next
in OCTDIGITS
:
786 if s
.next
in OCTDIGITS
:
788 literal(makechar(int(this
[1:], 8) & 0xff))
793 if (c
in OCTDIGITS
and this
[2] in OCTDIGITS
and
794 s
.next
in OCTDIGITS
):
797 literal(makechar(int(this
[1:], 8) & 0xff))
799 a((MARK
, int(this
[1:])))
802 this
= makechar(ESCAPES
[this
][1])
808 # convert template to groups and literals lists
811 groupsappend
= groups
.append
812 literals
= [None] * len(p
)
816 # literal[i] is already None
820 return groups
, literals
822 def expand_template(template
, match
):
824 sep
= match
.string
[:0]
825 groups
, literals
= template
826 literals
= literals
[:]
828 for index
, group
in groups
:
829 literals
[index
] = s
= g(group
)
831 raise error
, "unmatched group"
833 raise error
, "invalid group reference"
834 return sep
.join(literals
)