]>
git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.10/Lib/sre_compile.py
1 # -*- coding: utf-8 -*-
3 # Secret Labs' Regular Expression Engine
5 # convert template to internal format
7 # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
9 # See the sre.py file for information on usage and redistribution.
12 """Internal support module for sre"""
16 from sre_constants
import *
18 assert _sre
.MAGIC
== MAGIC
, "SRE module mismatch"
20 if _sre
.CODESIZE
== 2:
25 _LITERAL_CODES
= set([LITERAL
, NOT_LITERAL
])
26 _REPEATING_CODES
= set([REPEAT
, MIN_REPEAT
, MAX_REPEAT
])
27 _SUCCESS_CODES
= set([SUCCESS
, FAILURE
])
28 _ASSERT_CODES
= set([ASSERT
, ASSERT_NOT
])
30 # Sets of lowercase characters which have the same uppercase.
32 # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
34 # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
36 # MICRO SIGN, GREEK SMALL LETTER MU
38 # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
39 (0x345, 0x3b9, 0x1fbe), # \u0345ιι
40 # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
42 # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
44 # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
46 # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
48 # GREEK SMALL LETTER PI, GREEK PI SYMBOL
50 # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
52 # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
54 # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
56 # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
57 (0x1e61, 0x1e9b), # ṡẛ
60 # Maps the lowercase code to lowercase codes which have the same uppercase.
61 _ignorecase_fixes
= {i
: tuple(j
for j
in t
if i
!= j
)
62 for t
in _equivalences
for i
in t
}
64 def _compile(code
, pattern
, flags
):
65 # internal: compile a (sub)pattern
68 LITERAL_CODES
= _LITERAL_CODES
69 REPEATING_CODES
= _REPEATING_CODES
70 SUCCESS_CODES
= _SUCCESS_CODES
71 ASSERT_CODES
= _ASSERT_CODES
72 if (flags
& SRE_FLAG_IGNORECASE
and
73 not (flags
& SRE_FLAG_LOCALE
) and
74 flags
& SRE_FLAG_UNICODE
):
75 fixes
= _ignorecase_fixes
78 for op
, av
in pattern
:
79 if op
in LITERAL_CODES
:
80 if flags
& SRE_FLAG_IGNORECASE
:
81 lo
= _sre
.getlower(av
, flags
)
82 if fixes
and lo
in fixes
:
83 emit(OPCODES
[IN_IGNORE
])
84 skip
= _len(code
); emit(0)
87 for k
in (lo
,) + fixes
[lo
]:
88 emit(OPCODES
[LITERAL
])
90 emit(OPCODES
[FAILURE
])
91 code
[skip
] = _len(code
) - skip
93 emit(OPCODES
[OP_IGNORE
[op
]])
99 if flags
& SRE_FLAG_IGNORECASE
:
100 emit(OPCODES
[OP_IGNORE
[op
]])
101 def fixup(literal
, flags
=flags
):
102 return _sre
.getlower(literal
, flags
)
106 skip
= _len(code
); emit(0)
107 _compile_charset(av
, flags
, code
, fixup
, fixes
)
108 code
[skip
] = _len(code
) - skip
110 if flags
& SRE_FLAG_DOTALL
:
111 emit(OPCODES
[ANY_ALL
])
114 elif op
in REPEATING_CODES
:
115 if flags
& SRE_FLAG_TEMPLATE
:
116 raise error
, "internal: unsupported template operator"
117 emit(OPCODES
[REPEAT
])
118 skip
= _len(code
); emit(0)
121 _compile(code
, av
[2], flags
)
122 emit(OPCODES
[SUCCESS
])
123 code
[skip
] = _len(code
) - skip
124 elif _simple(av
) and op
is not REPEAT
:
126 emit(OPCODES
[REPEAT_ONE
])
128 emit(OPCODES
[MIN_REPEAT_ONE
])
129 skip
= _len(code
); emit(0)
132 _compile(code
, av
[2], flags
)
133 emit(OPCODES
[SUCCESS
])
134 code
[skip
] = _len(code
) - skip
136 emit(OPCODES
[REPEAT
])
137 skip
= _len(code
); emit(0)
140 _compile(code
, av
[2], flags
)
141 code
[skip
] = _len(code
) - skip
143 emit(OPCODES
[MAX_UNTIL
])
145 emit(OPCODES
[MIN_UNTIL
])
146 elif op
is SUBPATTERN
:
150 # _compile_info(code, av[1], flags)
151 _compile(code
, av
[1], flags
)
155 elif op
in SUCCESS_CODES
:
157 elif op
in ASSERT_CODES
:
159 skip
= _len(code
); emit(0)
163 lo
, hi
= av
[1].getwidth()
165 raise error
, "look-behind requires fixed-width pattern"
166 emit(lo
) # look behind
167 _compile(code
, av
[1], flags
)
168 emit(OPCODES
[SUCCESS
])
169 code
[skip
] = _len(code
) - skip
172 skip
= _len(code
); emit(0)
173 _compile(code
, av
, flags
)
174 emit(OPCODES
[SUCCESS
])
175 code
[skip
] = _len(code
) - skip
178 if flags
& SRE_FLAG_MULTILINE
:
179 av
= AT_MULTILINE
.get(av
, av
)
180 if flags
& SRE_FLAG_LOCALE
:
181 av
= AT_LOCALE
.get(av
, av
)
182 elif flags
& SRE_FLAG_UNICODE
:
183 av
= AT_UNICODE
.get(av
, av
)
188 tailappend
= tail
.append
190 skip
= _len(code
); emit(0)
191 # _compile_info(code, av, flags)
192 _compile(code
, av
, flags
)
194 tailappend(_len(code
)); emit(0)
195 code
[skip
] = _len(code
) - skip
196 emit(0) # end of branch
198 code
[tail
] = _len(code
) - tail
201 if flags
& SRE_FLAG_LOCALE
:
203 elif flags
& SRE_FLAG_UNICODE
:
207 if flags
& SRE_FLAG_IGNORECASE
:
208 emit(OPCODES
[OP_IGNORE
[op
]])
212 elif op
is GROUPREF_EXISTS
:
215 skipyes
= _len(code
); emit(0)
216 _compile(code
, av
[1], flags
)
219 skipno
= _len(code
); emit(0)
220 code
[skipyes
] = _len(code
) - skipyes
+ 1
221 _compile(code
, av
[2], flags
)
222 code
[skipno
] = _len(code
) - skipno
224 code
[skipyes
] = _len(code
) - skipyes
+ 1
226 raise ValueError, ("unsupported operand type", op
)
228 def _compile_charset(charset
, flags
, code
, fixup
=None, fixes
=None):
229 # compile charset subprogram
231 for op
, av
in _optimize_charset(charset
, fixup
, fixes
,
232 flags
& SRE_FLAG_UNICODE
):
243 elif op
is BIGCHARSET
:
246 if flags
& SRE_FLAG_LOCALE
:
247 emit(CHCODES
[CH_LOCALE
[av
]])
248 elif flags
& SRE_FLAG_UNICODE
:
249 emit(CHCODES
[CH_UNICODE
[av
]])
253 raise error
, "internal: unsupported set operator"
254 emit(OPCODES
[FAILURE
])
256 def _optimize_charset(charset
, fixup
, fixes
, isunicode
):
257 # internal: optimize character set
260 charmap
= bytearray(256)
261 for op
, av
in charset
:
268 if fixes
and i
in fixes
:
274 r
= range(av
[0], av
[1]+1)
289 tail
.append((op
, av
))
291 if len(charmap
) == 256:
292 # character set contains non-UCS1 character codes
293 charmap
+= b
'\0' * 0xff00
295 # character set contains non-BMP character codes
296 if fixup
and isunicode
and op
is RANGE
:
299 # There are only two ranges of cased astral characters:
300 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
301 _fixup_range(max(0x10000, lo
), min(0x11fff, hi
),
303 for lo
, hi
in ranges
:
305 tail
.append((LITERAL
, hi
))
307 tail
.append((RANGE
, (lo
, hi
)))
309 tail
.append((op
, av
))
312 # compress character map
316 p
= charmap
.find(b
'\1', q
)
322 q
= charmap
.find(b
'\0', p
)
324 runs
.append((p
, len(charmap
)))
331 out
.append((LITERAL
, p
))
333 out
.append((RANGE
, (p
, q
- 1)))
335 # if the case was changed or new representation is more compact
336 if fixup
or len(out
) < len(charset
):
338 # else original character set is good enough
342 if len(charmap
) == 256:
343 data
= _mk_bitmap(charmap
)
344 out
.append((CHARSET
, data
))
348 # To represent a big charset, first a bitmap of all characters in the
349 # set is constructed. Then, this bitmap is sliced into chunks of 256
350 # characters, duplicate chunks are eliminated, and each chunk is
351 # given a number. In the compiled expression, the charset is
352 # represented by a 32-bit word sequence, consisting of one word for
353 # the number of different chunks, a sequence of 256 bytes (64 words)
354 # of chunk numbers indexed by their original chunk position, and a
355 # sequence of 256-bit chunks (8 words each).
357 # Compression is normally good: in a typical charset, large ranges of
358 # Unicode will be either completely excluded (e.g. if only cyrillic
359 # letters are to be matched), or completely included (e.g. if large
360 # subranges of Kanji match). These ranges will be represented by
361 # chunks of all one-bits or all zero-bits.
363 # Matching can be also done efficiently: the more significant byte of
364 # the Unicode character is an index into the chunk number, and the
365 # less significant byte is a bit index in the chunk (just like the
368 # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
369 # of the basic multilingual plane; an efficient representation
370 # for all of Unicode has not yet been developed.
372 charmap
= bytes(charmap
) # should be hashable
374 mapping
= bytearray(256)
377 for i
in range(0, 65536, 256):
378 chunk
= charmap
[i
: i
+ 256]
380 mapping
[i
// 256] = comps
[chunk
]
382 mapping
[i
// 256] = comps
[chunk
] = block
385 data
= _mk_bitmap(data
)
386 data
[0:0] = [block
] + _bytes_to_codes(mapping
)
387 out
.append((BIGCHARSET
, data
))
391 def _fixup_range(lo
, hi
, ranges
, fixup
):
392 for i
in map(fixup
, range(lo
, hi
+1)):
393 for k
, (lo
, hi
) in enumerate(ranges
):
398 ranges
.insert(k
, (i
, i
))
407 ranges
.append((i
, i
))
409 _CODEBITS
= _sre
.CODESIZE
* 8
410 _BITS_TRANS
= b
'0' + b
'1' * 255
411 def _mk_bitmap(bits
, _CODEBITS
=_CODEBITS
, _int
=int):
412 s
= bytes(bits
).translate(_BITS_TRANS
)[::-1]
413 return [_int(s
[i
- _CODEBITS
: i
], 2)
414 for i
in range(len(s
), 0, -_CODEBITS
)]
416 def _bytes_to_codes(b
):
417 # Convert block indices to word array
419 if _sre
.CODESIZE
== 2:
423 a
= array
.array(code
, bytes(b
))
424 assert a
.itemsize
== _sre
.CODESIZE
425 assert len(a
) * a
.itemsize
== len(b
)
429 # check if av is a "simple" operator
430 lo
, hi
= av
[2].getwidth()
431 return lo
== hi
== 1 and av
[2][0][0] != SUBPATTERN
433 def _compile_info(code
, pattern
, flags
):
434 # internal: compile an info block. in the current version,
435 # this contains min/max pattern width, and an optional literal
436 # prefix or a character map
437 lo
, hi
= pattern
.getwidth()
439 return # not worth it
440 # look for a literal prefix
442 prefixappend
= prefix
.append
444 charset
= [] # not used
445 charsetappend
= charset
.append
446 if not (flags
& SRE_FLAG_IGNORECASE
):
447 # look for literal prefix
448 for op
, av
in pattern
.data
:
450 if len(prefix
) == prefix_skip
:
451 prefix_skip
= prefix_skip
+ 1
453 elif op
is SUBPATTERN
and len(av
[1]) == 1:
461 # if no prefix, look for charset prefix
462 if not prefix
and pattern
.data
:
463 op
, av
= pattern
.data
[0]
464 if op
is SUBPATTERN
and av
[1]:
467 charsetappend((op
, av
))
497 ## print "*** PREFIX", prefix, prefix_skip
499 ## print "*** CHARSET", charset
503 skip
= len(code
); emit(0)
507 mask
= SRE_INFO_PREFIX
508 if len(prefix
) == prefix_skip
== len(pattern
.data
):
509 mask
= mask
+ SRE_INFO_LITERAL
511 mask
= mask
+ SRE_INFO_CHARSET
518 prefix
= prefix
[:MAXCODE
]
525 emit(len(prefix
)) # length
526 emit(prefix_skip
) # skip
528 # generate overlap table
529 table
= [-1] + ([0]*len(prefix
))
530 for i
in xrange(len(prefix
)):
531 table
[i
+1] = table
[i
]+1
532 while table
[i
+1] > 0 and prefix
[i
] != prefix
[table
[i
+1]-1]:
533 table
[i
+1] = table
[table
[i
+1]-1]+1
534 code
.extend(table
[1:]) # don't store first entry
536 _compile_charset(charset
, flags
, code
)
537 code
[skip
] = len(code
) - skip
542 STRING_TYPES
= (type(""),)
544 STRING_TYPES
= (type(""), type(unicode("")))
547 for tp
in STRING_TYPES
:
548 if isinstance(obj
, tp
):
554 flags
= p
.pattern
.flags | flags
558 _compile_info(code
, p
, flags
)
560 # compile the pattern
561 _compile(code
, p
.data
, flags
)
563 code
.append(OPCODES
[SUCCESS
])
567 def compile(p
, flags
=0):
568 # internal: convert pattern list to internal format
572 p
= sre_parse
.parse(p
, flags
)
576 code
= _code(p
, flags
)
580 # XXX: <fl> get rid of this limitation!
581 if p
.pattern
.groups
> 100:
582 raise AssertionError(
583 "sorry, but this version only supports 100 named groups"
586 # map in either direction
587 groupindex
= p
.pattern
.groupdict
588 indexgroup
= [None] * p
.pattern
.groups
589 for k
, i
in groupindex
.items():
593 pattern
, flags | p
.pattern
.flags
, code
,
595 groupindex
, indexgroup