#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys
import sre_parse
from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

def _identityfunction(x):
    return x

_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)

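# Editor's note (illustrative trace, not part of the upstream file): for the
# parsed pattern "ab+" with no flags, the _compile loop above emits,
# symbolically,
#
#     LITERAL, 'a', REPEAT_ONE, <skip>, 1, MAXREPEAT, LITERAL, 'b', SUCCESS
#
# where <skip> is the 0 placeholder appended first and later back-patched
# with the number of code words from that slot to just past the inner
# SUCCESS, so the matching engine can jump over the repeat body.
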
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset

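# Editor's note (illustrative example, not part of the upstream file): a
# small class such as [abc] collapses into a single RANGE entry instead of
# a 256-bit CHARSET bitmap:
#
#     _optimize_charset([(LITERAL, 97), (LITERAL, 98), (LITERAL, 99)],
#                       _identityfunction)  ==  [(RANGE, (97, 99))]
#
# Only sets whose members span more than two consecutive runs fall back to
# the bitmap form.
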
def _mk_bitmap(bits):
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data

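# Editor's note (illustrative example, not part of the upstream file):
# _mk_bitmap packs a list of 0/1 flags into machine words of _sre.CODESIZE
# bytes, least significant bit first; in this module it is always fed 256
# flags at a time (16 words on a 2-byte-code build).  For instance, on such
# a build,
#
#     _mk_bitmap([1, 0, 1] + [0] * 13)  ==  [5]
#
# since bits 0 and 2 are set and sixteen bits fill exactly one word.
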
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.

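# Editor's sketch (assumed example, not part of the upstream file): for a
# class matching only A-Z (U+0041..U+005A) and the cyrillic capitals
# U+0410..U+042F, _optimize_unicode below would produce, on a 2-byte-code
# build, data roughly of the form
#
#     [3,               # three distinct 256-character chunks
#      <128 words>,     # 256 chunk numbers, one per high byte:
#                       #   high byte 0x00 -> chunk 0, 0x04 -> chunk 2,
#                       #   every other high byte -> chunk 1 (all zeros)
#      <16 words>,      # chunk 0: bits 0x41..0x5A set
#      <16 words>,      # chunk 1: all bits clear
#      <16 words>]      # chunk 2: bits 0x10..0x2F set
#
# with each chunk packed into words by _mk_bitmap.
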
def _optimize_unicode(charset, fixup):
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
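        # (Editor's note, illustrative: this builds the usual
        # Knuth-Morris-Pratt failure table.  For a literal prefix
        # equivalent to "abab" the stored entries would be [0, 0, 1, 2];
        # entry i is the length of the longest proper prefix of
        # prefix[:i+1] that is also its suffix, so the engine can shift
        # the prefix on a mismatch without rescanning.)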
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
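
# --- Editor's illustrative example (not part of the upstream module) ---
# A minimal sketch of how this compiler is normally driven: the public re
# module (via sre.py) passes a pattern string and flags to compile(), which
# returns a ready-to-use _sre pattern object.
if __name__ == "__main__":
    _p = compile("a[bc]+d", 0)
    assert _p.match("abbccd") is not None
    assert _p.match("ad") is None
    print "self-test ok"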