]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | #\r |
2 | # Secret Labs' Regular Expression Engine\r | |
3 | #\r | |
4 | # convert template to internal format\r | |
5 | #\r | |
6 | # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.\r | |
7 | #\r | |
8 | # See the sre.py file for information on usage and redistribution.\r | |
9 | #\r | |
10 | \r | |
11 | """Internal support module for sre"""\r | |
12 | \r | |
13 | import _sre, sys\r | |
14 | import sre_parse\r | |
15 | from sre_constants import *\r | |
16 | \r | |
17 | assert _sre.MAGIC == MAGIC, "SRE module mismatch"\r | |
18 | \r | |
19 | if _sre.CODESIZE == 2:\r | |
20 | MAXCODE = 65535\r | |
21 | else:\r | |
22 | MAXCODE = 0xFFFFFFFFL\r | |
23 | \r | |
24 | def _identityfunction(x):\r | |
25 | return x\r | |
26 | \r | |
# Operator groups that _compile dispatches on; sets give O(1) membership.
_LITERAL_CODES = set((LITERAL, NOT_LITERAL))
_REPEATING_CODES = set((REPEAT, MIN_REPEAT, MAX_REPEAT))
_SUCCESS_CODES = set((SUCCESS, FAILURE))
_ASSERT_CODES = set((ASSERT, ASSERT_NOT))
31 | \r | |
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Walks the (op, av) pairs produced by sre_parse and appends the
    # corresponding opcodes/operands to the *code* list.  Jump/skip
    # slots are emitted as 0 and back-patched once the length of the
    # enclosed subprogram is known.
    emit = code.append
    _len = len
    # bind module-level lookups to locals -- this loop is hot
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            # single (possibly negated) character; lowercase it up
            # front when matching case-insensitively
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            # character set: emit opcode, a skip slot, then the set
            # subprogram (case-fixed via *fixup* when ignoring case)
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE: everything below the raise is unreachable;
                # kept from the original template-mode code
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                # single-character target: use the fast REPEAT_ONE forms
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat: REPEAT ... MAX_UNTIL / MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (0 for non-capturing groups);
            # MARK opcodes bracket the group at slots 2n and 2n+1
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                # look-behind: the engine needs a fixed width to back up
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            # positional assertion (^, $, \b, ...); remap per flags
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            # alternation: each arm gets a skip slot and a trailing
            # JUMP to the common tail; all jumps are back-patched below
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            # character class (\w, \d, ...); remap per locale/unicode
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            # back-reference; group numbers are 0-based in the bytecode
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # conditional (?(group)yes|no); av = (group, yes, no-or-None)
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)
177 | \r | |
178 | def _compile_charset(charset, flags, code, fixup=None):\r | |
179 | # compile charset subprogram\r | |
180 | emit = code.append\r | |
181 | if fixup is None:\r | |
182 | fixup = _identityfunction\r | |
183 | for op, av in _optimize_charset(charset, fixup):\r | |
184 | emit(OPCODES[op])\r | |
185 | if op is NEGATE:\r | |
186 | pass\r | |
187 | elif op is LITERAL:\r | |
188 | emit(fixup(av))\r | |
189 | elif op is RANGE:\r | |
190 | emit(fixup(av[0]))\r | |
191 | emit(fixup(av[1]))\r | |
192 | elif op is CHARSET:\r | |
193 | code.extend(av)\r | |
194 | elif op is BIGCHARSET:\r | |
195 | code.extend(av)\r | |
196 | elif op is CATEGORY:\r | |
197 | if flags & SRE_FLAG_LOCALE:\r | |
198 | emit(CHCODES[CH_LOCALE[av]])\r | |
199 | elif flags & SRE_FLAG_UNICODE:\r | |
200 | emit(CHCODES[CH_UNICODE[av]])\r | |
201 | else:\r | |
202 | emit(CHCODES[av])\r | |
203 | else:\r | |
204 | raise error, "internal: unsupported set operator"\r | |
205 | emit(OPCODES[FAILURE])\r | |
206 | \r | |
def _optimize_charset(charset, fixup):
    # internal: optimize character set
    #
    # Builds a 256-entry bitmap of the set; if a member doesn't fit
    # (IndexError), falls back to the unicode optimizer.  The bitmap is
    # then compressed into at most two literal/range items, or a CHARSET
    # bitmap, whichever is smaller; returns the original *charset*
    # unchanged when no compressed form is shorter.
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map: collect runs of consecutive set bits
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i  # start of a new run
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        # flush the final run
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset
257 | \r | |
258 | def _mk_bitmap(bits):\r | |
259 | data = []\r | |
260 | dataappend = data.append\r | |
261 | if _sre.CODESIZE == 2:\r | |
262 | start = (1, 0)\r | |
263 | else:\r | |
264 | start = (1L, 0L)\r | |
265 | m, v = start\r | |
266 | for c in bits:\r | |
267 | if c:\r | |
268 | v = v + m\r | |
269 | m = m + m\r | |
270 | if m > MAXCODE:\r | |
271 | dataappend(v)\r | |
272 | m, v = start\r | |
273 | return data\r | |
274 | \r | |
275 | # To represent a big charset, first a bitmap of all characters in the\r | |
276 | # set is constructed. Then, this bitmap is sliced into chunks of 256\r | |
277 | # characters, duplicate chunks are eliminated, and each chunk is\r | |
278 | # given a number. In the compiled expression, the charset is\r | |
279 | # represented by a 16-bit word sequence, consisting of one word for\r | |
280 | # the number of different chunks, a sequence of 256 bytes (128 words)\r | |
281 | # of chunk numbers indexed by their original chunk position, and a\r | |
282 | # sequence of chunks (16 words each).\r | |
283 | \r | |
284 | # Compression is normally good: in a typical charset, large ranges of\r | |
285 | # Unicode will be either completely excluded (e.g. if only cyrillic\r | |
286 | # letters are to be matched), or completely included (e.g. if large\r | |
287 | # subranges of Kanji match). These ranges will be represented by\r | |
288 | # chunks of all one-bits or all zero-bits.\r | |
289 | \r | |
290 | # Matching can be also done efficiently: the more significant byte of\r | |
291 | # the Unicode character is an index into the chunk number, and the\r | |
292 | # less significant byte is a bit index in the chunk (just like the\r | |
293 | # CHARSET matching).\r | |
294 | \r | |
295 | # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets\r | |
296 | # of the basic multilingual plane; an efficient representation\r | |
297 | # for all of UTF-16 has not yet been developed. This means,\r | |
298 | # in particular, that negated charsets cannot be represented as\r | |
299 | # bigcharsets.\r | |
300 | \r | |
def _optimize_unicode(charset, fixup):
    # internal: compress a BMP-wide character set into a BIGCHARSET.
    #
    # Builds a 65536-entry bitmap, slices it into 256 chunks of 256
    # characters, deduplicates the chunks, and returns a single
    # (BIGCHARSET, data) item where *data* is the chunk count, the
    # 256-byte chunk index (as code words), then the unique chunk
    # bitmaps.  Falls back to the original *charset* whenever the set
    # cannot be represented this way.
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                # remember and apply after the map is complete
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}        # chunk contents -> chunk number (dedup table)
    mapping = [0]*256 # original chunk position -> chunk number
    block = 0         # next unused chunk number
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first time we see this chunk: assign it and emit its bits
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]
353 | \r | |
354 | def _simple(av):\r | |
355 | # check if av is a "simple" operator\r | |
356 | lo, hi = av[2].getwidth()\r | |
357 | if lo == 0 and hi == MAXREPEAT:\r | |
358 | raise error, "nothing to repeat"\r | |
359 | return lo == hi == 1 and av[2][0][0] != SUBPATTERN\r | |
360 | \r | |
def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0   # length of the prefix that can be skipped on match
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # single-item group: look through it, but stop counting
                # skippable characters (the group must be marked)
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # all-literal alternation inside a group
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                # top-level all-literal alternation
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            # the whole pattern is one literal string
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)  # 0 means "unbounded / doesn't fit"
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (KMP-style failure function)
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
466 | \r | |
# String types this interpreter supports: 8-bit strings always, plus
# unicode strings when the build provides them.
try:
    STRING_TYPES = (str, unicode)
except NameError:
    STRING_TYPES = (str,)
473 | \r | |
def isstring(obj):
    """Return 1 if *obj* is a supported string type, else 0."""
    if isinstance(obj, STRING_TYPES):
        return 1
    return 0
479 | \r | |
def _code(p, flags):
    """Compile parsed pattern *p* into a flat list of SRE code words.

    *flags* is combined with the flags recorded on the pattern itself.
    """
    flags = p.pattern.flags | flags

    code = []
    _compile_info(code, p, flags)   # optional INFO header block
    _compile(code, p.data, flags)   # the pattern body
    code.append(OPCODES[SUCCESS])   # terminate the program
    return code
494 | \r | |
def compile(p, flags=0):
    # internal: convert pattern list to internal format
    #
    # *p* may be a pattern string (parsed here) or an already-parsed
    # sre_parse pattern object.
    if isstring(p):
        source = p
        p = sre_parse.parse(p, flags)
    else:
        source = None

    code = _code(p, flags)

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # build the maps between group names and group indices
    name_to_index = p.pattern.groupdict
    index_to_name = [None] * p.pattern.groups
    for name, index in name_to_index.items():
        index_to_name[index] = name

    return _sre.compile(
        source, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        name_to_index, index_to_name
        )