]>
Commit | Line | Data |
---|---|---|
3257aa99 DM |
1 | # -*- coding: utf-8 -*-\r |
2 | #\r | |
3 | # Secret Labs' Regular Expression Engine\r | |
4 | #\r | |
5 | # convert template to internal format\r | |
6 | #\r | |
7 | # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.\r | |
8 | #\r | |
9 | # See the sre.py file for information on usage and redistribution.\r | |
10 | #\r | |
11 | \r | |
12 | """Internal support module for sre"""\r | |
13 | \r | |
14 | import _sre, sys\r | |
15 | import sre_parse\r | |
16 | from sre_constants import *\r | |
17 | \r | |
18 | assert _sre.MAGIC == MAGIC, "SRE module mismatch"\r | |
19 | \r | |
20 | if _sre.CODESIZE == 2:\r | |
21 | MAXCODE = 65535\r | |
22 | else:\r | |
23 | MAXCODE = 0xFFFFFFFFL\r | |
24 | \r | |
25 | _LITERAL_CODES = set([LITERAL, NOT_LITERAL])\r | |
26 | _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])\r | |
27 | _SUCCESS_CODES = set([SUCCESS, FAILURE])\r | |
28 | _ASSERT_CODES = set([ASSERT, ASSERT_NOT])\r | |
29 | \r | |
30 | # Sets of lowercase characters which have the same uppercase.\r | |
31 | _equivalences = (\r | |
32 | # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I\r | |
33 | (0x69, 0x131), # iı\r | |
34 | # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S\r | |
35 | (0x73, 0x17f), # sſ\r | |
36 | # MICRO SIGN, GREEK SMALL LETTER MU\r | |
37 | (0xb5, 0x3bc), # µμ\r | |
38 | # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI\r | |
39 | (0x345, 0x3b9, 0x1fbe), # \u0345ιι\r | |
40 | # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL\r | |
41 | (0x3b2, 0x3d0), # βϐ\r | |
42 | # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL\r | |
43 | (0x3b5, 0x3f5), # εϵ\r | |
44 | # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL\r | |
45 | (0x3b8, 0x3d1), # θϑ\r | |
46 | # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL\r | |
47 | (0x3ba, 0x3f0), # κϰ\r | |
48 | # GREEK SMALL LETTER PI, GREEK PI SYMBOL\r | |
49 | (0x3c0, 0x3d6), # πϖ\r | |
50 | # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL\r | |
51 | (0x3c1, 0x3f1), # ρϱ\r | |
52 | # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA\r | |
53 | (0x3c2, 0x3c3), # ςσ\r | |
54 | # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL\r | |
55 | (0x3c6, 0x3d5), # φϕ\r | |
56 | # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE\r | |
57 | (0x1e61, 0x1e9b), # ṡẛ\r | |
58 | )\r | |
59 | \r | |
60 | # Maps the lowercase code to lowercase codes which have the same uppercase.\r | |
61 | _ignorecase_fixes = {i: tuple(j for j in t if i != j)\r | |
62 | for t in _equivalences for i in t}\r | |
63 | \r | |
64 | def _compile(code, pattern, flags):\r | |
65 | # internal: compile a (sub)pattern\r | |
66 | emit = code.append\r | |
67 | _len = len\r | |
68 | LITERAL_CODES = _LITERAL_CODES\r | |
69 | REPEATING_CODES = _REPEATING_CODES\r | |
70 | SUCCESS_CODES = _SUCCESS_CODES\r | |
71 | ASSERT_CODES = _ASSERT_CODES\r | |
72 | if (flags & SRE_FLAG_IGNORECASE and\r | |
73 | not (flags & SRE_FLAG_LOCALE) and\r | |
74 | flags & SRE_FLAG_UNICODE):\r | |
75 | fixes = _ignorecase_fixes\r | |
76 | else:\r | |
77 | fixes = None\r | |
78 | for op, av in pattern:\r | |
79 | if op in LITERAL_CODES:\r | |
80 | if flags & SRE_FLAG_IGNORECASE:\r | |
81 | lo = _sre.getlower(av, flags)\r | |
82 | if fixes and lo in fixes:\r | |
83 | emit(OPCODES[IN_IGNORE])\r | |
84 | skip = _len(code); emit(0)\r | |
85 | if op is NOT_LITERAL:\r | |
86 | emit(OPCODES[NEGATE])\r | |
87 | for k in (lo,) + fixes[lo]:\r | |
88 | emit(OPCODES[LITERAL])\r | |
89 | emit(k)\r | |
90 | emit(OPCODES[FAILURE])\r | |
91 | code[skip] = _len(code) - skip\r | |
92 | else:\r | |
93 | emit(OPCODES[OP_IGNORE[op]])\r | |
94 | emit(lo)\r | |
95 | else:\r | |
96 | emit(OPCODES[op])\r | |
97 | emit(av)\r | |
98 | elif op is IN:\r | |
99 | if flags & SRE_FLAG_IGNORECASE:\r | |
100 | emit(OPCODES[OP_IGNORE[op]])\r | |
101 | def fixup(literal, flags=flags):\r | |
102 | return _sre.getlower(literal, flags)\r | |
103 | else:\r | |
104 | emit(OPCODES[op])\r | |
105 | fixup = None\r | |
106 | skip = _len(code); emit(0)\r | |
107 | _compile_charset(av, flags, code, fixup, fixes)\r | |
108 | code[skip] = _len(code) - skip\r | |
109 | elif op is ANY:\r | |
110 | if flags & SRE_FLAG_DOTALL:\r | |
111 | emit(OPCODES[ANY_ALL])\r | |
112 | else:\r | |
113 | emit(OPCODES[ANY])\r | |
114 | elif op in REPEATING_CODES:\r | |
115 | if flags & SRE_FLAG_TEMPLATE:\r | |
116 | raise error, "internal: unsupported template operator"\r | |
117 | emit(OPCODES[REPEAT])\r | |
118 | skip = _len(code); emit(0)\r | |
119 | emit(av[0])\r | |
120 | emit(av[1])\r | |
121 | _compile(code, av[2], flags)\r | |
122 | emit(OPCODES[SUCCESS])\r | |
123 | code[skip] = _len(code) - skip\r | |
124 | elif _simple(av) and op is not REPEAT:\r | |
125 | if op is MAX_REPEAT:\r | |
126 | emit(OPCODES[REPEAT_ONE])\r | |
127 | else:\r | |
128 | emit(OPCODES[MIN_REPEAT_ONE])\r | |
129 | skip = _len(code); emit(0)\r | |
130 | emit(av[0])\r | |
131 | emit(av[1])\r | |
132 | _compile(code, av[2], flags)\r | |
133 | emit(OPCODES[SUCCESS])\r | |
134 | code[skip] = _len(code) - skip\r | |
135 | else:\r | |
136 | emit(OPCODES[REPEAT])\r | |
137 | skip = _len(code); emit(0)\r | |
138 | emit(av[0])\r | |
139 | emit(av[1])\r | |
140 | _compile(code, av[2], flags)\r | |
141 | code[skip] = _len(code) - skip\r | |
142 | if op is MAX_REPEAT:\r | |
143 | emit(OPCODES[MAX_UNTIL])\r | |
144 | else:\r | |
145 | emit(OPCODES[MIN_UNTIL])\r | |
146 | elif op is SUBPATTERN:\r | |
147 | if av[0]:\r | |
148 | emit(OPCODES[MARK])\r | |
149 | emit((av[0]-1)*2)\r | |
150 | # _compile_info(code, av[1], flags)\r | |
151 | _compile(code, av[1], flags)\r | |
152 | if av[0]:\r | |
153 | emit(OPCODES[MARK])\r | |
154 | emit((av[0]-1)*2+1)\r | |
155 | elif op in SUCCESS_CODES:\r | |
156 | emit(OPCODES[op])\r | |
157 | elif op in ASSERT_CODES:\r | |
158 | emit(OPCODES[op])\r | |
159 | skip = _len(code); emit(0)\r | |
160 | if av[0] >= 0:\r | |
161 | emit(0) # look ahead\r | |
162 | else:\r | |
163 | lo, hi = av[1].getwidth()\r | |
164 | if lo != hi:\r | |
165 | raise error, "look-behind requires fixed-width pattern"\r | |
166 | emit(lo) # look behind\r | |
167 | _compile(code, av[1], flags)\r | |
168 | emit(OPCODES[SUCCESS])\r | |
169 | code[skip] = _len(code) - skip\r | |
170 | elif op is CALL:\r | |
171 | emit(OPCODES[op])\r | |
172 | skip = _len(code); emit(0)\r | |
173 | _compile(code, av, flags)\r | |
174 | emit(OPCODES[SUCCESS])\r | |
175 | code[skip] = _len(code) - skip\r | |
176 | elif op is AT:\r | |
177 | emit(OPCODES[op])\r | |
178 | if flags & SRE_FLAG_MULTILINE:\r | |
179 | av = AT_MULTILINE.get(av, av)\r | |
180 | if flags & SRE_FLAG_LOCALE:\r | |
181 | av = AT_LOCALE.get(av, av)\r | |
182 | elif flags & SRE_FLAG_UNICODE:\r | |
183 | av = AT_UNICODE.get(av, av)\r | |
184 | emit(ATCODES[av])\r | |
185 | elif op is BRANCH:\r | |
186 | emit(OPCODES[op])\r | |
187 | tail = []\r | |
188 | tailappend = tail.append\r | |
189 | for av in av[1]:\r | |
190 | skip = _len(code); emit(0)\r | |
191 | # _compile_info(code, av, flags)\r | |
192 | _compile(code, av, flags)\r | |
193 | emit(OPCODES[JUMP])\r | |
194 | tailappend(_len(code)); emit(0)\r | |
195 | code[skip] = _len(code) - skip\r | |
196 | emit(0) # end of branch\r | |
197 | for tail in tail:\r | |
198 | code[tail] = _len(code) - tail\r | |
199 | elif op is CATEGORY:\r | |
200 | emit(OPCODES[op])\r | |
201 | if flags & SRE_FLAG_LOCALE:\r | |
202 | av = CH_LOCALE[av]\r | |
203 | elif flags & SRE_FLAG_UNICODE:\r | |
204 | av = CH_UNICODE[av]\r | |
205 | emit(CHCODES[av])\r | |
206 | elif op is GROUPREF:\r | |
207 | if flags & SRE_FLAG_IGNORECASE:\r | |
208 | emit(OPCODES[OP_IGNORE[op]])\r | |
209 | else:\r | |
210 | emit(OPCODES[op])\r | |
211 | emit(av-1)\r | |
212 | elif op is GROUPREF_EXISTS:\r | |
213 | emit(OPCODES[op])\r | |
214 | emit(av[0]-1)\r | |
215 | skipyes = _len(code); emit(0)\r | |
216 | _compile(code, av[1], flags)\r | |
217 | if av[2]:\r | |
218 | emit(OPCODES[JUMP])\r | |
219 | skipno = _len(code); emit(0)\r | |
220 | code[skipyes] = _len(code) - skipyes + 1\r | |
221 | _compile(code, av[2], flags)\r | |
222 | code[skipno] = _len(code) - skipno\r | |
223 | else:\r | |
224 | code[skipyes] = _len(code) - skipyes + 1\r | |
225 | else:\r | |
226 | raise ValueError, ("unsupported operand type", op)\r | |
227 | \r | |
228 | def _compile_charset(charset, flags, code, fixup=None, fixes=None):\r | |
229 | # compile charset subprogram\r | |
230 | emit = code.append\r | |
231 | for op, av in _optimize_charset(charset, fixup, fixes,\r | |
232 | flags & SRE_FLAG_UNICODE):\r | |
233 | emit(OPCODES[op])\r | |
234 | if op is NEGATE:\r | |
235 | pass\r | |
236 | elif op is LITERAL:\r | |
237 | emit(av)\r | |
238 | elif op is RANGE:\r | |
239 | emit(av[0])\r | |
240 | emit(av[1])\r | |
241 | elif op is CHARSET:\r | |
242 | code.extend(av)\r | |
243 | elif op is BIGCHARSET:\r | |
244 | code.extend(av)\r | |
245 | elif op is CATEGORY:\r | |
246 | if flags & SRE_FLAG_LOCALE:\r | |
247 | emit(CHCODES[CH_LOCALE[av]])\r | |
248 | elif flags & SRE_FLAG_UNICODE:\r | |
249 | emit(CHCODES[CH_UNICODE[av]])\r | |
250 | else:\r | |
251 | emit(CHCODES[av])\r | |
252 | else:\r | |
253 | raise error, "internal: unsupported set operator"\r | |
254 | emit(OPCODES[FAILURE])\r | |
255 | \r | |
256 | def _optimize_charset(charset, fixup, fixes, isunicode):\r | |
257 | # internal: optimize character set\r | |
258 | out = []\r | |
259 | tail = []\r | |
260 | charmap = bytearray(256)\r | |
261 | for op, av in charset:\r | |
262 | while True:\r | |
263 | try:\r | |
264 | if op is LITERAL:\r | |
265 | if fixup:\r | |
266 | i = fixup(av)\r | |
267 | charmap[i] = 1\r | |
268 | if fixes and i in fixes:\r | |
269 | for k in fixes[i]:\r | |
270 | charmap[k] = 1\r | |
271 | else:\r | |
272 | charmap[av] = 1\r | |
273 | elif op is RANGE:\r | |
274 | r = range(av[0], av[1]+1)\r | |
275 | if fixup:\r | |
276 | r = map(fixup, r)\r | |
277 | if fixup and fixes:\r | |
278 | for i in r:\r | |
279 | charmap[i] = 1\r | |
280 | if i in fixes:\r | |
281 | for k in fixes[i]:\r | |
282 | charmap[k] = 1\r | |
283 | else:\r | |
284 | for i in r:\r | |
285 | charmap[i] = 1\r | |
286 | elif op is NEGATE:\r | |
287 | out.append((op, av))\r | |
288 | else:\r | |
289 | tail.append((op, av))\r | |
290 | except IndexError:\r | |
291 | if len(charmap) == 256:\r | |
292 | # character set contains non-UCS1 character codes\r | |
293 | charmap += b'\0' * 0xff00\r | |
294 | continue\r | |
295 | # character set contains non-BMP character codes\r | |
296 | if fixup and isunicode and op is RANGE:\r | |
297 | lo, hi = av\r | |
298 | ranges = [av]\r | |
299 | # There are only two ranges of cased astral characters:\r | |
300 | # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).\r | |
301 | _fixup_range(max(0x10000, lo), min(0x11fff, hi),\r | |
302 | ranges, fixup)\r | |
303 | for lo, hi in ranges:\r | |
304 | if lo == hi:\r | |
305 | tail.append((LITERAL, hi))\r | |
306 | else:\r | |
307 | tail.append((RANGE, (lo, hi)))\r | |
308 | else:\r | |
309 | tail.append((op, av))\r | |
310 | break\r | |
311 | \r | |
312 | # compress character map\r | |
313 | runs = []\r | |
314 | q = 0\r | |
315 | while True:\r | |
316 | p = charmap.find(b'\1', q)\r | |
317 | if p < 0:\r | |
318 | break\r | |
319 | if len(runs) >= 2:\r | |
320 | runs = None\r | |
321 | break\r | |
322 | q = charmap.find(b'\0', p)\r | |
323 | if q < 0:\r | |
324 | runs.append((p, len(charmap)))\r | |
325 | break\r | |
326 | runs.append((p, q))\r | |
327 | if runs is not None:\r | |
328 | # use literal/range\r | |
329 | for p, q in runs:\r | |
330 | if q - p == 1:\r | |
331 | out.append((LITERAL, p))\r | |
332 | else:\r | |
333 | out.append((RANGE, (p, q - 1)))\r | |
334 | out += tail\r | |
335 | # if the case was changed or new representation is more compact\r | |
336 | if fixup or len(out) < len(charset):\r | |
337 | return out\r | |
338 | # else original character set is good enough\r | |
339 | return charset\r | |
340 | \r | |
341 | # use bitmap\r | |
342 | if len(charmap) == 256:\r | |
343 | data = _mk_bitmap(charmap)\r | |
344 | out.append((CHARSET, data))\r | |
345 | out += tail\r | |
346 | return out\r | |
347 | \r | |
348 | # To represent a big charset, first a bitmap of all characters in the\r | |
349 | # set is constructed. Then, this bitmap is sliced into chunks of 256\r | |
350 | # characters, duplicate chunks are eliminated, and each chunk is\r | |
351 | # given a number. In the compiled expression, the charset is\r | |
352 | # represented by a 32-bit word sequence, consisting of one word for\r | |
353 | # the number of different chunks, a sequence of 256 bytes (64 words)\r | |
354 | # of chunk numbers indexed by their original chunk position, and a\r | |
355 | # sequence of 256-bit chunks (8 words each).\r | |
356 | \r | |
357 | # Compression is normally good: in a typical charset, large ranges of\r | |
358 | # Unicode will be either completely excluded (e.g. if only cyrillic\r | |
359 | # letters are to be matched), or completely included (e.g. if large\r | |
360 | # subranges of Kanji match). These ranges will be represented by\r | |
361 | # chunks of all one-bits or all zero-bits.\r | |
362 | \r | |
363 | # Matching can be also done efficiently: the more significant byte of\r | |
364 | # the Unicode character is an index into the chunk number, and the\r | |
365 | # less significant byte is a bit index in the chunk (just like the\r | |
366 | # CHARSET matching).\r | |
367 | \r | |
368 | # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets\r | |
369 | # of the basic multilingual plane; an efficient representation\r | |
370 | # for all of Unicode has not yet been developed.\r | |
371 | \r | |
372 | charmap = bytes(charmap) # should be hashable\r | |
373 | comps = {}\r | |
374 | mapping = bytearray(256)\r | |
375 | block = 0\r | |
376 | data = bytearray()\r | |
377 | for i in range(0, 65536, 256):\r | |
378 | chunk = charmap[i: i + 256]\r | |
379 | if chunk in comps:\r | |
380 | mapping[i // 256] = comps[chunk]\r | |
381 | else:\r | |
382 | mapping[i // 256] = comps[chunk] = block\r | |
383 | block += 1\r | |
384 | data += chunk\r | |
385 | data = _mk_bitmap(data)\r | |
386 | data[0:0] = [block] + _bytes_to_codes(mapping)\r | |
387 | out.append((BIGCHARSET, data))\r | |
388 | out += tail\r | |
389 | return out\r | |
390 | \r | |
391 | def _fixup_range(lo, hi, ranges, fixup):\r | |
392 | for i in map(fixup, range(lo, hi+1)):\r | |
393 | for k, (lo, hi) in enumerate(ranges):\r | |
394 | if i < lo:\r | |
395 | if l == lo - 1:\r | |
396 | ranges[k] = (i, hi)\r | |
397 | else:\r | |
398 | ranges.insert(k, (i, i))\r | |
399 | break\r | |
400 | elif i > hi:\r | |
401 | if i == hi + 1:\r | |
402 | ranges[k] = (lo, i)\r | |
403 | break\r | |
404 | else:\r | |
405 | break\r | |
406 | else:\r | |
407 | ranges.append((i, i))\r | |
408 | \r | |
409 | _CODEBITS = _sre.CODESIZE * 8\r | |
410 | _BITS_TRANS = b'0' + b'1' * 255\r | |
411 | def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):\r | |
412 | s = bytes(bits).translate(_BITS_TRANS)[::-1]\r | |
413 | return [_int(s[i - _CODEBITS: i], 2)\r | |
414 | for i in range(len(s), 0, -_CODEBITS)]\r | |
415 | \r | |
416 | def _bytes_to_codes(b):\r | |
417 | # Convert block indices to word array\r | |
418 | import array\r | |
419 | if _sre.CODESIZE == 2:\r | |
420 | code = 'H'\r | |
421 | else:\r | |
422 | code = 'I'\r | |
423 | a = array.array(code, bytes(b))\r | |
424 | assert a.itemsize == _sre.CODESIZE\r | |
425 | assert len(a) * a.itemsize == len(b)\r | |
426 | return a.tolist()\r | |
427 | \r | |
428 | def _simple(av):\r | |
429 | # check if av is a "simple" operator\r | |
430 | lo, hi = av[2].getwidth()\r | |
431 | return lo == hi == 1 and av[2][0][0] != SUBPATTERN\r | |
432 | \r | |
433 | def _compile_info(code, pattern, flags):\r | |
434 | # internal: compile an info block. in the current version,\r | |
435 | # this contains min/max pattern width, and an optional literal\r | |
436 | # prefix or a character map\r | |
437 | lo, hi = pattern.getwidth()\r | |
438 | if lo == 0:\r | |
439 | return # not worth it\r | |
440 | # look for a literal prefix\r | |
441 | prefix = []\r | |
442 | prefixappend = prefix.append\r | |
443 | prefix_skip = 0\r | |
444 | charset = [] # not used\r | |
445 | charsetappend = charset.append\r | |
446 | if not (flags & SRE_FLAG_IGNORECASE):\r | |
447 | # look for literal prefix\r | |
448 | for op, av in pattern.data:\r | |
449 | if op is LITERAL:\r | |
450 | if len(prefix) == prefix_skip:\r | |
451 | prefix_skip = prefix_skip + 1\r | |
452 | prefixappend(av)\r | |
453 | elif op is SUBPATTERN and len(av[1]) == 1:\r | |
454 | op, av = av[1][0]\r | |
455 | if op is LITERAL:\r | |
456 | prefixappend(av)\r | |
457 | else:\r | |
458 | break\r | |
459 | else:\r | |
460 | break\r | |
461 | # if no prefix, look for charset prefix\r | |
462 | if not prefix and pattern.data:\r | |
463 | op, av = pattern.data[0]\r | |
464 | if op is SUBPATTERN and av[1]:\r | |
465 | op, av = av[1][0]\r | |
466 | if op is LITERAL:\r | |
467 | charsetappend((op, av))\r | |
468 | elif op is BRANCH:\r | |
469 | c = []\r | |
470 | cappend = c.append\r | |
471 | for p in av[1]:\r | |
472 | if not p:\r | |
473 | break\r | |
474 | op, av = p[0]\r | |
475 | if op is LITERAL:\r | |
476 | cappend((op, av))\r | |
477 | else:\r | |
478 | break\r | |
479 | else:\r | |
480 | charset = c\r | |
481 | elif op is BRANCH:\r | |
482 | c = []\r | |
483 | cappend = c.append\r | |
484 | for p in av[1]:\r | |
485 | if not p:\r | |
486 | break\r | |
487 | op, av = p[0]\r | |
488 | if op is LITERAL:\r | |
489 | cappend((op, av))\r | |
490 | else:\r | |
491 | break\r | |
492 | else:\r | |
493 | charset = c\r | |
494 | elif op is IN:\r | |
495 | charset = av\r | |
496 | ## if prefix:\r | |
497 | ## print "*** PREFIX", prefix, prefix_skip\r | |
498 | ## if charset:\r | |
499 | ## print "*** CHARSET", charset\r | |
500 | # add an info block\r | |
501 | emit = code.append\r | |
502 | emit(OPCODES[INFO])\r | |
503 | skip = len(code); emit(0)\r | |
504 | # literal flag\r | |
505 | mask = 0\r | |
506 | if prefix:\r | |
507 | mask = SRE_INFO_PREFIX\r | |
508 | if len(prefix) == prefix_skip == len(pattern.data):\r | |
509 | mask = mask + SRE_INFO_LITERAL\r | |
510 | elif charset:\r | |
511 | mask = mask + SRE_INFO_CHARSET\r | |
512 | emit(mask)\r | |
513 | # pattern length\r | |
514 | if lo < MAXCODE:\r | |
515 | emit(lo)\r | |
516 | else:\r | |
517 | emit(MAXCODE)\r | |
518 | prefix = prefix[:MAXCODE]\r | |
519 | if hi < MAXCODE:\r | |
520 | emit(hi)\r | |
521 | else:\r | |
522 | emit(0)\r | |
523 | # add literal prefix\r | |
524 | if prefix:\r | |
525 | emit(len(prefix)) # length\r | |
526 | emit(prefix_skip) # skip\r | |
527 | code.extend(prefix)\r | |
528 | # generate overlap table\r | |
529 | table = [-1] + ([0]*len(prefix))\r | |
530 | for i in xrange(len(prefix)):\r | |
531 | table[i+1] = table[i]+1\r | |
532 | while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:\r | |
533 | table[i+1] = table[table[i+1]-1]+1\r | |
534 | code.extend(table[1:]) # don't store first entry\r | |
535 | elif charset:\r | |
536 | _compile_charset(charset, flags, code)\r | |
537 | code[skip] = len(code) - skip\r | |
538 | \r | |
539 | try:\r | |
540 | unicode\r | |
541 | except NameError:\r | |
542 | STRING_TYPES = (type(""),)\r | |
543 | else:\r | |
544 | STRING_TYPES = (type(""), type(unicode("")))\r | |
545 | \r | |
546 | def isstring(obj):\r | |
547 | for tp in STRING_TYPES:\r | |
548 | if isinstance(obj, tp):\r | |
549 | return 1\r | |
550 | return 0\r | |
551 | \r | |
552 | def _code(p, flags):\r | |
553 | \r | |
554 | flags = p.pattern.flags | flags\r | |
555 | code = []\r | |
556 | \r | |
557 | # compile info block\r | |
558 | _compile_info(code, p, flags)\r | |
559 | \r | |
560 | # compile the pattern\r | |
561 | _compile(code, p.data, flags)\r | |
562 | \r | |
563 | code.append(OPCODES[SUCCESS])\r | |
564 | \r | |
565 | return code\r | |
566 | \r | |
567 | def compile(p, flags=0):\r | |
568 | # internal: convert pattern list to internal format\r | |
569 | \r | |
570 | if isstring(p):\r | |
571 | pattern = p\r | |
572 | p = sre_parse.parse(p, flags)\r | |
573 | else:\r | |
574 | pattern = None\r | |
575 | \r | |
576 | code = _code(p, flags)\r | |
577 | \r | |
578 | # print code\r | |
579 | \r | |
580 | # XXX: <fl> get rid of this limitation!\r | |
581 | if p.pattern.groups > 100:\r | |
582 | raise AssertionError(\r | |
583 | "sorry, but this version only supports 100 named groups"\r | |
584 | )\r | |
585 | \r | |
586 | # map in either direction\r | |
587 | groupindex = p.pattern.groupdict\r | |
588 | indexgroup = [None] * p.pattern.groups\r | |
589 | for k, i in groupindex.items():\r | |
590 | indexgroup[i] = k\r | |
591 | \r | |
592 | return _sre.compile(\r | |
593 | pattern, flags | p.pattern.flags, code,\r | |
594 | p.pattern.groups-1,\r | |
595 | groupindex, indexgroup\r | |
596 | )\r |