--- /dev/null
+# -*- coding: utf-8 -*-\r
+#\r
+# Secret Labs' Regular Expression Engine\r
+#\r
+# convert template to internal format\r
+#\r
+# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.\r
+#\r
+# See the sre.py file for information on usage and redistribution.\r
+#\r
+\r
+"""Internal support module for sre"""\r
+\r
+import _sre, sys\r
+import sre_parse\r
+from sre_constants import *\r
+\r
+assert _sre.MAGIC == MAGIC, "SRE module mismatch"\r
+\r
+if _sre.CODESIZE == 2:\r
+ MAXCODE = 65535\r
+else:\r
+ MAXCODE = 0xFFFFFFFFL\r
+\r
+_LITERAL_CODES = set([LITERAL, NOT_LITERAL])\r
+_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])\r
+_SUCCESS_CODES = set([SUCCESS, FAILURE])\r
+_ASSERT_CODES = set([ASSERT, ASSERT_NOT])\r
+\r
+# Sets of lowercase characters which have the same uppercase.\r
+_equivalences = (\r
+ # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I\r
+ (0x69, 0x131), # iı\r
+ # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S\r
+ (0x73, 0x17f), # sſ\r
+ # MICRO SIGN, GREEK SMALL LETTER MU\r
+ (0xb5, 0x3bc), # µμ\r
+ # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI\r
+ (0x345, 0x3b9, 0x1fbe), # \u0345ιι\r
+ # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL\r
+ (0x3b2, 0x3d0), # βϐ\r
+ # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL\r
+ (0x3b5, 0x3f5), # εϵ\r
+ # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL\r
+ (0x3b8, 0x3d1), # θϑ\r
+ # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL\r
+ (0x3ba, 0x3f0), # κϰ\r
+ # GREEK SMALL LETTER PI, GREEK PI SYMBOL\r
+ (0x3c0, 0x3d6), # πϖ\r
+ # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL\r
+ (0x3c1, 0x3f1), # ρϱ\r
+ # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA\r
+ (0x3c2, 0x3c3), # ςσ\r
+ # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL\r
+ (0x3c6, 0x3d5), # φϕ\r
+ # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE\r
+ (0x1e61, 0x1e9b), # ṡẛ\r
+)\r
+\r
+# Maps the lowercase code to lowercase codes which have the same uppercase.\r
+_ignorecase_fixes = {i: tuple(j for j in t if i != j)\r
+ for t in _equivalences for i in t}\r
+\r
+def _compile(code, pattern, flags):\r
+ # internal: compile a (sub)pattern\r
+ emit = code.append\r
+ _len = len\r
+ LITERAL_CODES = _LITERAL_CODES\r
+ REPEATING_CODES = _REPEATING_CODES\r
+ SUCCESS_CODES = _SUCCESS_CODES\r
+ ASSERT_CODES = _ASSERT_CODES\r
+ if (flags & SRE_FLAG_IGNORECASE and\r
+ not (flags & SRE_FLAG_LOCALE) and\r
+ flags & SRE_FLAG_UNICODE):\r
+ fixes = _ignorecase_fixes\r
+ else:\r
+ fixes = None\r
+ for op, av in pattern:\r
+ if op in LITERAL_CODES:\r
+ if flags & SRE_FLAG_IGNORECASE:\r
+ lo = _sre.getlower(av, flags)\r
+ if fixes and lo in fixes:\r
+ emit(OPCODES[IN_IGNORE])\r
+ skip = _len(code); emit(0)\r
+ if op is NOT_LITERAL:\r
+ emit(OPCODES[NEGATE])\r
+ for k in (lo,) + fixes[lo]:\r
+ emit(OPCODES[LITERAL])\r
+ emit(k)\r
+ emit(OPCODES[FAILURE])\r
+ code[skip] = _len(code) - skip\r
+ else:\r
+ emit(OPCODES[OP_IGNORE[op]])\r
+ emit(lo)\r
+ else:\r
+ emit(OPCODES[op])\r
+ emit(av)\r
+ elif op is IN:\r
+ if flags & SRE_FLAG_IGNORECASE:\r
+ emit(OPCODES[OP_IGNORE[op]])\r
+ def fixup(literal, flags=flags):\r
+ return _sre.getlower(literal, flags)\r
+ else:\r
+ emit(OPCODES[op])\r
+ fixup = None\r
+ skip = _len(code); emit(0)\r
+ _compile_charset(av, flags, code, fixup, fixes)\r
+ code[skip] = _len(code) - skip\r
+ elif op is ANY:\r
+ if flags & SRE_FLAG_DOTALL:\r
+ emit(OPCODES[ANY_ALL])\r
+ else:\r
+ emit(OPCODES[ANY])\r
+ elif op in REPEATING_CODES:\r
+ if flags & SRE_FLAG_TEMPLATE:\r
+ raise error, "internal: unsupported template operator"\r
+ emit(OPCODES[REPEAT])\r
+ skip = _len(code); emit(0)\r
+ emit(av[0])\r
+ emit(av[1])\r
+ _compile(code, av[2], flags)\r
+ emit(OPCODES[SUCCESS])\r
+ code[skip] = _len(code) - skip\r
+ elif _simple(av) and op is not REPEAT:\r
+ if op is MAX_REPEAT:\r
+ emit(OPCODES[REPEAT_ONE])\r
+ else:\r
+ emit(OPCODES[MIN_REPEAT_ONE])\r
+ skip = _len(code); emit(0)\r
+ emit(av[0])\r
+ emit(av[1])\r
+ _compile(code, av[2], flags)\r
+ emit(OPCODES[SUCCESS])\r
+ code[skip] = _len(code) - skip\r
+ else:\r
+ emit(OPCODES[REPEAT])\r
+ skip = _len(code); emit(0)\r
+ emit(av[0])\r
+ emit(av[1])\r
+ _compile(code, av[2], flags)\r
+ code[skip] = _len(code) - skip\r
+ if op is MAX_REPEAT:\r
+ emit(OPCODES[MAX_UNTIL])\r
+ else:\r
+ emit(OPCODES[MIN_UNTIL])\r
+ elif op is SUBPATTERN:\r
+ if av[0]:\r
+ emit(OPCODES[MARK])\r
+ emit((av[0]-1)*2)\r
+ # _compile_info(code, av[1], flags)\r
+ _compile(code, av[1], flags)\r
+ if av[0]:\r
+ emit(OPCODES[MARK])\r
+ emit((av[0]-1)*2+1)\r
+ elif op in SUCCESS_CODES:\r
+ emit(OPCODES[op])\r
+ elif op in ASSERT_CODES:\r
+ emit(OPCODES[op])\r
+ skip = _len(code); emit(0)\r
+ if av[0] >= 0:\r
+ emit(0) # look ahead\r
+ else:\r
+ lo, hi = av[1].getwidth()\r
+ if lo != hi:\r
+ raise error, "look-behind requires fixed-width pattern"\r
+ emit(lo) # look behind\r
+ _compile(code, av[1], flags)\r
+ emit(OPCODES[SUCCESS])\r
+ code[skip] = _len(code) - skip\r
+ elif op is CALL:\r
+ emit(OPCODES[op])\r
+ skip = _len(code); emit(0)\r
+ _compile(code, av, flags)\r
+ emit(OPCODES[SUCCESS])\r
+ code[skip] = _len(code) - skip\r
+ elif op is AT:\r
+ emit(OPCODES[op])\r
+ if flags & SRE_FLAG_MULTILINE:\r
+ av = AT_MULTILINE.get(av, av)\r
+ if flags & SRE_FLAG_LOCALE:\r
+ av = AT_LOCALE.get(av, av)\r
+ elif flags & SRE_FLAG_UNICODE:\r
+ av = AT_UNICODE.get(av, av)\r
+ emit(ATCODES[av])\r
+ elif op is BRANCH:\r
+ emit(OPCODES[op])\r
+ tail = []\r
+ tailappend = tail.append\r
+ for av in av[1]:\r
+ skip = _len(code); emit(0)\r
+ # _compile_info(code, av, flags)\r
+ _compile(code, av, flags)\r
+ emit(OPCODES[JUMP])\r
+ tailappend(_len(code)); emit(0)\r
+ code[skip] = _len(code) - skip\r
+ emit(0) # end of branch\r
+ for tail in tail:\r
+ code[tail] = _len(code) - tail\r
+ elif op is CATEGORY:\r
+ emit(OPCODES[op])\r
+ if flags & SRE_FLAG_LOCALE:\r
+ av = CH_LOCALE[av]\r
+ elif flags & SRE_FLAG_UNICODE:\r
+ av = CH_UNICODE[av]\r
+ emit(CHCODES[av])\r
+ elif op is GROUPREF:\r
+ if flags & SRE_FLAG_IGNORECASE:\r
+ emit(OPCODES[OP_IGNORE[op]])\r
+ else:\r
+ emit(OPCODES[op])\r
+ emit(av-1)\r
+ elif op is GROUPREF_EXISTS:\r
+ emit(OPCODES[op])\r
+ emit(av[0]-1)\r
+ skipyes = _len(code); emit(0)\r
+ _compile(code, av[1], flags)\r
+ if av[2]:\r
+ emit(OPCODES[JUMP])\r
+ skipno = _len(code); emit(0)\r
+ code[skipyes] = _len(code) - skipyes + 1\r
+ _compile(code, av[2], flags)\r
+ code[skipno] = _len(code) - skipno\r
+ else:\r
+ code[skipyes] = _len(code) - skipyes + 1\r
+ else:\r
+ raise ValueError, ("unsupported operand type", op)\r
+\r
+def _compile_charset(charset, flags, code, fixup=None, fixes=None):\r
+ # compile charset subprogram\r
+ emit = code.append\r
+ for op, av in _optimize_charset(charset, fixup, fixes,\r
+ flags & SRE_FLAG_UNICODE):\r
+ emit(OPCODES[op])\r
+ if op is NEGATE:\r
+ pass\r
+ elif op is LITERAL:\r
+ emit(av)\r
+ elif op is RANGE:\r
+ emit(av[0])\r
+ emit(av[1])\r
+ elif op is CHARSET:\r
+ code.extend(av)\r
+ elif op is BIGCHARSET:\r
+ code.extend(av)\r
+ elif op is CATEGORY:\r
+ if flags & SRE_FLAG_LOCALE:\r
+ emit(CHCODES[CH_LOCALE[av]])\r
+ elif flags & SRE_FLAG_UNICODE:\r
+ emit(CHCODES[CH_UNICODE[av]])\r
+ else:\r
+ emit(CHCODES[av])\r
+ else:\r
+ raise error, "internal: unsupported set operator"\r
+ emit(OPCODES[FAILURE])\r
+\r
+def _optimize_charset(charset, fixup, fixes, isunicode):\r
+ # internal: optimize character set\r
+ out = []\r
+ tail = []\r
+ charmap = bytearray(256)\r
+ for op, av in charset:\r
+ while True:\r
+ try:\r
+ if op is LITERAL:\r
+ if fixup:\r
+ i = fixup(av)\r
+ charmap[i] = 1\r
+ if fixes and i in fixes:\r
+ for k in fixes[i]:\r
+ charmap[k] = 1\r
+ else:\r
+ charmap[av] = 1\r
+ elif op is RANGE:\r
+ r = range(av[0], av[1]+1)\r
+ if fixup:\r
+ r = map(fixup, r)\r
+ if fixup and fixes:\r
+ for i in r:\r
+ charmap[i] = 1\r
+ if i in fixes:\r
+ for k in fixes[i]:\r
+ charmap[k] = 1\r
+ else:\r
+ for i in r:\r
+ charmap[i] = 1\r
+ elif op is NEGATE:\r
+ out.append((op, av))\r
+ else:\r
+ tail.append((op, av))\r
+ except IndexError:\r
+ if len(charmap) == 256:\r
+ # character set contains non-UCS1 character codes\r
+ charmap += b'\0' * 0xff00\r
+ continue\r
+ # character set contains non-BMP character codes\r
+ if fixup and isunicode and op is RANGE:\r
+ lo, hi = av\r
+ ranges = [av]\r
+ # There are only two ranges of cased astral characters:\r
+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).\r
+ _fixup_range(max(0x10000, lo), min(0x11fff, hi),\r
+ ranges, fixup)\r
+ for lo, hi in ranges:\r
+ if lo == hi:\r
+ tail.append((LITERAL, hi))\r
+ else:\r
+ tail.append((RANGE, (lo, hi)))\r
+ else:\r
+ tail.append((op, av))\r
+ break\r
+\r
+ # compress character map\r
+ runs = []\r
+ q = 0\r
+ while True:\r
+ p = charmap.find(b'\1', q)\r
+ if p < 0:\r
+ break\r
+ if len(runs) >= 2:\r
+ runs = None\r
+ break\r
+ q = charmap.find(b'\0', p)\r
+ if q < 0:\r
+ runs.append((p, len(charmap)))\r
+ break\r
+ runs.append((p, q))\r
+ if runs is not None:\r
+ # use literal/range\r
+ for p, q in runs:\r
+ if q - p == 1:\r
+ out.append((LITERAL, p))\r
+ else:\r
+ out.append((RANGE, (p, q - 1)))\r
+ out += tail\r
+ # if the case was changed or new representation is more compact\r
+ if fixup or len(out) < len(charset):\r
+ return out\r
+ # else original character set is good enough\r
+ return charset\r
+\r
+ # use bitmap\r
+ if len(charmap) == 256:\r
+ data = _mk_bitmap(charmap)\r
+ out.append((CHARSET, data))\r
+ out += tail\r
+ return out\r
+\r
+ # To represent a big charset, first a bitmap of all characters in the\r
+ # set is constructed. Then, this bitmap is sliced into chunks of 256\r
+ # characters, duplicate chunks are eliminated, and each chunk is\r
+ # given a number. In the compiled expression, the charset is\r
+ # represented by a 32-bit word sequence, consisting of one word for\r
+ # the number of different chunks, a sequence of 256 bytes (64 words)\r
+ # of chunk numbers indexed by their original chunk position, and a\r
+ # sequence of 256-bit chunks (8 words each).\r
+\r
+ # Compression is normally good: in a typical charset, large ranges of\r
+ # Unicode will be either completely excluded (e.g. if only cyrillic\r
+ # letters are to be matched), or completely included (e.g. if large\r
+ # subranges of Kanji match). These ranges will be represented by\r
+ # chunks of all one-bits or all zero-bits.\r
+\r
+ # Matching can be also done efficiently: the more significant byte of\r
+ # the Unicode character is an index into the chunk number, and the\r
+ # less significant byte is a bit index in the chunk (just like the\r
+ # CHARSET matching).\r
+\r
+ # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets\r
+ # of the basic multilingual plane; an efficient representation\r
+ # for all of Unicode has not yet been developed.\r
+\r
+ charmap = bytes(charmap) # should be hashable\r
+ comps = {}\r
+ mapping = bytearray(256)\r
+ block = 0\r
+ data = bytearray()\r
+ for i in range(0, 65536, 256):\r
+ chunk = charmap[i: i + 256]\r
+ if chunk in comps:\r
+ mapping[i // 256] = comps[chunk]\r
+ else:\r
+ mapping[i // 256] = comps[chunk] = block\r
+ block += 1\r
+ data += chunk\r
+ data = _mk_bitmap(data)\r
+ data[0:0] = [block] + _bytes_to_codes(mapping)\r
+ out.append((BIGCHARSET, data))\r
+ out += tail\r
+ return out\r
+\r
+def _fixup_range(lo, hi, ranges, fixup):\r
+ for i in map(fixup, range(lo, hi+1)):\r
+ for k, (lo, hi) in enumerate(ranges):\r
+ if i < lo:\r
+ if l == lo - 1:\r
+ ranges[k] = (i, hi)\r
+ else:\r
+ ranges.insert(k, (i, i))\r
+ break\r
+ elif i > hi:\r
+ if i == hi + 1:\r
+ ranges[k] = (lo, i)\r
+ break\r
+ else:\r
+ break\r
+ else:\r
+ ranges.append((i, i))\r
+\r
+_CODEBITS = _sre.CODESIZE * 8\r
+_BITS_TRANS = b'0' + b'1' * 255\r
+def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):\r
+ s = bytes(bits).translate(_BITS_TRANS)[::-1]\r
+ return [_int(s[i - _CODEBITS: i], 2)\r
+ for i in range(len(s), 0, -_CODEBITS)]\r
+\r
+def _bytes_to_codes(b):\r
+ # Convert block indices to word array\r
+ import array\r
+ if _sre.CODESIZE == 2:\r
+ code = 'H'\r
+ else:\r
+ code = 'I'\r
+ a = array.array(code, bytes(b))\r
+ assert a.itemsize == _sre.CODESIZE\r
+ assert len(a) * a.itemsize == len(b)\r
+ return a.tolist()\r
+\r
+def _simple(av):\r
+ # check if av is a "simple" operator\r
+ lo, hi = av[2].getwidth()\r
+ return lo == hi == 1 and av[2][0][0] != SUBPATTERN\r
+\r
+def _compile_info(code, pattern, flags):\r
+ # internal: compile an info block. in the current version,\r
+ # this contains min/max pattern width, and an optional literal\r
+ # prefix or a character map\r
+ lo, hi = pattern.getwidth()\r
+ if lo == 0:\r
+ return # not worth it\r
+ # look for a literal prefix\r
+ prefix = []\r
+ prefixappend = prefix.append\r
+ prefix_skip = 0\r
+ charset = [] # not used\r
+ charsetappend = charset.append\r
+ if not (flags & SRE_FLAG_IGNORECASE):\r
+ # look for literal prefix\r
+ for op, av in pattern.data:\r
+ if op is LITERAL:\r
+ if len(prefix) == prefix_skip:\r
+ prefix_skip = prefix_skip + 1\r
+ prefixappend(av)\r
+ elif op is SUBPATTERN and len(av[1]) == 1:\r
+ op, av = av[1][0]\r
+ if op is LITERAL:\r
+ prefixappend(av)\r
+ else:\r
+ break\r
+ else:\r
+ break\r
+ # if no prefix, look for charset prefix\r
+ if not prefix and pattern.data:\r
+ op, av = pattern.data[0]\r
+ if op is SUBPATTERN and av[1]:\r
+ op, av = av[1][0]\r
+ if op is LITERAL:\r
+ charsetappend((op, av))\r
+ elif op is BRANCH:\r
+ c = []\r
+ cappend = c.append\r
+ for p in av[1]:\r
+ if not p:\r
+ break\r
+ op, av = p[0]\r
+ if op is LITERAL:\r
+ cappend((op, av))\r
+ else:\r
+ break\r
+ else:\r
+ charset = c\r
+ elif op is BRANCH:\r
+ c = []\r
+ cappend = c.append\r
+ for p in av[1]:\r
+ if not p:\r
+ break\r
+ op, av = p[0]\r
+ if op is LITERAL:\r
+ cappend((op, av))\r
+ else:\r
+ break\r
+ else:\r
+ charset = c\r
+ elif op is IN:\r
+ charset = av\r
+## if prefix:\r
+## print "*** PREFIX", prefix, prefix_skip\r
+## if charset:\r
+## print "*** CHARSET", charset\r
+ # add an info block\r
+ emit = code.append\r
+ emit(OPCODES[INFO])\r
+ skip = len(code); emit(0)\r
+ # literal flag\r
+ mask = 0\r
+ if prefix:\r
+ mask = SRE_INFO_PREFIX\r
+ if len(prefix) == prefix_skip == len(pattern.data):\r
+ mask = mask + SRE_INFO_LITERAL\r
+ elif charset:\r
+ mask = mask + SRE_INFO_CHARSET\r
+ emit(mask)\r
+ # pattern length\r
+ if lo < MAXCODE:\r
+ emit(lo)\r
+ else:\r
+ emit(MAXCODE)\r
+ prefix = prefix[:MAXCODE]\r
+ if hi < MAXCODE:\r
+ emit(hi)\r
+ else:\r
+ emit(0)\r
+ # add literal prefix\r
+ if prefix:\r
+ emit(len(prefix)) # length\r
+ emit(prefix_skip) # skip\r
+ code.extend(prefix)\r
+ # generate overlap table\r
+ table = [-1] + ([0]*len(prefix))\r
+ for i in xrange(len(prefix)):\r
+ table[i+1] = table[i]+1\r
+ while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:\r
+ table[i+1] = table[table[i+1]-1]+1\r
+ code.extend(table[1:]) # don't store first entry\r
+ elif charset:\r
+ _compile_charset(charset, flags, code)\r
+ code[skip] = len(code) - skip\r
+\r
+try:\r
+ unicode\r
+except NameError:\r
+ STRING_TYPES = (type(""),)\r
+else:\r
+ STRING_TYPES = (type(""), type(unicode("")))\r
+\r
+def isstring(obj):\r
+ for tp in STRING_TYPES:\r
+ if isinstance(obj, tp):\r
+ return 1\r
+ return 0\r
+\r
+def _code(p, flags):\r
+\r
+ flags = p.pattern.flags | flags\r
+ code = []\r
+\r
+ # compile info block\r
+ _compile_info(code, p, flags)\r
+\r
+ # compile the pattern\r
+ _compile(code, p.data, flags)\r
+\r
+ code.append(OPCODES[SUCCESS])\r
+\r
+ return code\r
+\r
+def compile(p, flags=0):\r
+ # internal: convert pattern list to internal format\r
+\r
+ if isstring(p):\r
+ pattern = p\r
+ p = sre_parse.parse(p, flags)\r
+ else:\r
+ pattern = None\r
+\r
+ code = _code(p, flags)\r
+\r
+ # print code\r
+\r
+ # XXX: <fl> get rid of this limitation!\r
+ if p.pattern.groups > 100:\r
+ raise AssertionError(\r
+ "sorry, but this version only supports 100 named groups"\r
+ )\r
+\r
+ # map in either direction\r
+ groupindex = p.pattern.groupdict\r
+ indexgroup = [None] * p.pattern.groups\r
+ for k, i in groupindex.items():\r
+ indexgroup[i] = k\r
+\r
+ return _sre.compile(\r
+ pattern, flags | p.pattern.flags, code,\r
+ p.pattern.groups-1,\r
+ groupindex, indexgroup\r
+ )\r