+++ /dev/null
-# -*- coding: utf-8 -*-\r
-#\r
-# Secret Labs' Regular Expression Engine\r
-#\r
-# convert template to internal format\r
-#\r
-# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.\r
-#\r
-# See the sre.py file for information on usage and redistribution.\r
-#\r
-\r
-"""Internal support module for sre"""\r
-\r
-import _sre, sys\r
-import sre_parse\r
-from sre_constants import *\r
-\r
-assert _sre.MAGIC == MAGIC, "SRE module mismatch"\r
-\r
-if _sre.CODESIZE == 2:\r
- MAXCODE = 65535\r
-else:\r
- MAXCODE = 0xFFFFFFFFL\r
-\r
-_LITERAL_CODES = set([LITERAL, NOT_LITERAL])\r
-_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])\r
-_SUCCESS_CODES = set([SUCCESS, FAILURE])\r
-_ASSERT_CODES = set([ASSERT, ASSERT_NOT])\r
-\r
-# Sets of lowercase characters which have the same uppercase.\r
-_equivalences = (\r
- # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I\r
- (0x69, 0x131), # iı\r
- # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S\r
- (0x73, 0x17f), # sſ\r
- # MICRO SIGN, GREEK SMALL LETTER MU\r
- (0xb5, 0x3bc), # µμ\r
- # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI\r
- (0x345, 0x3b9, 0x1fbe), # \u0345ιι\r
- # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL\r
- (0x3b2, 0x3d0), # βϐ\r
- # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL\r
- (0x3b5, 0x3f5), # εϵ\r
- # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL\r
- (0x3b8, 0x3d1), # θϑ\r
- # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL\r
- (0x3ba, 0x3f0), # κϰ\r
- # GREEK SMALL LETTER PI, GREEK PI SYMBOL\r
- (0x3c0, 0x3d6), # πϖ\r
- # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL\r
- (0x3c1, 0x3f1), # ρϱ\r
- # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA\r
- (0x3c2, 0x3c3), # ςσ\r
- # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL\r
- (0x3c6, 0x3d5), # φϕ\r
- # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE\r
- (0x1e61, 0x1e9b), # ṡẛ\r
-)\r
-\r
-# Maps the lowercase code to lowercase codes which have the same uppercase.\r
-_ignorecase_fixes = {i: tuple(j for j in t if i != j)\r
- for t in _equivalences for i in t}\r
-\r
-def _compile(code, pattern, flags):\r
- # internal: compile a (sub)pattern\r
- emit = code.append\r
- _len = len\r
- LITERAL_CODES = _LITERAL_CODES\r
- REPEATING_CODES = _REPEATING_CODES\r
- SUCCESS_CODES = _SUCCESS_CODES\r
- ASSERT_CODES = _ASSERT_CODES\r
- if (flags & SRE_FLAG_IGNORECASE and\r
- not (flags & SRE_FLAG_LOCALE) and\r
- flags & SRE_FLAG_UNICODE):\r
- fixes = _ignorecase_fixes\r
- else:\r
- fixes = None\r
- for op, av in pattern:\r
- if op in LITERAL_CODES:\r
- if flags & SRE_FLAG_IGNORECASE:\r
- lo = _sre.getlower(av, flags)\r
- if fixes and lo in fixes:\r
- emit(OPCODES[IN_IGNORE])\r
- skip = _len(code); emit(0)\r
- if op is NOT_LITERAL:\r
- emit(OPCODES[NEGATE])\r
- for k in (lo,) + fixes[lo]:\r
- emit(OPCODES[LITERAL])\r
- emit(k)\r
- emit(OPCODES[FAILURE])\r
- code[skip] = _len(code) - skip\r
- else:\r
- emit(OPCODES[OP_IGNORE[op]])\r
- emit(lo)\r
- else:\r
- emit(OPCODES[op])\r
- emit(av)\r
- elif op is IN:\r
- if flags & SRE_FLAG_IGNORECASE:\r
- emit(OPCODES[OP_IGNORE[op]])\r
- def fixup(literal, flags=flags):\r
- return _sre.getlower(literal, flags)\r
- else:\r
- emit(OPCODES[op])\r
- fixup = None\r
- skip = _len(code); emit(0)\r
- _compile_charset(av, flags, code, fixup, fixes)\r
- code[skip] = _len(code) - skip\r
- elif op is ANY:\r
- if flags & SRE_FLAG_DOTALL:\r
- emit(OPCODES[ANY_ALL])\r
- else:\r
- emit(OPCODES[ANY])\r
- elif op in REPEATING_CODES:\r
- if flags & SRE_FLAG_TEMPLATE:\r
- raise error, "internal: unsupported template operator"\r
- emit(OPCODES[REPEAT])\r
- skip = _len(code); emit(0)\r
- emit(av[0])\r
- emit(av[1])\r
- _compile(code, av[2], flags)\r
- emit(OPCODES[SUCCESS])\r
- code[skip] = _len(code) - skip\r
- elif _simple(av) and op is not REPEAT:\r
- if op is MAX_REPEAT:\r
- emit(OPCODES[REPEAT_ONE])\r
- else:\r
- emit(OPCODES[MIN_REPEAT_ONE])\r
- skip = _len(code); emit(0)\r
- emit(av[0])\r
- emit(av[1])\r
- _compile(code, av[2], flags)\r
- emit(OPCODES[SUCCESS])\r
- code[skip] = _len(code) - skip\r
- else:\r
- emit(OPCODES[REPEAT])\r
- skip = _len(code); emit(0)\r
- emit(av[0])\r
- emit(av[1])\r
- _compile(code, av[2], flags)\r
- code[skip] = _len(code) - skip\r
- if op is MAX_REPEAT:\r
- emit(OPCODES[MAX_UNTIL])\r
- else:\r
- emit(OPCODES[MIN_UNTIL])\r
- elif op is SUBPATTERN:\r
- if av[0]:\r
- emit(OPCODES[MARK])\r
- emit((av[0]-1)*2)\r
- # _compile_info(code, av[1], flags)\r
- _compile(code, av[1], flags)\r
- if av[0]:\r
- emit(OPCODES[MARK])\r
- emit((av[0]-1)*2+1)\r
- elif op in SUCCESS_CODES:\r
- emit(OPCODES[op])\r
- elif op in ASSERT_CODES:\r
- emit(OPCODES[op])\r
- skip = _len(code); emit(0)\r
- if av[0] >= 0:\r
- emit(0) # look ahead\r
- else:\r
- lo, hi = av[1].getwidth()\r
- if lo != hi:\r
- raise error, "look-behind requires fixed-width pattern"\r
- emit(lo) # look behind\r
- _compile(code, av[1], flags)\r
- emit(OPCODES[SUCCESS])\r
- code[skip] = _len(code) - skip\r
- elif op is CALL:\r
- emit(OPCODES[op])\r
- skip = _len(code); emit(0)\r
- _compile(code, av, flags)\r
- emit(OPCODES[SUCCESS])\r
- code[skip] = _len(code) - skip\r
- elif op is AT:\r
- emit(OPCODES[op])\r
- if flags & SRE_FLAG_MULTILINE:\r
- av = AT_MULTILINE.get(av, av)\r
- if flags & SRE_FLAG_LOCALE:\r
- av = AT_LOCALE.get(av, av)\r
- elif flags & SRE_FLAG_UNICODE:\r
- av = AT_UNICODE.get(av, av)\r
- emit(ATCODES[av])\r
- elif op is BRANCH:\r
- emit(OPCODES[op])\r
- tail = []\r
- tailappend = tail.append\r
- for av in av[1]:\r
- skip = _len(code); emit(0)\r
- # _compile_info(code, av, flags)\r
- _compile(code, av, flags)\r
- emit(OPCODES[JUMP])\r
- tailappend(_len(code)); emit(0)\r
- code[skip] = _len(code) - skip\r
- emit(0) # end of branch\r
- for tail in tail:\r
- code[tail] = _len(code) - tail\r
- elif op is CATEGORY:\r
- emit(OPCODES[op])\r
- if flags & SRE_FLAG_LOCALE:\r
- av = CH_LOCALE[av]\r
- elif flags & SRE_FLAG_UNICODE:\r
- av = CH_UNICODE[av]\r
- emit(CHCODES[av])\r
- elif op is GROUPREF:\r
- if flags & SRE_FLAG_IGNORECASE:\r
- emit(OPCODES[OP_IGNORE[op]])\r
- else:\r
- emit(OPCODES[op])\r
- emit(av-1)\r
- elif op is GROUPREF_EXISTS:\r
- emit(OPCODES[op])\r
- emit(av[0]-1)\r
- skipyes = _len(code); emit(0)\r
- _compile(code, av[1], flags)\r
- if av[2]:\r
- emit(OPCODES[JUMP])\r
- skipno = _len(code); emit(0)\r
- code[skipyes] = _len(code) - skipyes + 1\r
- _compile(code, av[2], flags)\r
- code[skipno] = _len(code) - skipno\r
- else:\r
- code[skipyes] = _len(code) - skipyes + 1\r
- else:\r
- raise ValueError, ("unsupported operand type", op)\r
-\r
-def _compile_charset(charset, flags, code, fixup=None, fixes=None):\r
- # compile charset subprogram\r
- emit = code.append\r
- for op, av in _optimize_charset(charset, fixup, fixes,\r
- flags & SRE_FLAG_UNICODE):\r
- emit(OPCODES[op])\r
- if op is NEGATE:\r
- pass\r
- elif op is LITERAL:\r
- emit(av)\r
- elif op is RANGE:\r
- emit(av[0])\r
- emit(av[1])\r
- elif op is CHARSET:\r
- code.extend(av)\r
- elif op is BIGCHARSET:\r
- code.extend(av)\r
- elif op is CATEGORY:\r
- if flags & SRE_FLAG_LOCALE:\r
- emit(CHCODES[CH_LOCALE[av]])\r
- elif flags & SRE_FLAG_UNICODE:\r
- emit(CHCODES[CH_UNICODE[av]])\r
- else:\r
- emit(CHCODES[av])\r
- else:\r
- raise error, "internal: unsupported set operator"\r
- emit(OPCODES[FAILURE])\r
-\r
-def _optimize_charset(charset, fixup, fixes, isunicode):\r
- # internal: optimize character set\r
- out = []\r
- tail = []\r
- charmap = bytearray(256)\r
- for op, av in charset:\r
- while True:\r
- try:\r
- if op is LITERAL:\r
- if fixup:\r
- i = fixup(av)\r
- charmap[i] = 1\r
- if fixes and i in fixes:\r
- for k in fixes[i]:\r
- charmap[k] = 1\r
- else:\r
- charmap[av] = 1\r
- elif op is RANGE:\r
- r = range(av[0], av[1]+1)\r
- if fixup:\r
- r = map(fixup, r)\r
- if fixup and fixes:\r
- for i in r:\r
- charmap[i] = 1\r
- if i in fixes:\r
- for k in fixes[i]:\r
- charmap[k] = 1\r
- else:\r
- for i in r:\r
- charmap[i] = 1\r
- elif op is NEGATE:\r
- out.append((op, av))\r
- else:\r
- tail.append((op, av))\r
- except IndexError:\r
- if len(charmap) == 256:\r
- # character set contains non-UCS1 character codes\r
- charmap += b'\0' * 0xff00\r
- continue\r
- # character set contains non-BMP character codes\r
- if fixup and isunicode and op is RANGE:\r
- lo, hi = av\r
- ranges = [av]\r
- # There are only two ranges of cased astral characters:\r
- # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).\r
- _fixup_range(max(0x10000, lo), min(0x11fff, hi),\r
- ranges, fixup)\r
- for lo, hi in ranges:\r
- if lo == hi:\r
- tail.append((LITERAL, hi))\r
- else:\r
- tail.append((RANGE, (lo, hi)))\r
- else:\r
- tail.append((op, av))\r
- break\r
-\r
- # compress character map\r
- runs = []\r
- q = 0\r
- while True:\r
- p = charmap.find(b'\1', q)\r
- if p < 0:\r
- break\r
- if len(runs) >= 2:\r
- runs = None\r
- break\r
- q = charmap.find(b'\0', p)\r
- if q < 0:\r
- runs.append((p, len(charmap)))\r
- break\r
- runs.append((p, q))\r
- if runs is not None:\r
- # use literal/range\r
- for p, q in runs:\r
- if q - p == 1:\r
- out.append((LITERAL, p))\r
- else:\r
- out.append((RANGE, (p, q - 1)))\r
- out += tail\r
- # if the case was changed or new representation is more compact\r
- if fixup or len(out) < len(charset):\r
- return out\r
- # else original character set is good enough\r
- return charset\r
-\r
- # use bitmap\r
- if len(charmap) == 256:\r
- data = _mk_bitmap(charmap)\r
- out.append((CHARSET, data))\r
- out += tail\r
- return out\r
-\r
- # To represent a big charset, first a bitmap of all characters in the\r
- # set is constructed. Then, this bitmap is sliced into chunks of 256\r
- # characters, duplicate chunks are eliminated, and each chunk is\r
- # given a number. In the compiled expression, the charset is\r
- # represented by a 32-bit word sequence, consisting of one word for\r
- # the number of different chunks, a sequence of 256 bytes (64 words)\r
- # of chunk numbers indexed by their original chunk position, and a\r
- # sequence of 256-bit chunks (8 words each).\r
-\r
- # Compression is normally good: in a typical charset, large ranges of\r
- # Unicode will be either completely excluded (e.g. if only cyrillic\r
- # letters are to be matched), or completely included (e.g. if large\r
- # subranges of Kanji match). These ranges will be represented by\r
- # chunks of all one-bits or all zero-bits.\r
-\r
- # Matching can be also done efficiently: the more significant byte of\r
- # the Unicode character is an index into the chunk number, and the\r
- # less significant byte is a bit index in the chunk (just like the\r
- # CHARSET matching).\r
-\r
- # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets\r
- # of the basic multilingual plane; an efficient representation\r
- # for all of Unicode has not yet been developed.\r
-\r
- charmap = bytes(charmap) # should be hashable\r
- comps = {}\r
- mapping = bytearray(256)\r
- block = 0\r
- data = bytearray()\r
- for i in range(0, 65536, 256):\r
- chunk = charmap[i: i + 256]\r
- if chunk in comps:\r
- mapping[i // 256] = comps[chunk]\r
- else:\r
- mapping[i // 256] = comps[chunk] = block\r
- block += 1\r
- data += chunk\r
- data = _mk_bitmap(data)\r
- data[0:0] = [block] + _bytes_to_codes(mapping)\r
- out.append((BIGCHARSET, data))\r
- out += tail\r
- return out\r
-\r
-def _fixup_range(lo, hi, ranges, fixup):\r
- for i in map(fixup, range(lo, hi+1)):\r
- for k, (lo, hi) in enumerate(ranges):\r
- if i < lo:\r
- if l == lo - 1:\r
- ranges[k] = (i, hi)\r
- else:\r
- ranges.insert(k, (i, i))\r
- break\r
- elif i > hi:\r
- if i == hi + 1:\r
- ranges[k] = (lo, i)\r
- break\r
- else:\r
- break\r
- else:\r
- ranges.append((i, i))\r
-\r
-_CODEBITS = _sre.CODESIZE * 8\r
-_BITS_TRANS = b'0' + b'1' * 255\r
-def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):\r
- s = bytes(bits).translate(_BITS_TRANS)[::-1]\r
- return [_int(s[i - _CODEBITS: i], 2)\r
- for i in range(len(s), 0, -_CODEBITS)]\r
-\r
-def _bytes_to_codes(b):\r
- # Convert block indices to word array\r
- import array\r
- if _sre.CODESIZE == 2:\r
- code = 'H'\r
- else:\r
- code = 'I'\r
- a = array.array(code, bytes(b))\r
- assert a.itemsize == _sre.CODESIZE\r
- assert len(a) * a.itemsize == len(b)\r
- return a.tolist()\r
-\r
-def _simple(av):\r
- # check if av is a "simple" operator\r
- lo, hi = av[2].getwidth()\r
- return lo == hi == 1 and av[2][0][0] != SUBPATTERN\r
-\r
-def _compile_info(code, pattern, flags):\r
- # internal: compile an info block. in the current version,\r
- # this contains min/max pattern width, and an optional literal\r
- # prefix or a character map\r
- lo, hi = pattern.getwidth()\r
- if lo == 0:\r
- return # not worth it\r
- # look for a literal prefix\r
- prefix = []\r
- prefixappend = prefix.append\r
- prefix_skip = 0\r
- charset = [] # not used\r
- charsetappend = charset.append\r
- if not (flags & SRE_FLAG_IGNORECASE):\r
- # look for literal prefix\r
- for op, av in pattern.data:\r
- if op is LITERAL:\r
- if len(prefix) == prefix_skip:\r
- prefix_skip = prefix_skip + 1\r
- prefixappend(av)\r
- elif op is SUBPATTERN and len(av[1]) == 1:\r
- op, av = av[1][0]\r
- if op is LITERAL:\r
- prefixappend(av)\r
- else:\r
- break\r
- else:\r
- break\r
- # if no prefix, look for charset prefix\r
- if not prefix and pattern.data:\r
- op, av = pattern.data[0]\r
- if op is SUBPATTERN and av[1]:\r
- op, av = av[1][0]\r
- if op is LITERAL:\r
- charsetappend((op, av))\r
- elif op is BRANCH:\r
- c = []\r
- cappend = c.append\r
- for p in av[1]:\r
- if not p:\r
- break\r
- op, av = p[0]\r
- if op is LITERAL:\r
- cappend((op, av))\r
- else:\r
- break\r
- else:\r
- charset = c\r
- elif op is BRANCH:\r
- c = []\r
- cappend = c.append\r
- for p in av[1]:\r
- if not p:\r
- break\r
- op, av = p[0]\r
- if op is LITERAL:\r
- cappend((op, av))\r
- else:\r
- break\r
- else:\r
- charset = c\r
- elif op is IN:\r
- charset = av\r
-## if prefix:\r
-## print "*** PREFIX", prefix, prefix_skip\r
-## if charset:\r
-## print "*** CHARSET", charset\r
- # add an info block\r
- emit = code.append\r
- emit(OPCODES[INFO])\r
- skip = len(code); emit(0)\r
- # literal flag\r
- mask = 0\r
- if prefix:\r
- mask = SRE_INFO_PREFIX\r
- if len(prefix) == prefix_skip == len(pattern.data):\r
- mask = mask + SRE_INFO_LITERAL\r
- elif charset:\r
- mask = mask + SRE_INFO_CHARSET\r
- emit(mask)\r
- # pattern length\r
- if lo < MAXCODE:\r
- emit(lo)\r
- else:\r
- emit(MAXCODE)\r
- prefix = prefix[:MAXCODE]\r
- if hi < MAXCODE:\r
- emit(hi)\r
- else:\r
- emit(0)\r
- # add literal prefix\r
- if prefix:\r
- emit(len(prefix)) # length\r
- emit(prefix_skip) # skip\r
- code.extend(prefix)\r
- # generate overlap table\r
- table = [-1] + ([0]*len(prefix))\r
- for i in xrange(len(prefix)):\r
- table[i+1] = table[i]+1\r
- while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:\r
- table[i+1] = table[table[i+1]-1]+1\r
- code.extend(table[1:]) # don't store first entry\r
- elif charset:\r
- _compile_charset(charset, flags, code)\r
- code[skip] = len(code) - skip\r
-\r
-try:\r
- unicode\r
-except NameError:\r
- STRING_TYPES = (type(""),)\r
-else:\r
- STRING_TYPES = (type(""), type(unicode("")))\r
-\r
-def isstring(obj):\r
- for tp in STRING_TYPES:\r
- if isinstance(obj, tp):\r
- return 1\r
- return 0\r
-\r
-def _code(p, flags):\r
-\r
- flags = p.pattern.flags | flags\r
- code = []\r
-\r
- # compile info block\r
- _compile_info(code, p, flags)\r
-\r
- # compile the pattern\r
- _compile(code, p.data, flags)\r
-\r
- code.append(OPCODES[SUCCESS])\r
-\r
- return code\r
-\r
-def compile(p, flags=0):\r
- # internal: convert pattern list to internal format\r
-\r
- if isstring(p):\r
- pattern = p\r
- p = sre_parse.parse(p, flags)\r
- else:\r
- pattern = None\r
-\r
- code = _code(p, flags)\r
-\r
- # print code\r
-\r
- # XXX: <fl> get rid of this limitation!\r
- if p.pattern.groups > 100:\r
- raise AssertionError(\r
- "sorry, but this version only supports 100 named groups"\r
- )\r
-\r
- # map in either direction\r
- groupindex = p.pattern.groupdict\r
- indexgroup = [None] * p.pattern.groups\r
- for k, i in groupindex.items():\r
- indexgroup[i] = k\r
-\r
- return _sre.compile(\r
- pattern, flags | p.pattern.flags, code,\r
- p.pattern.groups-1,\r
- groupindex, indexgroup\r
- )\r