]>
Commit | Line | Data |
---|---|---|
3257aa99 DM |
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
# Re-export every public name of the token module plus this module's own
# additions; COMMENT and NL are not defined by the (py2) token module.
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x   # NOTE(review): relies on Python 2 list-comp scope leak; raises NameError on py3
del token

# Extra token types: this tokenizer surfaces comments and non-logical
# newlines instead of silently skipping them.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
44 | \r | |
def group(*choices):
    """Wrap the alternatives in a regex alternation group: ``(a|b|c)``."""
    return "({})".format("|".join(choices))

def any(*choices):
    """Alternation group repeated zero or more times.

    (Deliberately shadows the builtin ``any``; kept for compatibility.)
    """
    return group(*choices) + '*'

def maybe(*choices):
    """Alternation group matched zero or one time."""
    return group(*choices) + '?'
48 | \r | |
# Regular-expression fragments for each lexical class; group/any/maybe
# (defined above) build alternation groups with '*' / '?' repetition.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Optional whitespace, backslash line-continuations, optional trailing comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals; the optional [lL] suffix is Python 2's long-int marker.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
101 | \r | |
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Maps a string opener (quote, possibly with a u/b/r prefix) to the compiled
# pattern that matches the rest of the string.  Bare prefix letters map to
# None so generate_tokens can fall through to the next character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# Every recognized triple-quote opener (each legal prefix/case combination).
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
# Every recognized single-quote opener, same prefix combinations.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

tabsize = 8   # tab stops every 8 columns when measuring indentation
145 | \r | |
class TokenError(Exception):
    """Raised when the input ends inside an unterminated string/statement."""


class StopTokenizing(Exception):
    """Raised by a tokeneater callback to end tokenize() early."""
149 | \r | |
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print one token as 'srow,scol-erow,ecol:<TAB>TYPE<TAB>repr'.

    Default tokeneater callback for tokenize().  (Python 2 print statement.)
    """
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
155 | \r | |
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize a stream, invoking *tokeneater* once per token.

    readline must behave like the readline() method of a built-in file
    object: each call returns the next line of input as a string, or ""
    at EOF.

    tokeneater is called with the five values generate_tokens() would
    yield for each token.  It may end the run early by raising
    StopTokenizing, which is absorbed silently here.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Deliberate early exit requested by the callback.
        pass
173 | \r | |
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple from generate_tokens(readline) to *tokeneater*."""
    for info in generate_tokens(readline):
        tokeneater(*info)
178 | \r | |
class Untokenizer:
    """Reassemble source text from an iterable of token tuples.

    Full 5-tuples reproduce the original layout (token positions are
    honored); bare 2-tuples fall back to compat(), which only inserts
    enough whitespace for the output to re-tokenize to the same stream.
    """

    def __init__(self):
        self.tokens = []     # output fragments, joined by untokenize()
        self.prev_row = 1    # row just past the last emitted token
        self.prev_col = 0    # column just past the last emitted token

    def add_whitespace(self, start):
        """Pad the output so the next token begins at *start* (row, col)."""
        row, col = start
        if row < self.prev_row or (row == self.prev_row and col < self.prev_col):
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Bridge skipped rows with escaped newlines.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Consume token tuples and return the reconstructed source string."""
        stream = iter(iterable)
        for tok in stream:
            if len(tok) == 2:
                # No positional info: degrade to compat mode for the rest.
                self.compat(tok, stream)
                break
            tok_type, text, start, end, _line = tok
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(text)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Handle (type, string) pairs: guess whitespace and indentation."""
        indents = []
        emit = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            # Names and numbers get a trailing separator so adjacent
            # tokens cannot fuse together when re-tokenized.
            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings.
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token on a fresh line: re-emit the current indent.
                emit(indents[-1])
                startline = False
            emit(tokval)
248 | \r | |
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element of *iterable* must be a token sequence with at least two
    members, a token number and a token value.  If only two-element
    tuples are passed, the resulting output is poor.

    Round-trip invariant for full 5-tuple input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
269 | \r | |
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when EOF is reached inside a multi-line string or
    backslash-continued statement, IndentationError on a bad dedent.
    """
    # Tokenizer state:
    #   lnum      - current (1-based) input line number
    #   parenlev  - (), [], {} nesting depth; newlines inside brackets
    #               are NL rather than NEWLINE
    #   continued - previous line ended with a backslash continuation
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr accumulates the text of a string literal spanning lines;
    # needcont means the continuation must be backslash-escaped
    # (single-quoted string), contline the accumulated raw lines.
    contstr, needcont = '', 0
    contline = None
    indents = [0]   # stack of active indentation column widths

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string whose line did not end with a
                # backslash: flush what we have as an error token.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    # formfeed resets the column count
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # Blank line: the '#' case was handled above, so this
                    # tuple-index always selects NL here.
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Pick the end-pattern for whichever quote char
                        # actually opened the string (skip any prefix).
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
426 | \r | |
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the named file, or stdin when no argument is given.
    args = sys.argv[1:]
    if args:
        tokenize(open(args[0]).readline)
    else:
        tokenize(sys.stdin.readline)