[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Lib / tokenize.py

"""Tokenization help for Python programs.\r
\r
generate_tokens(readline) is a generator that breaks a stream of\r
text into Python tokens.  It accepts a readline-like method which is called\r
repeatedly to get the next line of input (or "" for EOF).  It generates\r
5-tuples with these members:\r
\r
    the token type (see token.py)\r
    the token (a string)\r
    the starting (row, column) indices of the token (a 2-tuple of ints)\r
    the ending (row, column) indices of the token (a 2-tuple of ints)\r
    the original line (string)\r
\r
It is designed to match the working of the Python tokenizer exactly, except\r
that it produces COMMENT tokens for comments and gives type OP for all\r
operators.\r
\r
Older entry points\r
    tokenize_loop(readline, tokeneater)\r
    tokenize(readline, tokeneater=printtoken)\r
are the same, except instead of generating tokens, tokeneater is a callback\r
function to which the 5 fields described above are passed as 5 arguments,\r
each time a new token is found."""\r
\r
__author__ = 'Ka-Ping Yee <ping@lfw.org>'\r
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '\r
               'Skip Montanaro, Raymond Hettinger')\r
\r
import string, re\r
from token import *\r
\r
import token\r
# Public API: every public name of the ``token`` module, followed by the
# names this module contributes itself.
__all__ = []
for x in dir(token):
    if not x.startswith("_"):
        __all__.append(x)
__all__.extend(["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"])
# Neither the loop variable nor the ``token`` module binding should remain
# as attributes of this module.
del x
del token
\r
# Extend the numbering imported from ``token`` with two more token types,
# COMMENT and NL, register their names and bump the token count to match.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS = NL + 1
\r
def group(*choices):
    """Join *choices* with ``|`` inside a single capturing group."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Like group(), but the group may repeat zero or more times.

    The name shadows the builtin ``any`` inside this module; it is kept
    because the pattern definitions below call it.
    """
    return group(*choices) + '*'

def maybe(*choices):
    """Like group(), but the whole group is optional."""
    return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
# Both octal spellings share one group so that the optional long suffix
# applies to the ``0o17`` form as well as to the legacy ``017`` form.
Octnumber = group(r'0[oO][0-7]+', r'0[0-7]*') + r'[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# String openers accept a u/U or b/B prefix followed by an optional r/R,
# matching the prefixes listed in endprogs, triple_quoted and single_quoted.
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The outermost group of PseudoToken is group 1: leading whitespace is
# matched outside it, so span(1) is the token without its indentation.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
\r
# Compile the token patterns once, at import time.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# endprogs maps a string opener to the compiled pattern that matches the
# remainder of that string up to and including its closing quote(s).
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR',
                'b', 'B', 'br', 'bR', 'Br', 'BR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
# Bare prefix letters map to None so that generate_tokens() can chain
# lookups with ``or`` until it reaches the actual quote character.
for _prefix in 'rRuUbB':
    endprogs[_prefix] = None
del _prefix
\r
# Lookup tables of every string opener (optional prefix plus quote), used
# for membership tests; each key simply maps to itself.
triple_quoted = {}
single_quoted = {}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'Ur', 'uR', 'UR',
                'b', 'B', 'br', 'Br', 'bR', 'BR'):
    for _quote in ("'", '"'):
        t = _prefix + _quote * 3
        triple_quoted[t] = t
        t = _prefix + _quote
        single_quoted[t] = t
del _prefix, _quote
\r
# Distance between tab stops when generate_tokens() measures indentation.
tabsize = 8

class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or statement."""

class StopTokenizing(Exception):
    """Caught by tokenize(), which then stops tokenizing quietly."""
\r
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print one token on a line of standard output.

    The line has the form ``srow,scol-erow,ecol:<TAB>NAME<TAB>repr(token)``
    where NAME is looked up in tok_name.  The *line* argument is accepted so
    the function can be used as a tokeneater, but it is not printed.
    """
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    # A single parenthesised argument prints the same text as the print
    # statement did on Python 2 and is also valid syntax on Python 3.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
\r
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize the input read through readline, handing each token to
    tokeneater.

    readline must be a callable with the same interface as the readline()
    method of built-in file objects: every call returns one line of input
    as a string.

    tokeneater must be a callable too.  It is invoked once per token with
    five arguments, matching the fields of the tuples that
    generate_tokens() yields.  It may raise StopTokenizing to end the run
    early; that exception is swallowed here.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Early termination was requested; nothing else to do.
        return
\r
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater with the five fields of every token produced by
    generate_tokens(readline).  Exceptions are not caught here."""
    for tok_type, tok_string, start, end, line in generate_tokens(readline):
        tokeneater(tok_type, tok_string, start, end, line)
\r
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Full 5-tuples are placed using their recorded positions.  From the
    first 2-tuple onwards only token types and strings are used, and
    spacing is approximated (see compat()).
    """

    def __init__(self):
        self.tokens = []      # output fragments, joined by untokenize()
        self.prev_row = 1     # row/column where the previous token ended
        self.prev_col = 0

    def add_whitespace(self, start):
        """Emit the filler needed to move from the previous token's end to
        *start*, a (row, col) pair.

        A token that starts on a later row than expected can only have
        been reached through a backslash continuation (a NEWLINE or NL
        token would already have advanced prev_row), so one backslash
        newline is emitted per skipped row.  Positions that lie before
        the previous end produce no filler.
        """
        row, col = start
        row_offset = row - self.prev_row
        if row_offset > 0:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Consume *iterable* and return the reconstructed source string."""
        # Use one shared iterator so that compat() continues where this
        # loop stopped instead of restarting a re-iterable input.
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append *token* and the remaining tokens of *iterable* using only
        their type and string.

        *iterable* must yield the tokens that follow *token*.  Names and
        numbers get a trailing space, adjacent strings are separated by a
        space, and the most recent INDENT string is re-emitted before the
        first token of each line.
        """
        from itertools import chain
        startline = False
        indents = []
        toks_append = self.tokens.append
        prevstring = False
        # Process the first token through the same path as the rest so
        # that its text is emitted too.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
\r
def untokenize(iterable):
    """Turn a sequence of tokens back into Python source code.

    Every element produced by the iterable must be a token sequence with
    at least two items: the token number and the token value.  When only
    those two items are supplied, the spacing of the result is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
\r
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a line dedents to a
    column that matches no enclosing indentation level.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]                              # stack of indentation columns

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''                          # treated like EOF
        lnum += 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was continued with a backslash but
                # this line neither closes it nor continues it again.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == linelen:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # Only a line terminator is left on this line.
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the statement.
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # falls through to the entry for the quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                # No pattern matches here: report one character and move on.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for _ in indents[1:]:                      # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
\r
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named by the first command-line
    # argument, or of standard input when no argument is given.
    import sys
    if len(sys.argv) > 1:
        # The with block closes the file once tokenizing has finished.
        with open(sys.argv[1]) as source:
            tokenize(source.readline)
    else:
        tokenize(sys.stdin.readline)
Commit	Line	Data
4710c53d	1	"""Tokenization help for Python programs.\r
	2	\r
	3	generate_tokens(readline) is a generator that breaks a stream of\r
	4	text into Python tokens. It accepts a readline-like method which is called\r
	5	repeatedly to get the next line of input (or "" for EOF). It generates\r
	6	5-tuples with these members:\r
	7	\r
	8	the token type (see token.py)\r
	9	the token (a string)\r
	10	the starting (row, column) indices of the token (a 2-tuple of ints)\r
	11	the ending (row, column) indices of the token (a 2-tuple of ints)\r
	12	the original line (string)\r
	13	\r
	14	It is designed to match the working of the Python tokenizer exactly, except\r
	15	that it produces COMMENT tokens for comments and gives type OP for all\r
	16	operators\r
	17	\r
	18	Older entry points\r
	19	tokenize_loop(readline, tokeneater)\r
	20	tokenize(readline, tokeneater=printtoken)\r
	21	are the same, except instead of generating tokens, tokeneater is a callback\r
	22	function to which the 5 fields described above are passed as 5 arguments,\r
	23	each time a new token is found."""\r
	24	\r
	25	__author__ = 'Ka-Ping Yee <ping@lfw.org>'\r
	26	__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '\r
	27	'Skip Montanaro, Raymond Hettinger')\r
	28	\r
	29	import string, re\r
	30	from token import *\r
	31	\r
	32	import token\r
	33	__all__ = [x for x in dir(token) if not x.startswith("_")]\r
	34	__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]\r
	35	del x\r
	36	del token\r
	37	\r
	38	COMMENT = N_TOKENS\r
	39	tok_name[COMMENT] = 'COMMENT'\r
	40	NL = N_TOKENS + 1\r
	41	tok_name[NL] = 'NL'\r
	42	N_TOKENS += 2\r
	43	\r
	44	def group(*choices): return '(' + '|'.join(choices) + ')'\r
	45	def any(*choices): return group(*choices) + '*'\r
	46	def maybe(*choices): return group(*choices) + '?'\r
	47	\r
	48	Whitespace = r'[ \f\t]*'\r
	49	Comment = r'#[^\r\n]*'\r
	50	Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)\r
	51	Name = r'[a-zA-Z_]\w*'\r
	52	\r
	53	Hexnumber = r'0[xX][\da-fA-F]+[lL]?'\r
	54	Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'\r
	55	Binnumber = r'0[bB][01]+[lL]?'\r
	56	Decnumber = r'[1-9]\d*[lL]?'\r
	57	Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)\r
	58	Exponent = r'[eE][-+]?\d+'\r
	59	Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)\r
	60	Expfloat = r'\d+' + Exponent\r
	61	Floatnumber = group(Pointfloat, Expfloat)\r
	62	Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')\r
	63	Number = group(Imagnumber, Floatnumber, Intnumber)\r
	64	\r
65	# Tail end of ' string.\r
66	Single = r"[^'\\]*(?:\\.[^'\\]*)*'"\r
67	# Tail end of " string.\r
68	Double = r'[^"\\]*(?:\\.[^"\\]*)*"'\r
69	# Tail end of ''' string.\r
70	Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"\r
71	# Tail end of """ string.\r
72	Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'\r
73	Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')\r
74	# Single-line ' or " string.\r
75	String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",\r
76	r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')\r
77	\r
78	# Because of leftmost-then-longest match semantics, be sure to put the\r
79	# longest operators first (e.g., if = came before ==, == would get\r
80	# recognized as two instances of =).\r
81	Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",\r
82	r"//=?",\r
83	r"[+\-*/%&|^=<>]=?",\r
84	r"~")\r
85	\r
86	Bracket = '[][(){}]'\r
87	Special = group(r'\r?\n', r'[:;.,`@]')\r
88	Funny = group(Operator, Bracket, Special)\r
89	\r
90	PlainToken = group(Number, Funny, String, Name)\r
91	Token = Ignore + PlainToken\r
92	\r
93	# First (or only) line of ' or " string.\r
94	ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +\r
95	group("'", r'\\\r?\n'),\r
96	r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +\r
97	group('"', r'\\\r?\n'))\r
98	PseudoExtras = group(r'\\\r?\n', Comment, Triple)\r
99	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)\r
100	\r
101	tokenprog, pseudoprog, single3prog, double3prog = map(\r
102	re.compile, (Token, PseudoToken, Single3, Double3))\r
103	endprogs = {"'": re.compile(Single), '"': re.compile(Double),\r
104	"'''": single3prog, '"""': double3prog,\r
105	"r'''": single3prog, 'r"""': double3prog,\r
106	"u'''": single3prog, 'u"""': double3prog,\r
107	"ur'''": single3prog, 'ur"""': double3prog,\r
108	"R'''": single3prog, 'R"""': double3prog,\r
109	"U'''": single3prog, 'U"""': double3prog,\r
110	"uR'''": single3prog, 'uR"""': double3prog,\r
111	"Ur'''": single3prog, 'Ur"""': double3prog,\r
112	"UR'''": single3prog, 'UR"""': double3prog,\r
113	"b'''": single3prog, 'b"""': double3prog,\r
114	"br'''": single3prog, 'br"""': double3prog,\r
115	"B'''": single3prog, 'B"""': double3prog,\r
116	"bR'''": single3prog, 'bR"""': double3prog,\r
117	"Br'''": single3prog, 'Br"""': double3prog,\r
118	"BR'''": single3prog, 'BR"""': double3prog,\r
119	'r': None, 'R': None, 'u': None, 'U': None,\r
120	'b': None, 'B': None}\r
121	\r
122	triple_quoted = {}\r
123	for t in ("'''", '"""',\r
124	"r'''", 'r"""', "R'''", 'R"""',\r
125	"u'''", 'u"""', "U'''", 'U"""',\r
126	"ur'''", 'ur"""', "Ur'''", 'Ur"""',\r
127	"uR'''", 'uR"""', "UR'''", 'UR"""',\r
128	"b'''", 'b"""', "B'''", 'B"""',\r
129	"br'''", 'br"""', "Br'''", 'Br"""',\r
130	"bR'''", 'bR"""', "BR'''", 'BR"""'):\r
131	triple_quoted[t] = t\r
132	single_quoted = {}\r
133	for t in ("'", '"',\r
134	"r'", 'r"', "R'", 'R"',\r
135	"u'", 'u"', "U'", 'U"',\r
136	"ur'", 'ur"', "Ur'", 'Ur"',\r
137	"uR'", 'uR"', "UR'", 'UR"',\r
138	"b'", 'b"', "B'", 'B"',\r
139	"br'", 'br"', "Br'", 'Br"',\r
140	"bR'", 'bR"', "BR'", 'BR"' ):\r
141	single_quoted[t] = t\r
142	\r
143	tabsize = 8\r
144	\r
145	class TokenError(Exception): pass\r
146	\r
147	class StopTokenizing(Exception): pass\r
148	\r
149	def printtoken(type, token, srow_scol, erow_ecol, line): # for testing\r
150	srow, scol = srow_scol\r
151	erow, ecol = erow_ecol\r
152	print "%d,%d-%d,%d:\t%s\t%s" % \\r
153	(srow, scol, erow, ecol, tok_name[type], repr(token))\r
154	\r
155	def tokenize(readline, tokeneater=printtoken):\r
156	"""\r
157	The tokenize() function accepts two parameters: one representing the\r
158	input stream, and one providing an output mechanism for tokenize().\r
159	\r
160	The first parameter, readline, must be a callable object which provides\r
161	the same interface as the readline() method of built-in file objects.\r
162	Each call to the function should return one line of input as a string.\r
163	\r
164	The second parameter, tokeneater, must also be a callable object. It is\r
165	called once for each token, with five arguments, corresponding to the\r
166	tuples generated by generate_tokens().\r
167	"""\r
168	try:\r
169	tokenize_loop(readline, tokeneater)\r
170	except StopTokenizing:\r
171	pass\r
172	\r
173	# backwards compatible interface\r
174	def tokenize_loop(readline, tokeneater):\r
175	for token_info in generate_tokens(readline):\r
176	tokeneater(*token_info)\r
177	\r
178	class Untokenizer:\r
179	\r
180	def __init__(self):\r
181	self.tokens = []\r
182	self.prev_row = 1\r
183	self.prev_col = 0\r
184	\r
185	def add_whitespace(self, start):\r
186	row, col = start\r
187	assert row <= self.prev_row\r
188	col_offset = col - self.prev_col\r
189	if col_offset:\r
190	self.tokens.append(" " * col_offset)\r
191	\r
192	def untokenize(self, iterable):\r
193	for t in iterable:\r
194	if len(t) == 2:\r
195	self.compat(t, iterable)\r
196	break\r
197	tok_type, token, start, end, line = t\r
198	self.add_whitespace(start)\r
199	self.tokens.append(token)\r
200	self.prev_row, self.prev_col = end\r
201	if tok_type in (NEWLINE, NL):\r
202	self.prev_row += 1\r
203	self.prev_col = 0\r
204	return "".join(self.tokens)\r
205	\r
206	def compat(self, token, iterable):\r
207	startline = False\r
208	indents = []\r
209	toks_append = self.tokens.append\r
210	toknum, tokval = token\r
211	if toknum in (NAME, NUMBER):\r
212	tokval += ' '\r
213	if toknum in (NEWLINE, NL):\r
214	startline = True\r
215	prevstring = False\r
216	for tok in iterable:\r
217	toknum, tokval = tok[:2]\r
218	\r
219	if toknum in (NAME, NUMBER):\r
220	tokval += ' '\r
221	\r
222	# Insert a space between two consecutive strings\r
223	if toknum == STRING:\r
224	if prevstring:\r
225	tokval = ' ' + tokval\r
226	prevstring = True\r
227	else:\r
228	prevstring = False\r
229	\r
230	if toknum == INDENT:\r
231	indents.append(tokval)\r
232	continue\r
233	elif toknum == DEDENT:\r
234	indents.pop()\r
235	continue\r
236	elif toknum in (NEWLINE, NL):\r
237	startline = True\r
238	elif startline and indents:\r
239	toks_append(indents[-1])\r
240	startline = False\r
241	toks_append(tokval)\r
242	\r
243	def untokenize(iterable):\r
244	"""Transform tokens back into Python source code.\r
245	\r
246	Each element returned by the iterable must be a token sequence\r
247	with at least two elements, a token number and token value. If\r
248	only two tokens are passed, the resulting output is poor.\r
249	\r
250	Round-trip invariant for full input:\r
251	Untokenized source will match input source exactly\r
252	\r
253	Round-trip invariant for limited intput:\r
254	# Output text will tokenize the back to the input\r
255	t1 = [tok[:2] for tok in generate_tokens(f.readline)]\r
256	newcode = untokenize(t1)\r
257	readline = iter(newcode.splitlines(1)).next\r
258	t2 = [tok[:2] for tok in generate_tokens(readline)]\r
259	assert t1 == t2\r
260	"""\r
261	ut = Untokenizer()\r
262	return ut.untokenize(iterable)\r
263	\r
264	def generate_tokens(readline):\r
265	"""\r
266	The generate_tokens() generator requires one argment, readline, which\r
267	must be a callable object which provides the same interface as the\r
268	readline() method of built-in file objects. Each call to the function\r
269	should return one line of input as a string. Alternately, readline\r
270	can be a callable function terminating with StopIteration:\r
271	readline = open(myfile).next # Example of alternate readline\r
272	\r
273	The generator produces 5-tuples with these members: the token type; the\r
274	token string; a 2-tuple (srow, scol) of ints specifying the row and\r
275	column where the token begins in the source; a 2-tuple (erow, ecol) of\r
276	ints specifying the row and column where the token ends in the source;\r
277	and the line on which the token was found. The line passed is the\r
278	logical line; continuation lines are included.\r
279	"""\r
280	lnum = parenlev = continued = 0\r
281	namechars, numchars = string.ascii_letters + '_', '0123456789'\r
282	contstr, needcont = '', 0\r
283	contline = None\r
284	indents = [0]\r
285	\r
286	while 1: # loop over lines in stream\r
287	try:\r
288	line = readline()\r
289	except StopIteration:\r
290	line = ''\r
291	lnum += 1\r
292	pos, max = 0, len(line)\r
293	\r
294	if contstr: # continued string\r
295	if not line:\r
296	raise TokenError, ("EOF in multi-line string", strstart)\r
297	endmatch = endprog.match(line)\r
298	if endmatch:\r
299	pos = end = endmatch.end(0)\r
300	yield (STRING, contstr + line[:end],\r
301	strstart, (lnum, end), contline + line)\r
302	contstr, needcont = '', 0\r
303	contline = None\r
304	elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':\r
305	yield (ERRORTOKEN, contstr + line,\r
306	strstart, (lnum, len(line)), contline)\r
307	contstr = ''\r
308	contline = None\r
309	continue\r
310	else:\r
311	contstr = contstr + line\r
312	contline = contline + line\r
313	continue\r
314	\r
315	elif parenlev == 0 and not continued: # new statement\r
316	if not line: break\r
317	column = 0\r
318	while pos < max: # measure leading whitespace\r
319	if line[pos] == ' ':\r
320	column += 1\r
321	elif line[pos] == '\t':\r
322	column = (column//tabsize + 1)*tabsize\r
323	elif line[pos] == '\f':\r
324	column = 0\r
325	else:\r
326	break\r
327	pos += 1\r
328	if pos == max:\r
329	break\r
330	\r
331	if line[pos] in '#\r\n': # skip comments or blank lines\r
332	if line[pos] == '#':\r
333	comment_token = line[pos:].rstrip('\r\n')\r
334	nl_pos = pos + len(comment_token)\r
335	yield (COMMENT, comment_token,\r
336	(lnum, pos), (lnum, pos + len(comment_token)), line)\r
337	yield (NL, line[nl_pos:],\r
338	(lnum, nl_pos), (lnum, len(line)), line)\r
339	else:\r
340	yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],\r
341	(lnum, pos), (lnum, len(line)), line)\r
342	continue\r
343	\r
344	if column > indents[-1]: # count indents or dedents\r
345	indents.append(column)\r
346	yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)\r
347	while column < indents[-1]:\r
348	if column not in indents:\r
349	raise IndentationError(\r
350	"unindent does not match any outer indentation level",\r
351	("<tokenize>", lnum, pos, line))\r
352	indents = indents[:-1]\r
353	yield (DEDENT, '', (lnum, pos), (lnum, pos), line)\r
354	\r
355	else: # continued statement\r
356	if not line:\r
357	raise TokenError, ("EOF in multi-line statement", (lnum, 0))\r
358	continued = 0\r
359	\r
360	while pos < max:\r
361	pseudomatch = pseudoprog.match(line, pos)\r
362	if pseudomatch: # scan for tokens\r
363	start, end = pseudomatch.span(1)\r
364	spos, epos, pos = (lnum, start), (lnum, end), end\r
365	token, initial = line[start:end], line[start]\r
366	\r
367	if initial in numchars or \\r
368	(initial == '.' and token != '.'): # ordinary number\r
369	yield (NUMBER, token, spos, epos, line)\r
370	elif initial in '\r\n':\r
371	yield (NL if parenlev > 0 else NEWLINE,\r
372	token, spos, epos, line)\r
373	elif initial == '#':\r
374	assert not token.endswith("\n")\r
375	yield (COMMENT, token, spos, epos, line)\r
376	elif token in triple_quoted:\r
377	endprog = endprogs[token]\r
378	endmatch = endprog.match(line, pos)\r
379	if endmatch: # all on one line\r
380	pos = endmatch.end(0)\r
381	token = line[start:pos]\r
382	yield (STRING, token, spos, (lnum, pos), line)\r
383	else:\r
384	strstart = (lnum, start) # multiple lines\r
385	contstr = line[start:]\r
386	contline = line\r
387	break\r
388	elif initial in single_quoted or \\r
389	token[:2] in single_quoted or \\r
390	token[:3] in single_quoted:\r
391	if token[-1] == '\n': # continued string\r
392	strstart = (lnum, start)\r
393	endprog = (endprogs[initial] or endprogs[token[1]] or\r
394	endprogs[token[2]])\r
395	contstr, needcont = line[start:], 1\r
396	contline = line\r
397	break\r
398	else: # ordinary string\r
399	yield (STRING, token, spos, epos, line)\r
400	elif initial in namechars: # ordinary name\r
401	yield (NAME, token, spos, epos, line)\r
402	elif initial == '\\': # continued stmt\r
403	continued = 1\r
404	else:\r
405	if initial in '([{':\r
406	parenlev += 1\r
407	elif initial in ')]}':\r
408	parenlev -= 1\r
409	yield (OP, token, spos, epos, line)\r
410	else:\r
411	yield (ERRORTOKEN, line[pos],\r
412	(lnum, pos), (lnum, pos+1), line)\r
413	pos += 1\r
414	\r
415	for indent in indents[1:]: # pop remaining indent levels\r
416	yield (DEDENT, '', (lnum, 0), (lnum, 0), '')\r
417	yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')\r
418	\r
419	if __name__ == '__main__': # testing\r
420	import sys\r
421	if len(sys.argv) > 1:\r
422	tokenize(open(sys.argv[1]).readline)\r
423	else:\r
424	tokenize(sys.stdin.readline)\r