+++ /dev/null
-"""Tokenization help for Python programs.\r
-\r
-generate_tokens(readline) is a generator that breaks a stream of\r
-text into Python tokens. It accepts a readline-like method which is called\r
-repeatedly to get the next line of input (or "" for EOF). It generates\r
-5-tuples with these members:\r
-\r
- the token type (see token.py)\r
- the token (a string)\r
- the starting (row, column) indices of the token (a 2-tuple of ints)\r
- the ending (row, column) indices of the token (a 2-tuple of ints)\r
- the original line (string)\r
-\r
-It is designed to match the working of the Python tokenizer exactly, except\r
-that it produces COMMENT tokens for comments and gives type OP for all\r
-operators\r
-\r
-Older entry points\r
- tokenize_loop(readline, tokeneater)\r
- tokenize(readline, tokeneater=printtoken)\r
-are the same, except instead of generating tokens, tokeneater is a callback\r
-function to which the 5 fields described above are passed as 5 arguments,\r
-each time a new token is found."""\r
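-
-# A minimal usage sketch (editor's illustration, not part of the original
-# module; assumes Python 2, where StringIO objects provide readline):
-#
-#     from StringIO import StringIO
-#     for tok_type, tok_str, start, end, line in \
-#             generate_tokens(StringIO("x = 1\n").readline):
-#         print tok_name[tok_type], repr(tok_str), start, end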
-
-__author__ = 'Ka-Ping Yee <ping@lfw.org>'
-__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
-               'Skip Montanaro, Raymond Hettinger')
-
-from itertools import chain
-import string, re
-from token import *
-
-import token
-__all__ = [x for x in dir(token) if not x.startswith("_")]
-__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
-del x
-del token
-
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-N_TOKENS += 2
-
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
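-# For example, group('a', 'b') produces '(a|b)', any(r'\d') produces
-# r'(\d)*', and maybe(r'\d') produces r'(\d)?'.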
-
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[a-zA-Z_]\w*'
-
-Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
-Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
-Binnumber = r'0[bB][01]+[lL]?'
-Decnumber = r'[1-9]\d*[lL]?'
-Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?\d+'
-Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
-Expfloat = r'\d+' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
-# Single-line ' or " string.
-String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
-                 r"//=?",
-                 r"[+\-*/%&|^=<>]=?",
-                 r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'[:;.,`@]')
-Funny = group(Operator, Bracket, Special)
-
-PlainToken = group(Number, Funny, String, Name)
-Token = Ignore + PlainToken
-
-# First (or only) line of ' or " string.
-ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
-PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
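-# Editor's note: PseudoToken is compiled into pseudoprog below and is matched
-# incrementally; group 1 spans the token itself, past any leading whitespace:
-#
-#     m = pseudoprog.match("x = 1\n", 0)
-#     m.span(1)                                 # -> (0, 1), the NAME 'x'
-#     pseudoprog.match("x = 1\n", 1).span(1)    # -> (2, 3), the OP '='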
-
-tokenprog, pseudoprog, single3prog, double3prog = map(
-    re.compile, (Token, PseudoToken, Single3, Double3))
-endprogs = {"'": re.compile(Single), '"': re.compile(Double),
-            "'''": single3prog, '"""': double3prog,
-            "r'''": single3prog, 'r"""': double3prog,
-            "u'''": single3prog, 'u"""': double3prog,
-            "ur'''": single3prog, 'ur"""': double3prog,
-            "R'''": single3prog, 'R"""': double3prog,
-            "U'''": single3prog, 'U"""': double3prog,
-            "uR'''": single3prog, 'uR"""': double3prog,
-            "Ur'''": single3prog, 'Ur"""': double3prog,
-            "UR'''": single3prog, 'UR"""': double3prog,
-            "b'''": single3prog, 'b"""': double3prog,
-            "br'''": single3prog, 'br"""': double3prog,
-            "B'''": single3prog, 'B"""': double3prog,
-            "bR'''": single3prog, 'bR"""': double3prog,
-            "Br'''": single3prog, 'Br"""': double3prog,
-            "BR'''": single3prog, 'BR"""': double3prog,
-            'r': None, 'R': None, 'u': None, 'U': None,
-            'b': None, 'B': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
-          "uR'''", 'uR"""', "UR'''", 'UR"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""'):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "u'", 'u"', "U'", 'U"',
-          "ur'", 'ur"', "Ur'", 'Ur"',
-          "uR'", 'uR"', "UR'", 'UR"',
-          "b'", 'b"', "B'", 'B"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"'):
-    single_quoted[t] = t
-
-tabsize = 8
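-# For instance, with tabsize = 8 a tab seen at column 3 advances the column
-# to (3//8 + 1)*8 == 8, which is how indentation is measured further below.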
-
-class TokenError(Exception): pass
-
-class StopTokenizing(Exception): pass
-
-def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
-    srow, scol = srow_scol
-    erow, ecol = erow_ecol
-    print "%d,%d-%d,%d:\t%s\t%s" % \
-        (srow, scol, erow, ecol, tok_name[type], repr(token))
-
-def tokenize(readline, tokeneater=printtoken):
-    """
-    The tokenize() function accepts two parameters: one representing the
-    input stream, and one providing an output mechanism for tokenize().
-
-    The first parameter, readline, must be a callable object which provides
-    the same interface as the readline() method of built-in file objects.
-    Each call to the function should return one line of input as a string.
-
-    The second parameter, tokeneater, must also be a callable object.  It is
-    called once for each token, with five arguments, corresponding to the
-    tuples generated by generate_tokens().
-    """
-    try:
-        tokenize_loop(readline, tokeneater)
-    except StopTokenizing:
-        pass
-
-# backwards compatible interface
-def tokenize_loop(readline, tokeneater):
-    for token_info in generate_tokens(readline):
-        tokeneater(*token_info)
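-
-# Editor's sketch of the callback interface (assumes an open file object f;
-# not part of the original module):
-#
-#     def eater(type, token, start, end, line):
-#         print tok_name[type], repr(token), start, end
-#     tokenize(f.readline, eater)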
-
-class Untokenizer:
-
-    def __init__(self):
-        self.tokens = []
-        self.prev_row = 1
-        self.prev_col = 0
-
-    def add_whitespace(self, start):
-        row, col = start
-        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
-            raise ValueError("start ({},{}) precedes previous end ({},{})"
-                             .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
-        col_offset = col - self.prev_col
-        if col_offset:
-            self.tokens.append(" " * col_offset)
-
-    def untokenize(self, iterable):
-        it = iter(iterable)
-        for t in it:
-            if len(t) == 2:
-                self.compat(t, it)
-                break
-            tok_type, token, start, end, line = t
-            if tok_type == ENDMARKER:
-                break
-            self.add_whitespace(start)
-            self.tokens.append(token)
-            self.prev_row, self.prev_col = end
-            if tok_type in (NEWLINE, NL):
-                self.prev_row += 1
-                self.prev_col = 0
-        return "".join(self.tokens)
-
-    def compat(self, token, iterable):
-        indents = []
-        toks_append = self.tokens.append
-        startline = token[0] in (NEWLINE, NL)
-        prevstring = False
-
-        for tok in chain([token], iterable):
-            toknum, tokval = tok[:2]
-
-            if toknum in (NAME, NUMBER):
-                tokval += ' '
-
-            # Insert a space between two consecutive strings
-            if toknum == STRING:
-                if prevstring:
-                    tokval = ' ' + tokval
-                prevstring = True
-            else:
-                prevstring = False
-
-            if toknum == INDENT:
-                indents.append(tokval)
-                continue
-            elif toknum == DEDENT:
-                indents.pop()
-                continue
-            elif toknum in (NEWLINE, NL):
-                startline = True
-            elif startline and indents:
-                toks_append(indents[-1])
-                startline = False
-            toks_append(tokval)
-
-def untokenize(iterable):
-    """Transform tokens back into Python source code.
-
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value.  If
-    only two-element tuples are passed, the resulting output is poor.
-
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly.
-
-    Round-trip invariant for limited input:
-        # Output text will tokenize back to the input
-        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
-        newcode = untokenize(t1)
-        readline = iter(newcode.splitlines(1)).next
-        t2 = [tok[:2] for tok in generate_tokens(readline)]
-        assert t1 == t2
-    """
-    ut = Untokenizer()
-    return ut.untokenize(iterable)
-
-def generate_tokens(readline):
-    """
-    The generate_tokens() generator requires one argument, readline, which
-    must be a callable object which provides the same interface as the
-    readline() method of built-in file objects.  Each call to the function
-    should return one line of input as a string.  Alternatively, readline
-    can be a callable that signals the end of input by raising StopIteration:
-        readline = open(myfile).next    # Example of alternate readline
-
-    The generator produces 5-tuples with these members: the token type; the
-    token string; a 2-tuple (srow, scol) of ints specifying the row and
-    column where the token begins in the source; a 2-tuple (erow, ecol) of
-    ints specifying the row and column where the token ends in the source;
-    and the line on which the token was found.  The line passed is the
-    logical line; continuation lines are included.
-    """
-    lnum = parenlev = continued = 0
-    namechars, numchars = string.ascii_letters + '_', '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
-    while 1:                                   # loop over lines in stream
-        try:
-            line = readline()
-        except StopIteration:
-            line = ''
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError, ("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield (STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield (ERRORTOKEN, contstr + line,
-                       strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    nl_pos = pos + len(comment_token)
-                    yield (COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    yield (NL, line[nl_pos:],
-                           (lnum, nl_pos), (lnum, len(line)), line)
-                else:
-                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = pseudoprog.match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if initial in numchars or \
-                   (initial == '.' and token != '.'):      # ordinary number
-                    yield (NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    yield (NL if parenlev > 0 else NEWLINE,
-                           token, spos, epos, line)
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield (COMMENT, token, spos, epos, line)
-                elif token in triple_quoted:
-                    endprog = endprogs[token]
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield (STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-                elif initial in single_quoted or \
-                    token[:2] in single_quoted or \
-                    token[:3] in single_quoted:
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        endprog = (endprogs[initial] or endprogs[token[1]] or
-                                   endprogs[token[2]])
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield (STRING, token, spos, epos, line)
-                elif initial in namechars:                 # ordinary name
-                    yield (NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield (OP, token, spos, epos, line)
-            else:
-                yield (ERRORTOKEN, line[pos],
-                       (lnum, pos), (lnum, pos+1), line)
-                pos += 1
-
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-
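-# Round-trip sketch (editor's illustration, not part of the original module):
-#
-#     from StringIO import StringIO
-#     source = "def f(a):\n    return a + 1\n"
-#     toks = list(generate_tokens(StringIO(source).readline))
-#     assert untokenize(toks) == source    # full 5-tuples restore the source
-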
-if __name__ == '__main__':                     # testing
-    import sys
-    if len(sys.argv) > 1:
-        tokenize(open(sys.argv[1]).readline)
-    else:
-        tokenize(sys.stdin.readline)