]>
Commit | Line | Data |
---|---|---|
3257aa99 DM |
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
# Re-export every public name of the token module plus this module's own
# additions; COMMENT and NL are not defined by the (py2) token module.
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x   # NOTE(review): relies on Python 2 list-comp scope leak; raises NameError on py3
del token

# Extra token types: this tokenizer surfaces comments and non-logical
# newlines instead of silently skipping them.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
44 | \r | |
def group(*choices):
    """Wrap the alternatives in a regex alternation group: ``(a|b|c)``."""
    return "({})".format("|".join(choices))

def any(*choices):
    """Alternation group repeated zero or more times.

    (Deliberately shadows the builtin ``any``; kept for compatibility.)
    """
    return group(*choices) + '*'

def maybe(*choices):
    """Alternation group matched zero or one time."""
    return group(*choices) + '?'
48 | \r | |
# Regular-expression fragments for each lexical class; group/any/maybe
# (defined above) build alternation groups with '*' / '?' repetition.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Optional whitespace, backslash line-continuations, optional trailing comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals; the optional [lL] suffix is Python 2's long-int marker.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
101 | \r | |
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Maps a string opener (quote, possibly with a u/b/r prefix) to the compiled
# pattern that matches the rest of the string.  Bare prefix letters map to
# None so generate_tokens can fall through to the next character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# Every recognized triple-quote opener (each legal prefix/case combination).
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
# Every recognized single-quote opener, same prefix combinations.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

tabsize = 8   # tab stops every 8 columns when measuring indentation
145 | \r | |
class TokenError(Exception):
    """Raised when the input ends inside an unterminated string/statement."""


class StopTokenizing(Exception):
    """Raised by a tokeneater callback to end tokenize() early."""
149 | \r | |
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print one token as 'srow,scol-erow,ecol:<TAB>TYPE<TAB>repr'.

    Default tokeneater callback for tokenize().  (Python 2 print statement.)
    """
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
155 | \r | |
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize a stream, invoking *tokeneater* once per token.

    readline must behave like the readline() method of a built-in file
    object: each call returns the next line of input as a string, or ""
    at EOF.

    tokeneater is called with the five values generate_tokens() would
    yield for each token.  It may end the run early by raising
    StopTokenizing, which is absorbed silently here.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Deliberate early exit requested by the callback.
        pass
173 | \r | |
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple from generate_tokens(readline) to *tokeneater*."""
    for info in generate_tokens(readline):
        tokeneater(*info)
178 | \r | |
class Untokenizer:
    """Reassemble source text from an iterable of token tuples.

    Full 5-tuples reproduce the original layout (token positions are
    honored); bare 2-tuples fall back to compat(), which only inserts
    enough whitespace for the output to re-tokenize to the same stream.
    """

    def __init__(self):
        self.tokens = []     # output fragments, joined by untokenize()
        self.prev_row = 1    # row just past the last emitted token
        self.prev_col = 0    # column just past the last emitted token

    def add_whitespace(self, start):
        """Pad the output so the next token begins at *start* (row, col)."""
        row, col = start
        if row < self.prev_row or (row == self.prev_row and col < self.prev_col):
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Bridge skipped rows with escaped newlines.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Consume token tuples and return the reconstructed source string."""
        stream = iter(iterable)
        for tok in stream:
            if len(tok) == 2:
                # No positional info: degrade to compat mode for the rest.
                self.compat(tok, stream)
                break
            tok_type, text, start, end, _line = tok
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(text)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Handle (type, string) pairs: guess whitespace and indentation."""
        indents = []
        emit = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            # Names and numbers get a trailing separator so adjacent
            # tokens cannot fuse together when re-tokenized.
            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings.
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token on a fresh line: re-emit the current indent.
                emit(indents[-1])
                startline = False
            emit(tokval)
248 | \r | |
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element of *iterable* must be a token sequence with at least two
    members, a token number and a token value.  If only two-element
    tuples are passed, the resulting output is poor.

    Round-trip invariant for full 5-tuple input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
269 | \r | |
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when EOF is reached inside a multi-line string or
    backslash-continued statement, IndentationError on a bad dedent.
    """
    # Tokenizer state:
    #   lnum      - current (1-based) input line number
    #   parenlev  - (), [], {} nesting depth; newlines inside brackets
    #               are NL rather than NEWLINE
    #   continued - previous line ended with a backslash continuation
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr accumulates the text of a string literal spanning lines;
    # needcont means the continuation must be backslash-escaped
    # (single-quoted string), contline the accumulated raw lines.
    contstr, needcont = '', 0
    contline = None
    indents = [0]   # stack of active indentation column widths

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string whose line did not end with a
                # backslash: flush what we have as an error token.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    # formfeed resets the column count
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # Blank line: the '#' case was handled above, so this
                    # tuple-index always selects NL here.
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Pick the end-pattern for whichever quote char
                        # actually opened the string (skip any prefix).
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
426 | \r | |
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the named file, or stdin when no argument is given.
    args = sys.argv[1:]
    if args:
        tokenize(open(args[0]).readline)
    else:
        tokenize(sys.stdin.readline)