1"""Tokenization help for Python programs.\r
2\r
3generate_tokens(readline) is a generator that breaks a stream of\r
4text into Python tokens. It accepts a readline-like method which is called\r
5repeatedly to get the next line of input (or "" for EOF). It generates\r
65-tuples with these members:\r
7\r
8 the token type (see token.py)\r
9 the token (a string)\r
10 the starting (row, column) indices of the token (a 2-tuple of ints)\r
11 the ending (row, column) indices of the token (a 2-tuple of ints)\r
12 the original line (string)\r
13\r
14It is designed to match the working of the Python tokenizer exactly, except\r
15that it produces COMMENT tokens for comments and gives type OP for all\r
16operators\r
17\r
18Older entry points\r
19 tokenize_loop(readline, tokeneater)\r
20 tokenize(readline, tokeneater=printtoken)\r
21are the same, except instead of generating tokens, tokeneater is a callback\r
22function to which the 5 fields described above are passed as 5 arguments,\r
23each time a new token is found."""\r
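
# A minimal usage sketch: feeding a string to generate_tokens() through
# StringIO and printing each 5-tuple.  Only names defined in this module
# and in the standard library are assumed.
#
#     import StringIO
#     readline = StringIO.StringIO("x = 1\n").readline
#     for tok_type, tok_str, start, end, logical_line in \
#             generate_tokens(readline):
#         print tok_name[tok_type], repr(tok_str), start, end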

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
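# For example, group('a', 'b') builds '(a|b)', any(r'\d') builds '(\d)*',
# and maybe(r'\d') builds '(\d)?'; the compound token patterns below are
# assembled from these helpers.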

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
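# For example, Number matches '0x1fL', '0b101', '3.14e-2' and '10j'.
# Imagnumber and Floatnumber are listed before Intnumber so that '3.14'
# is matched as one float rather than stopping at the integer '3'.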

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
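# PseudoToken is the pattern matched repeatedly by the main loop in
# generate_tokens(): optional whitespace followed by one token, or one of
# the PseudoExtras (a continuation backslash, a comment, a triple-quote
# opener, or end of input).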

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
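
# A usage sketch for the callback interface; count_names, counts and
# 'example.py' are placeholders used only for illustration.
#
#     counts = {}
#     def count_names(tok_type, tok_str, start, end, logical_line):
#         if tok_type == NAME:
#             counts[tok_str] = counts.get(tok_str, 0) + 1
#     tokenize(open('example.py').readline, count_names)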

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:
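    """Rebuild source text from tokens; used by the untokenize() function."""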

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
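        """Pad the output with escaped newlines and spaces up to `start`."""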
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
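        """Handle bare (type, string) pairs; output spacing is approximate."""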
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only these two elements are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
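
# A round-trip sketch, assuming `source` is a string of complete Python
# source text ending in a newline (so full 5-tuples are available and the
# exact-match invariant above applies):
#
#     import StringIO
#     tokens = generate_tokens(StringIO.StringIO(source).readline)
#     assert untokenize(tokens) == source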

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
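    # State used below: lnum is the current line number, parenlev the
    # bracket nesting depth, continued is set after a backslash
    # continuation, contstr/contline accumulate a string that spans lines,
    # and indents is the stack of indentation columns.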
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)