# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
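
# A minimal usage sketch of the generator interface described above; the
# helper name and the in-memory StringIO source are illustrative only and
# are not used elsewhere in this module.
def _example_token_stream():
    from StringIO import StringIO
    source = "x = 1 + 2\n"
    readline = StringIO(source).readline
    # Each item is (type, string, (srow, scol), (erow, ecol)); the raw line
    # is dropped here just to keep the example output small.
    return [(tok_type, tok_str, start, end)
            for tok_type, tok_str, start, end, line in generate_tokens(readline)]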

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
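
# For example, group('a', 'b') evaluates to '(a|b)', any(r'\d') to r'(\d)*',
# and maybe(r'\d') to r'(\d)?'; the token patterns below are built by
# composing these three helpers.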

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
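
# A minimal sketch of the callback-style entry point above: any callable with
# the same five-argument signature as printtoken() can serve as tokeneater.
# The helper name and the StringIO source are illustrative only.
def _example_tokenize_callback():
    from StringIO import StringIO
    collected = []
    def tokeneater(type, token, start, end, line):
        collected.append((tok_name[type], token))
    tokenize(StringIO("a = 1\n").readline, tokeneater)
    return collected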

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        from itertools import chain
        startline = False
        indents = []
        toks_append = self.tokens.append
        # Feed the leading token through the same loop as the rest of the
        # stream so its text is included in the reconstructed source.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
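
# For example, _get_normal_name("UTF_8") and _get_normal_name("utf-8-unix")
# both normalize to "utf-8", while an unrecognized name such as "cp1252" is
# returned unchanged.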

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 0263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised. Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
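
# A minimal sketch of detect_encoding() on an in-memory byte stream; the
# helper name and the sample source are illustrative only. The lines already
# consumed are returned so a caller can replay them into generate_tokens().
def _example_detect_encoding():
    from StringIO import StringIO
    source = "# -*- coding: iso-8859-1 -*-\nx = 1\n"
    encoding, consumed = detect_encoding(StringIO(source).readline)
    return encoding, consumed   # ('iso-8859-1', ['# -*- coding: iso-8859-1 -*-\n'])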

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two-element tuples are passed, the result loses the original
    spacing between tokens.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
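
# A minimal sketch of the limited (two-element) round-trip invariant from the
# docstring above; the helper name and the sample source are illustrative only.
def _example_untokenize_roundtrip():
    from StringIO import StringIO
    source = "x = 1\nif x:\n    x = 2\n"
    t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(StringIO(newcode).readline)]
    assert t1 == t2
    return newcode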

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

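# A minimal sketch of the StopIteration-terminated readline variant mentioned
# in the generate_tokens() docstring, built from an in-memory list of lines
# rather than an open file; the helper name is illustrative only.
def _example_iter_readline():
    lines = iter(["x = 1\n", "y = 2\n"])
    return [tok[:2] for tok in generate_tokens(lines.next)]
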
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)