# NOTE: the two lines that preceded this file's docstring were HTML-extraction
# artifacts (gitweb page header from git.proxmox.com), not source code.
1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
25 __author__
= 'Ka-Ping Yee <ping@lfw.org>'
26 __credits__
= ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
27 'Skip Montanaro, Raymond Hettinger')
29 from itertools
import chain
34 __all__
= [x
for x
in dir(token
) if not x
.startswith("_")]
35 __all__
+= ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
40 tok_name
[COMMENT
] = 'COMMENT'
# Helpers for composing the big lexer regexes below.
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'      # zero or more (shadows builtin; kept for API compat)
def maybe(*choices): return group(*choices) + '?'    # zero or one

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Python 2 integer literals (the trailing [lL] is the py2 long suffix).
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
# NOTE(review): the last three Operator alternatives and Bracket were dropped
# by the HTML extraction; restored from the upstream Python 2.7 tokenizer.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map an opening quote (with optional string prefix) to the compiled regex
# that matches the *rest* of that string; bare prefixes map to None.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# NOTE(review): the dict initializations and loop bodies below were dropped
# by the HTML extraction; restored from the upstream Python 2.7 tokenizer.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8
class TokenError(Exception):
    """Raised when EOF is hit inside an unterminated string or statement."""
class StopTokenizing(Exception):
    """Raised by a tokeneater callback to make tokenize() stop early."""
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Default tokeneater: print one 'srow,scol-erow,ecol: TYPE repr' line.

    Used as the default callback of tokenize(); `line` is accepted (it is
    part of the 5-argument callback contract) but not printed.
    """
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    # Single parenthesized argument: identical output under Python 2's
    # print statement and also valid for Python 3 tooling.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # The callback deliberately stopped tokenization; not an error.
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple produced by generate_tokens() to `tokeneater`."""
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
# NOTE(review): method of class Untokenizer; the class header and __init__
# were dropped by the HTML extraction — re-indent under `class Untokenizer:`
# when reassembling the file.
def add_whitespace(self, start):
    """Emit filler so the next token lands at `start`, a (row, col) pair.

    Appends backslash-newline continuations to advance rows and spaces to
    advance columns, so untokenized output keeps the input's positions.
    Raises ValueError if `start` precedes the previously emitted end.
    """
    row, col = start
    if row < self.prev_row or row == self.prev_row and col < self.prev_col:
        raise ValueError("start ({},{}) precedes previous end ({},{})"
                         .format(row, col, self.prev_row, self.prev_col))
    row_offset = row - self.prev_row
    if row_offset:
        self.tokens.append("\\\n" * row_offset)
    col_offset = col - self.prev_col
    if col_offset:
        self.tokens.append(" " * col_offset)
# NOTE(review): method of class Untokenizer (header lost in extraction).
def untokenize(self, iterable):
    """Rebuild source text from full 5-tuple tokens.

    Falls back to self.compat() as soon as a 2-element (type, string)
    token is seen — degraded, position-free mode.  Stops at ENDMARKER
    and returns the accumulated source string.
    """
    for t in iterable:
        if len(t) == 2:
            self.compat(t, iterable)
            break
        tok_type, token, start, end, line = t
        if tok_type == ENDMARKER:
            break
        self.add_whitespace(start)
        self.tokens.append(token)
        self.prev_row, self.prev_col = end
        if tok_type in (NEWLINE, NL):
            # A newline token ends the row; next token starts at column 0.
            self.prev_row += 1
            self.prev_col = 0
    return "".join(self.tokens)
# NOTE(review): method of class Untokenizer (header lost in extraction).
def compat(self, token, iterable):
    """Position-free fallback: rebuild source from (type, string) pairs.

    `token` is the first 2-tuple already consumed by untokenize();
    `iterable` yields the rest.  Appends a space after names/numbers and
    between adjacent strings so the output re-tokenizes to the same
    stream, and replays the current indentation string at the start of
    each new logical line.
    """
    indents = []
    toks_append = self.tokens.append
    startline = token[0] in (NEWLINE, NL)
    prevstring = False

    for tok in chain([token], iterable):
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        # Insert a space between two consecutive strings
        if toknum == STRING:
            if prevstring:
                tokval = ' ' + tokval
            prevstring = True
        else:
            prevstring = False

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                # py2 wrote "raise TokenError, (...)"; this call form is
                # behaviorally identical and also parseable by py3 tools.
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the file named on the command line, or stdin otherwise.
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)