# NOTE: the two lines that preceded this file's docstring were HTML-extraction
# artifacts (gitweb page header from git.proxmox.com), not source code.
1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
25 __author__
= 'Ka-Ping Yee <ping@lfw.org>'
26 __credits__
= ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
27 'Skip Montanaro, Raymond Hettinger')
29 from itertools
import chain
34 __all__
= [x
for x
in dir(token
) if not x
.startswith("_")]
35 __all__
+= ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
40 tok_name
[COMMENT
] = 'COMMENT'
# Helpers for composing the big lexer regexes below.
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'      # zero or more (shadows builtin; kept for API compat)
def maybe(*choices): return group(*choices) + '?'    # zero or one

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Python 2 integer literals (the trailing [lL] is the py2 long suffix).
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
# NOTE(review): the last three Operator alternatives and Bracket were dropped
# by the HTML extraction; restored from the upstream Python 2.7 tokenizer.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map an opening quote (with optional string prefix) to the compiled regex
# that matches the *rest* of that string; bare prefixes map to None.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# NOTE(review): the dict initializations and loop bodies below were dropped
# by the HTML extraction; restored from the upstream Python 2.7 tokenizer.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8
class TokenError(Exception):
    """Raised when EOF is hit inside an unterminated string or statement."""
class StopTokenizing(Exception):
    """Raised by a tokeneater callback to make tokenize() stop early."""
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Default tokeneater: print one 'srow,scol-erow,ecol: TYPE repr' line.

    Used as the default callback of tokenize(); `line` is accepted (it is
    part of the 5-argument callback contract) but not printed.
    """
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    # Single parenthesized argument: identical output under Python 2's
    # print statement and also valid for Python 3 tooling.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # The callback deliberately stopped tokenization; not an error.
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple produced by generate_tokens() to `tokeneater`."""
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
# NOTE(review): method of class Untokenizer; the class header and __init__
# were dropped by the HTML extraction — re-indent under `class Untokenizer:`
# when reassembling the file.
def add_whitespace(self, start):
    """Emit filler so the next token lands at `start`, a (row, col) pair.

    Appends backslash-newline continuations to advance rows and spaces to
    advance columns, so untokenized output keeps the input's positions.
    Raises ValueError if `start` precedes the previously emitted end.
    """
    row, col = start
    if row < self.prev_row or row == self.prev_row and col < self.prev_col:
        raise ValueError("start ({},{}) precedes previous end ({},{})"
                         .format(row, col, self.prev_row, self.prev_col))
    row_offset = row - self.prev_row
    if row_offset:
        self.tokens.append("\\\n" * row_offset)
    col_offset = col - self.prev_col
    if col_offset:
        self.tokens.append(" " * col_offset)
# NOTE(review): method of class Untokenizer (header lost in extraction).
def untokenize(self, iterable):
    """Rebuild source text from full 5-tuple tokens.

    Falls back to self.compat() as soon as a 2-element (type, string)
    token is seen — degraded, position-free mode.  Stops at ENDMARKER
    and returns the accumulated source string.
    """
    for t in iterable:
        if len(t) == 2:
            self.compat(t, iterable)
            break
        tok_type, token, start, end, line = t
        if tok_type == ENDMARKER:
            break
        self.add_whitespace(start)
        self.tokens.append(token)
        self.prev_row, self.prev_col = end
        if tok_type in (NEWLINE, NL):
            # A newline token ends the row; next token starts at column 0.
            self.prev_row += 1
            self.prev_col = 0
    return "".join(self.tokens)
# NOTE(review): method of class Untokenizer (header lost in extraction).
def compat(self, token, iterable):
    """Position-free fallback: rebuild source from (type, string) pairs.

    `token` is the first 2-tuple already consumed by untokenize();
    `iterable` yields the rest.  Appends a space after names/numbers and
    between adjacent strings so the output re-tokenizes to the same
    stream, and replays the current indentation string at the start of
    each new logical line.
    """
    indents = []
    toks_append = self.tokens.append
    startline = token[0] in (NEWLINE, NL)
    prevstring = False

    for tok in chain([token], iterable):
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        # Insert a space between two consecutive strings
        if toknum == STRING:
            if prevstring:
                tokval = ' ' + tokval
            prevstring = True
        else:
            prevstring = False

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                # py2 wrote "raise TokenError, (...)"; this call form is
                # behaviorally identical and also parseable by py3 tools.
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the file named on the command line, or stdin otherwise.
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)