#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

18 """Tokenize C++ source code."""
19
20 try:
21 # Python 3.x
22 import builtins
23 except ImportError:
24 # Python 2.x
25 import __builtin__ as builtins
26
27
28 import sys
29
30 from cpp import utils
31
32
33 if not hasattr(builtins, 'set'):
34 # Nominal support for Python 2.3.
35 from sets import Set as set
36
37
# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

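# Illustrative examples (as produced by GetTokens below): an identifier such
# as `foo` tokenizes as NAME, punctuation such as `::` or `{` as SYNTAX,
# literals such as `42`, `0x1F`, `"str"`, and `'c'` as CONSTANT, and a line
# starting with `#` as PREPROCESSOR.
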
# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
  """Data container to represent a C++ token.

  Tokens can be identifiers, syntax char(s), constants, or
  pre-processor directives.

  start contains the index of the first char of the token in the source
  end contains the index one past the last char of the token in the source
  """

  def __init__(self, token_type, name, start, end):
    self.token_type = token_type
    self.name = name
    self.start = start
    self.end = end
    self.whence = WHENCE_STREAM

  def __str__(self):
    if not utils.DEBUG:
      return 'Token(%r)' % self.name
    return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

  __repr__ = __str__


def _GetString(source, start, i):
  """Returns the index one past the closing quote of the string literal
  whose opening quote is at index i."""
  i = source.find('"', i+1)
  while source[i-1] == '\\':
    # Count the trailing backslashes.
    backslash_count = 1
    j = i - 2
    while source[j] == '\\':
      backslash_count += 1
      j -= 1
    # When trailing backslashes are even, they escape each other.
    if (backslash_count % 2) == 0:
      break
    i = source.find('"', i+1)
  return i + 1


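# A worked example of the backslash counting above (illustrative): in the
# 8-char source r'"a\"b" x', the quote at index 3 follows one backslash (an
# odd count, so it is escaped) and is skipped; the quote at index 5 follows
# 'b' and closes the string, so _GetString(source, 0, 0) returns 6.
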
def _GetChar(source, start, i):
  """Returns the index one past the closing quote of the char literal
  whose opening quote is at index i (start + 1 if it is unterminated)."""
  # NOTE(nnorwitz): may not be quite correct, should be good enough.
  i = source.find("'", i+1)
  while source[i-1] == '\\':
    # Need to special case '\\'.
    if (i - 2) > start and source[i-2] == '\\':
      break
    i = source.find("'", i+1)
  # Try to handle unterminated single quotes (in a #if 0 block).
  if i < 0:
    i = start
  return i + 1


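# Worked examples of the special case above (illustrative): for the 4-char
# literal '\n', the closing quote at index 3 follows 'n', so _GetChar returns
# 4 directly; for '\\', the closing quote follows a backslash, but the second
# backslash at index 1 triggers the break, so 4 is returned as well.
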
def GetTokens(source):
  """Returns a sequence of Tokens.

  Args:
    source: string of C++ source code.

  Yields:
    Token that represents the next token in the source.
  """
  # Cache various valid character sets for speed.
  valid_identifier_chars = VALID_IDENTIFIER_CHARS
  hex_digits = HEX_DIGITS
  int_or_float_digits = INT_OR_FLOAT_DIGITS
  int_or_float_digits2 = int_or_float_digits | set('.')

  # Only ignore errors while in a #if 0 block.
  ignore_errors = False
  count_ifs = 0

  i = 0
  end = len(source)
  while i < end:
    # Skip whitespace.
    while i < end and source[i].isspace():
      i += 1
    if i >= end:
      return

    token_type = UNKNOWN
    start = i
    c = source[i]
    if c.isalpha() or c == '_':              # Find a string token.
      token_type = NAME
      while source[i] in valid_identifier_chars:
        i += 1
      # String and character constants can look like a name if
      # they are something like L"".
      if (source[i] == "'" and (i - start) == 1 and
          source[start:i] in 'uUL'):
        # u, U, and L are valid C++0x character prefixes.
        token_type = CONSTANT
        i = _GetChar(source, start, i)
      elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
        token_type = CONSTANT
        i = _GetString(source, start, i)
    elif c == '/' and source[i+1] == '/':    # Find // comments.
      i = source.find('\n', i)
      if i == -1:  # Handle EOF.
        i = end
      continue
    elif c == '/' and source[i+1] == '*':    # Find /* comments. */
      i = source.find('*/', i) + 2
      continue
    elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
      token_type = SYNTAX
      i += 1
      new_ch = source[i]
      if new_ch == c and c != '>':           # Treat ">>" as two tokens.
        i += 1
      elif c == '-' and new_ch == '>':
        i += 1
      elif new_ch == '=':
        i += 1
    elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
      token_type = SYNTAX
      i += 1
      if c == '.' and source[i].isdigit():
        token_type = CONSTANT
        i += 1
        while source[i] in int_or_float_digits:
          i += 1
        # Handle float suffixes.
        for suffix in ('l', 'f'):
          if suffix == source[i:i+1].lower():
            i += 1
            break
    elif c.isdigit():                        # Find integer.
      token_type = CONSTANT
      if c == '0' and source[i+1] in 'xX':
        # Handle hex digits.
        i += 2
        while source[i] in hex_digits:
          i += 1
      else:
        while source[i] in int_or_float_digits2:
          i += 1
      # Handle integer (and float) suffixes.
      for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
        size = len(suffix)
        if suffix == source[i:i+size].lower():
          i += size
          break
    elif c == '"':                           # Find string.
      token_type = CONSTANT
      i = _GetString(source, start, i)
    elif c == "'":                           # Find char.
      token_type = CONSTANT
      i = _GetChar(source, start, i)
    elif c == '#':                           # Find pre-processor command.
      token_type = PREPROCESSOR
      got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
      if got_if:
        count_ifs += 1
      elif source[i:i+6] == '#endif':
        count_ifs -= 1
        if count_ifs == 0:
          ignore_errors = False

      # TODO(nnorwitz): handle preprocessor statements (\ continuations).
      while 1:
        i1 = source.find('\n', i)
        i2 = source.find('//', i)
        i3 = source.find('/*', i)
        i4 = source.find('"', i)
        # NOTE(nnorwitz): doesn't handle comments in #define macros.
        # Get the first important symbol (newline, comment, EOF/end).
        i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

        # Handle #include "dir//foo.h" properly.
        if source[i] == '"':
          i = source.find('"', i+1) + 1
          assert i > 0
          continue
        # Keep going if end of the line and the line ends with \.
        if not (i == i1 and source[i-1] == '\\'):
          if got_if:
            condition = source[start+4:i].lstrip()
            if (condition.startswith('0') or
                condition.startswith('(0)')):
              ignore_errors = True
          break
        i += 1
    elif c == '\\':                          # Handle \ in code.
      # This is different from the pre-processor \ handling.
      i += 1
      continue
    elif ignore_errors:
      # The tokenizer seems to be in pretty good shape.  This
      # raise is conditionally disabled so that bogus code
      # in an #if 0 block can be handled.  Since we will ignore
      # it anyway, this is probably fine.  So disable the
      # exception and return the bogus char.
      i += 1
    else:
      sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                       ('?', i, c, source[i-10:i+10]))
      raise RuntimeError('unexpected token')

    if i <= 0:
      print('Invalid index, exiting now.')
      return
    yield Token(token_type, source[start:i], start, i)


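# A minimal usage sketch (illustrative; the driver below does the same for
# whole files):
#
#   for token in GetTokens('int x = 0x1F;'):
#     print(token.token_type, token.name)
#
# prints NAME int, NAME x, SYNTAX =, CONSTANT 0x1F, SYNTAX ;.
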
if __name__ == '__main__':
  def main(argv):
    """Driver mostly for testing purposes."""
    for filename in argv[1:]:
      source = utils.ReadFile(filename)
      if source is None:
        continue

      for token in GetTokens(source):
        print('%-12s: %s' % (token.token_type, token.name))
        # print('\r%6.2f%%' % (100.0 * index / token.end),)
      sys.stdout.write('\n')


  main(sys.argv)