#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

18 """Tokenize C++ source code."""
19
20 try:
21 # Python 3.x
22 import builtins
23 except ImportError:
24 # Python 2.x
25 import __builtin__ as builtins
26
27
28 import sys
29
30 from cpp import utils
31
32
33 if not hasattr(builtins, 'set'):
34 # Nominal support for Python 2.3.
35 from sets import Set as set
36
37
# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

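# Illustrative examples (as produced by GetTokens below): an identifier such
# as `foo` tokenizes as NAME, punctuation such as `::` or `{` as SYNTAX,
# literals such as `42`, `0x1F`, `"str"`, and `'c'` as CONSTANT, and a line
# starting with `#` as PREPROCESSOR.
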
# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
  """Data container to represent a C++ token.

  Tokens can be identifiers, syntax char(s), constants, or
  pre-processor directives.

  start contains the index of the first char of the token in the source
  end contains the index one past the last char of the token in the source
  """

  def __init__(self, token_type, name, start, end):
    self.token_type = token_type
    self.name = name
    self.start = start
    self.end = end
    self.whence = WHENCE_STREAM

  def __str__(self):
    if not utils.DEBUG:
      return 'Token(%r)' % self.name
    return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

  __repr__ = __str__


def _GetString(source, start, i):
  """Returns the index one past the closing quote of the string literal
  whose opening quote is at index i."""
  i = source.find('"', i+1)
  while source[i-1] == '\\':
    # Count the trailing backslashes.
    backslash_count = 1
    j = i - 2
    while source[j] == '\\':
      backslash_count += 1
      j -= 1
    # When trailing backslashes are even, they escape each other.
    if (backslash_count % 2) == 0:
      break
    i = source.find('"', i+1)
  return i + 1


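# A worked example of the backslash counting above (illustrative): in the
# 8-char source r'"a\"b" x', the quote at index 3 follows one backslash (an
# odd count, so it is escaped) and is skipped; the quote at index 5 follows
# 'b' and closes the string, so _GetString(source, 0, 0) returns 6.
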
def _GetChar(source, start, i):
  """Returns the index one past the closing quote of the char literal
  whose opening quote is at index i (start + 1 if it is unterminated)."""
  # NOTE(nnorwitz): may not be quite correct, should be good enough.
  i = source.find("'", i+1)
  while source[i-1] == '\\':
    # Need to special case '\\'.
    if (i - 2) > start and source[i-2] == '\\':
      break
    i = source.find("'", i+1)
  # Try to handle unterminated single quotes (in a #if 0 block).
  if i < 0:
    i = start
  return i + 1


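# Worked examples of the special case above (illustrative): for the 4-char
# literal '\n', the closing quote at index 3 follows 'n', so _GetChar returns
# 4 directly; for '\\', the closing quote follows a backslash, but the second
# backslash at index 1 triggers the break, so 4 is returned as well.
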
def GetTokens(source):
  """Returns a sequence of Tokens.

  Args:
    source: string of C++ source code.

  Yields:
    Token that represents the next token in the source.
  """
  # Cache various valid character sets for speed.
  valid_identifier_chars = VALID_IDENTIFIER_CHARS
  hex_digits = HEX_DIGITS
  int_or_float_digits = INT_OR_FLOAT_DIGITS
  int_or_float_digits2 = int_or_float_digits | set('.')

  # Only ignore errors while in a #if 0 block.
  ignore_errors = False
  count_ifs = 0

  i = 0
  end = len(source)
  while i < end:
    # Skip whitespace.
    while i < end and source[i].isspace():
      i += 1
    if i >= end:
      return

    token_type = UNKNOWN
    start = i
    c = source[i]
    if c.isalpha() or c == '_':              # Find a string token.
      token_type = NAME
      while source[i] in valid_identifier_chars:
        i += 1
      # String and character constants can look like a name if
      # they are something like L"".
      if (source[i] == "'" and (i - start) == 1 and
          source[start:i] in 'uUL'):
        # u, U, and L are valid C++0x character prefixes.
        token_type = CONSTANT
        i = _GetChar(source, start, i)
      elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
        token_type = CONSTANT
        i = _GetString(source, start, i)
    elif c == '/' and source[i+1] == '/':    # Find // comments.
      i = source.find('\n', i)
      if i == -1:  # Handle EOF.
        i = end
      continue
    elif c == '/' and source[i+1] == '*':    # Find /* comments. */
      i = source.find('*/', i) + 2
      continue
    elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
      token_type = SYNTAX
      i += 1
      new_ch = source[i]
      if new_ch == c and c != '>':           # Treat ">>" as two tokens.
        i += 1
      elif c == '-' and new_ch == '>':
        i += 1
      elif new_ch == '=':
        i += 1
    elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
      token_type = SYNTAX
      i += 1
      if c == '.' and source[i].isdigit():
        token_type = CONSTANT
        i += 1
        while source[i] in int_or_float_digits:
          i += 1
        # Handle float suffixes.
        for suffix in ('l', 'f'):
          if suffix == source[i:i+1].lower():
            i += 1
            break
    elif c.isdigit():                        # Find integer.
      token_type = CONSTANT
      if c == '0' and source[i+1] in 'xX':
        # Handle hex digits.
        i += 2
        while source[i] in hex_digits:
          i += 1
      else:
        while source[i] in int_or_float_digits2:
          i += 1
      # Handle integer (and float) suffixes.
      for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
        size = len(suffix)
        if suffix == source[i:i+size].lower():
          i += size
          break
    elif c == '"':                           # Find string.
      token_type = CONSTANT
      i = _GetString(source, start, i)
    elif c == "'":                           # Find char.
      token_type = CONSTANT
      i = _GetChar(source, start, i)
    elif c == '#':                           # Find pre-processor command.
      token_type = PREPROCESSOR
      got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
      if got_if:
        count_ifs += 1
      elif source[i:i+6] == '#endif':
        count_ifs -= 1
        if count_ifs == 0:
          ignore_errors = False

      # TODO(nnorwitz): handle preprocessor statements (\ continuations).
      while 1:
        i1 = source.find('\n', i)
        i2 = source.find('//', i)
        i3 = source.find('/*', i)
        i4 = source.find('"', i)
        # NOTE(nnorwitz): doesn't handle comments in #define macros.
        # Get the first important symbol (newline, comment, EOF/end).
        i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

        # Handle #include "dir//foo.h" properly.
        if source[i] == '"':
          i = source.find('"', i+1) + 1
          assert i > 0
          continue
        # Keep going if end of the line and the line ends with \.
        if not (i == i1 and source[i-1] == '\\'):
          if got_if:
            condition = source[start+4:i].lstrip()
            if (condition.startswith('0') or
                condition.startswith('(0)')):
              ignore_errors = True
          break
        i += 1
    elif c == '\\':                          # Handle \ in code.
      # This is different from the pre-processor \ handling.
      i += 1
      continue
    elif ignore_errors:
      # The tokenizer seems to be in pretty good shape.  This
      # raise is conditionally disabled so that bogus code
      # in an #if 0 block can be handled.  Since we will ignore
      # it anyway, this is probably fine.  So disable the
      # exception and return the bogus char.
      i += 1
    else:
      sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                       ('?', i, c, source[i-10:i+10]))
      raise RuntimeError('unexpected token')

    if i <= 0:
      print('Invalid index, exiting now.')
      return
    yield Token(token_type, source[start:i], start, i)


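# A minimal usage sketch (illustrative; the driver below does the same for
# whole files):
#
#   for token in GetTokens('int x = 0x1F;'):
#     print(token.token_type, token.name)
#
# prints NAME int, NAME x, SYNTAX =, CONSTANT 0x1F, SYNTAX ;.
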
if __name__ == '__main__':
  def main(argv):
    """Driver mostly for testing purposes."""
    for filename in argv[1:]:
      source = utils.ReadFile(filename)
      if source is None:
        continue

      for token in GetTokens(source):
        print('%-12s: %s' % (token.token_type, token.name))
        # print('\r%6.2f%%' % (100.0 * index / token.end),)
      sys.stdout.write('\n')


  main(sys.argv)