#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
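# For example, a literal such as L"wide" or u8R"(raw)" is scanned by
# GetTokens below as a single CONSTANT token that includes the prefix.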


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index just past the last char of the token in the source
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__
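
# A minimal usage sketch (illustrative values, not part of the module API):
#
#   >>> token = Token(NAME, 'foo', 0, 3)
#   >>> token.token_type == NAME
#   True
#   >>> str(token)                    # With utils.DEBUG off.
#   "Token('foo')"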


def _GetString(source, start, i):
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1
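
# A sketch of _GetString's contract: i is the index of the opening quote, and
# the return value is the index just past the closing quote.  Escaped quotes
# (\") are skipped; an even run of trailing backslashes does not escape.
#
#   >>> _GetString(r'"a\"b" rest', 0, 0)
#   6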


def _GetChar(source, start, i):
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
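
# A sketch of _GetChar's contract, mirroring _GetString for single quotes:
#
#   >>> _GetChar("'a' + 1", 0, 0)
#   3
#   >>> _GetChar(r"'\''", 0, 0)
#   4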


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                # A string prefix is followed by a double quote.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:                          # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i)
            # Handle an unterminated comment; otherwise the scan restarts
            # near the beginning of the source and loops forever.
            i = end if i == -1 else i + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
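            # For example (a sketch): '0xFFull' is consumed here as a single
            # CONSTANT token; the hex scan stops at the first 'u' and the
            # suffix loop above then matches 'ull'.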
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                                condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyway, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
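
# A short illustration of the resulting token stream (a sketch, assuming the
# behavior coded above):
#
#   >>> [t.name for t in GetTokens('int x = 42;')]
#   ['int', 'x', '=', '42', ';']
#   >>> [t.token_type for t in GetTokens('int x = 42;')]
#   ['NAME', 'NAME', 'SYNTAX', 'CONSTANT', 'SYNTAX']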


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)
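
# Example invocation (a sketch; 'sample.cc' is a hypothetical file):
#
#   $ python tokenize.py sample.cc
#   NAME        : int
#   NAME        : main
#   SYNTAX      : (
#   ...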