[mirror_qemu.git] / qobject / json-lexer.c

/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qapi/qmp/json-lexer.h"
#include "qapi/qmp/json-streamer.h"

/*
 * Upper bound, in bytes, on the accumulated text of a single token
 * (64 << 20 = 64 MiB).  json_lexer_feed_char() force-emits and resets
 * any token whose buffer grows beyond this.
 */
#define MAX_TOKEN_SIZE (64ULL << 20)

/*
 * From RFC 8259 "The JavaScript Object Notation (JSON) Data
 * Interchange Format", with [comments in brackets]:
 *
 * The set of tokens includes six structural characters, strings,
 * numbers, and three literal names.
 *
 * These are the six structural characters:
 *
 *    begin-array     = ws %x5B ws  ; [ left square bracket
 *    begin-object    = ws %x7B ws  ; { left curly bracket
 *    end-array       = ws %x5D ws  ; ] right square bracket
 *    end-object      = ws %x7D ws  ; } right curly bracket
 *    name-separator  = ws %x3A ws  ; : colon
 *    value-separator = ws %x2C ws  ; , comma
 *
 * Insignificant whitespace is allowed before or after any of the six
 * structural characters.
 * [This lexer accepts it before or after any token, which is actually
 * the same, as the grammar always has structural characters between
 * other tokens.]
 *
 *    ws = *(
 *           %x20 /              ; Space
 *           %x09 /              ; Horizontal tab
 *           %x0A /              ; Line feed or New line
 *           %x0D )              ; Carriage return
 *
 * [...] three literal names:
 *    false null true
 *  [This lexer accepts [a-z]+, and leaves rejecting unknown literal
 *  names to the parser.]
 *
 * [Numbers:]
 *
 *    number = [ minus ] int [ frac ] [ exp ]
 *    decimal-point = %x2E       ; .
 *    digit1-9 = %x31-39         ; 1-9
 *    e = %x65 / %x45            ; e E
 *    exp = e [ minus / plus ] 1*DIGIT
 *    frac = decimal-point 1*DIGIT
 *    int = zero / ( digit1-9 *DIGIT )
 *    minus = %x2D               ; -
 *    plus = %x2B                ; +
 *    zero = %x30                ; 0
 *
 * [Strings:]
 *    string = quotation-mark *char quotation-mark
 *
 *    char = unescaped /
 *        escape (
 *            %x22 /          ; "    quotation mark  U+0022
 *            %x5C /          ; \    reverse solidus U+005C
 *            %x2F /          ; /    solidus         U+002F
 *            %x62 /          ; b    backspace       U+0008
 *            %x66 /          ; f    form feed       U+000C
 *            %x6E /          ; n    line feed       U+000A
 *            %x72 /          ; r    carriage return U+000D
 *            %x74 /          ; t    tab             U+0009
 *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
 *    escape = %x5C              ; \
 *    quotation-mark = %x22      ; "
 *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
 *    [This lexer accepts any non-control character after escape, and
 *    leaves rejecting invalid ones to the parser.]
 *
 *
 * Extensions over RFC 8259:
 * - Extra escape sequence in strings:
 *   0x27 (apostrophe) is recognized after escape, too
 * - Single-quoted strings:
 *   Like double-quoted strings, except they're delimited by %x27
 *   (apostrophe) instead of %x22 (quotation mark), and can't contain
 *   unescaped apostrophe, but can contain unescaped quotation mark.
 * - Interpolation, if enabled:
 *   The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid
 *   ones to the parser.
 *
 * Note:
 * - Input must be encoded in modified UTF-8.
 * - Decoding and validating is left to the parser.
 */

/*
 * States of the lexer's state machine.  Each value is used as a row
 * index into json_lexer[]; the per-state comments summarize which
 * input led to the state according to that table.
 */
enum json_lexer_state {
    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
    IN_DQ_STRING_ESCAPE,        /* after backslash in a double-quoted string */
    IN_DQ_STRING,               /* inside a double-quoted string */
    IN_SQ_STRING_ESCAPE,        /* after backslash in a single-quoted string */
    IN_SQ_STRING,               /* inside a single-quoted string */
    IN_ZERO,                    /* seen a leading '0' */
    IN_EXP_DIGITS,              /* inside the digits of an exponent */
    IN_EXP_SIGN,                /* after the '+' or '-' of an exponent */
    IN_EXP_E,                   /* after the 'e' or 'E' of an exponent */
    IN_MANTISSA,                /* after the '.', a digit must follow */
    IN_MANTISSA_DIGITS,         /* inside the digits after the '.' */
    IN_DIGITS,                  /* inside integer digits starting with 1-9 */
    IN_SIGN,                    /* after a leading '-' */
    IN_KEYWORD,                 /* inside a run of [a-z] */
    IN_INTERP,                  /* after '%', inside [A-Za-z0-9]* */
    IN_WHITESPACE,              /* inside a run of whitespace */
    IN_START,                   /* between tokens, interpolation disabled */
    IN_START_INTERP,            /* must be IN_START + 1 */
};

/*
 * json_lexer[] entries hold either an IN_* state or a JSON_* token
 * type, so the two value ranges must not overlap.  JSON_MIN is defined
 * outside this file; presumably it is the smallest JSON_* token type
 * -- verify against its declaration.
 */
QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP);
/* The table initializes both start states with one [A ... B] range. */
QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);

/*
 * Table-row initializer: make every ASCII byte (0x00 .. 0x7F) lead to
 * STATE.  Designators that follow it in the same row override this
 * default for the bytes they name.  Uses the GNU range-designator
 * extension.
 */
#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro: such rows map byte 0 to their terminal,
   so comparing against column 0 identifies them.
   Both arguments are fully parenthesized so that expressions containing
   low-precedence operators can be passed safely; note that TERMINAL is
   still evaluated twice, so it must be free of side effects.  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
    ((terminal) != IN_ERROR && json_lexer[(old_state)][0] == (terminal))

/*
 * State transition table, indexed [current state][input byte].
 *
 * Each entry is either another IN_* lexer state to move to, or a JSON_*
 * token type signalling that a token is complete.  Entries that are not
 * listed are zero-initialized, which is IN_ERROR.
 *
 * Rows that begin with TERMINAL(T) finish their token with type T on any
 * ASCII byte they do not explicitly continue on; designators that follow
 * TERMINAL() in the same row override it for the bytes they name.  Bytes
 * 0x80 .. 0xFF are not covered by TERMINAL() and therefore yield
 * IN_ERROR in those rows.
 */
static const uint8_t json_lexer[][256] =  {
    /* Relies on default initialization to IN_ERROR! */

    /* double quote string */
    /* Any byte 0x20 .. 0xFD is accepted after a backslash. */
    [IN_DQ_STRING_ESCAPE] = {
        [0x20 ... 0xFD] = IN_DQ_STRING,
    },
    /* Control characters, 0xFE and 0xFF are errors inside a string. */
    [IN_DQ_STRING] = {
        [0x20 ... 0xFD] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING_ESCAPE,
        ['"'] = JSON_STRING,
    },

    /* single quote string */
    [IN_SQ_STRING_ESCAPE] = {
        [0x20 ... 0xFD] = IN_SQ_STRING,
    },
    [IN_SQ_STRING] = {
        [0x20 ... 0xFD] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING_ESCAPE,
        ['\''] = JSON_STRING,
    },

    /*
     * Zero: a leading zero must not be followed by further digits, but
     * RFC 8259 permits [ frac ] [ exp ] after it, e.g. "0.5" or "0e3".
     */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_ERROR,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    [IN_EXP_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_EXP_DIGITS,
    },

    /* A sign must be followed by at least one exponent digit. */
    [IN_EXP_SIGN] = {
        ['0' ... '9'] = IN_EXP_DIGITS,
    },

    /* After 'e' / 'E': optional sign, then at least one digit. */
    [IN_EXP_E] = {
        ['-'] = IN_EXP_SIGN,
        ['+'] = IN_EXP_SIGN,
        ['0' ... '9'] = IN_EXP_DIGITS,
    },

    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    /* After '.': at least one digit is required. */
    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    [IN_DIGITS] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    /* After '-': a digit is required. */
    [IN_SIGN] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_DIGITS,
    },

    /* keywords */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },

    /* interpolation */
    [IN_INTERP] = {
        TERMINAL(JSON_INTERP),
        ['A' ... 'Z'] = IN_INTERP,
        ['a' ... 'z'] = IN_INTERP,
        ['0' ... '9'] = IN_INTERP,
    },

    /*
     * Two start states:
     * - IN_START recognizes JSON tokens with our string extensions
     * - IN_START_INTERP additionally recognizes interpolation.
     * The structural characters are single-byte tokens and map straight
     * to their JSON_* token type.
     */
    [IN_START ... IN_START_INTERP] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_DIGITS,
        ['-'] = IN_SIGN,
        ['{'] = JSON_LCURLY,
        ['}'] = JSON_RCURLY,
        ['['] = JSON_LSQUARE,
        [']'] = JSON_RSQUARE,
        [','] = JSON_COMMA,
        [':'] = JSON_COLON,
        ['a' ... 'z'] = IN_KEYWORD,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
    [IN_START_INTERP]['%'] = IN_INTERP,
};

/*
 * Prepare @lexer for a fresh input stream.
 *
 * @lexer: lexer to initialize; all of its fields touched here are
 *         overwritten.
 * @enable_interpolation: selects IN_START_INTERP rather than IN_START as
 *         the state the lexer starts in and returns to between tokens.
 *
 * Allocates the token buffer; json_lexer_destroy() releases it.
 */
void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
{
    if (enable_interpolation) {
        lexer->state = IN_START_INTERP;
    } else {
        lexer->state = IN_START;
    }
    lexer->start_state = lexer->state;

    /* Small initial capacity; the buffer grows as bytes are appended. */
    lexer->token = g_string_sized_new(3);

    lexer->y = 0;
    lexer->x = 0;
}

/*
 * Advance the lexer by one input byte.
 *
 * @lexer: lexer whose state, token buffer and x/y counters are updated.
 * @ch:    the input byte.
 * @flush: when true, @ch is only a dummy used to push a pending token
 *         out: it is never appended to the token buffer and the state
 *         machine is stepped exactly once.
 *
 * x is incremented for every byte and reset to 0 on '\n'; y is
 * incremented on '\n'.
 *
 * When a table row ends its token via lookahead (see
 * TERMINAL_NEEDED_LOOKAHEAD), @ch is not part of that token, so the loop
 * runs again to feed the same byte to the start state.
 *
 * Completed tokens, and JSON_ERROR on invalid input, are handed to
 * json_message_process_token().
 */
static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
{
    int char_consumed, new_state;

    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        /* Valid row indices are 0 .. ARRAY_SIZE(json_lexer) - 1. */
        assert(lexer->state < ARRAY_SIZE(json_lexer));
        /* Index with an unsigned byte: plain char may be signed. */
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed && !flush) {
            g_string_append_c(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_LCURLY:
        case JSON_RCURLY:
        case JSON_LSQUARE:
        case JSON_RSQUARE:
        case JSON_COLON:
        case JSON_COMMA:
        case JSON_INTERP:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            json_message_process_token(lexer, lexer->token, new_state,
                                       lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            /* Token finished (or whitespace skipped): start over. */
            g_string_truncate(lexer->token, 0);
            new_state = lexer->start_state;
            break;
        case IN_ERROR:
            /* XXX: To avoid having previous bad input leaving the parser in an
             * unresponsive state where we consume unpredictable amounts of
             * subsequent "good" input, percolate this error state up to the
             * parser by emitting a JSON_ERROR token, then reset lexer state.
             *
             * Also note that this handling is required for reliable channel
             * negotiation between QMP and the guest agent, since chr(0xFF)
             * is placed at the beginning of certain events to ensure proper
             * delivery when the channel is in an unknown state. chr(0xFF) is
             * never a valid ASCII/UTF-8 sequence, so this should reliably
             * induce an error/flush state.
             */
            json_message_process_token(lexer, lexer->token, JSON_ERROR,
                                       lexer->x, lexer->y);
            g_string_truncate(lexer->token, 0);
            lexer->state = lexer->start_state;
            return;
        default:
            /* Still in the middle of a token. */
            break;
        }
        lexer->state = new_state;
    } while (!char_consumed && !flush);

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     *
     * NOTE(review): the type passed here is the current lexer state
     * (an IN_* value), not a JSON_* token type; presumably the receiver
     * rejects the oversized token on its own -- verify against
     * json_message_process_token().
     */
    if (lexer->token->len > MAX_TOKEN_SIZE) {
        json_message_process_token(lexer, lexer->token, lexer->state,
                                   lexer->x, lexer->y);
        g_string_truncate(lexer->token, 0);
        lexer->state = lexer->start_state;
    }
}

/*
 * Feed @size bytes starting at @buffer to @lexer, one byte at a time and
 * in order.  Nothing is read from @buffer when @size is 0.
 */
void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    const char *cursor = buffer;
    size_t remaining = size;

    while (remaining > 0) {
        json_lexer_feed_char(lexer, *cursor, false);
        cursor++;
        remaining--;
    }
}

/*
 * Force out whatever token is currently being accumulated.
 *
 * Does nothing when the lexer is already in its start state.  Otherwise
 * a dummy zero byte is fed in flush mode, which steps the state machine
 * once without adding the byte to the token.
 */
void json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == lexer->start_state) {
        return;
    }

    json_lexer_feed_char(lexer, 0, true);
}

/*
 * Release the token buffer owned by @lexer, including its character
 * data.  The JSONLexer object itself is not freed here.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    /* The second argument asks for the character data to be freed too. */
    (void)g_string_free(lexer->token, true);
}
Commit	Line	Data
5ab8558d AL	1	/*
	2	* JSON lexer
	3	*
	4	* Copyright IBM, Corp. 2009
	5	*
	6	* Authors:
	7	* Anthony Liguori <aliguori@us.ibm.com>
	8	*
	9	* This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
	10	* See the COPYING.LIB file in the top-level directory.
	11	*
	12	*/
	13
f2ad72b3	14	#include "qemu/osdep.h"
5ab8558d	15	#include "qemu-common.h"
7b1b5d19	16	#include "qapi/qmp/json-lexer.h"
037f2440	17	#include "qapi/qmp/json-streamer.h"
5ab8558d	18
325601b4 AL	19	#define MAX_TOKEN_SIZE (64ULL << 20)
325601b4 AL	20
5ab8558d	21	/*
eddc0a7f MA	22	* From RFC 8259 "The JavaScript Object Notation (JSON) Data
eddc0a7f MA	23	* Interchange Format", with [comments in brackets]:
ff5394ad	24	*
eddc0a7f MA	25	* The set of tokens includes six structural characters, strings,
eddc0a7f MA	26	* numbers, and three literal names.
ff5394ad	27	*
eddc0a7f	28	* These are the six structural characters:
ff5394ad	29	*
eddc0a7f MA	30	* begin-array = ws %x5B ws ; [ left square bracket
	31	* begin-object = ws %x7B ws ; { left curly bracket
	32	* end-array = ws %x5D ws ; ] right square bracket
	33	* end-object = ws %x7D ws ; } right curly bracket
	34	* name-separator = ws %x3A ws ; : colon
	35	* value-separator = ws %x2C ws ; , comma
ff5394ad	36	*
eddc0a7f MA	37	* Insignificant whitespace is allowed before or after any of the six
	38	* structural characters.
	39	* [This lexer accepts it before or after any token, which is actually
	40	* the same, as the grammar always has structural characters between
	41	* other tokens.]
ff5394ad	42	*
eddc0a7f MA	43	* ws = *(
	44	* %x20 / ; Space
	45	* %x09 / ; Horizontal tab
	46	* %x0A / ; Line feed or New line
	47	* %x0D ) ; Carriage return
5ab8558d	48	*
eddc0a7f MA	49	* [...] three literal names:
	50	* false null true
	51	* [This lexer accepts [a-z]+, and leaves rejecting unknown literal
	52	* names to the parser.]
	53	*
	54	* [Numbers:]
	55	*
	56	* number = [ minus ] int [ frac ] [ exp ]
	57	* decimal-point = %x2E ; .
	58	* digit1-9 = %x31-39 ; 1-9
	59	* e = %x65 / %x45 ; e E
	60	* exp = e [ minus / plus ] 1*DIGIT
	61	* frac = decimal-point 1*DIGIT
	62	* int = zero / ( digit1-9 *DIGIT )
	63	* minus = %x2D ; -
	64	* plus = %x2B ; +
	65	* zero = %x30 ; 0
	66	*
	67	* [Strings:]
	68	* string = quotation-mark *char quotation-mark
	69	*
	70	* char = unescaped /
	71	* escape (
	72	* %x22 / ; " quotation mark U+0022
	73	* %x5C / ; \ reverse solidus U+005C
	74	* %x2F / ; / solidus U+002F
	75	* %x62 / ; b backspace U+0008
	76	* %x66 / ; f form feed U+000C
	77	* %x6E / ; n line feed U+000A
	78	* %x72 / ; r carriage return U+000D
	79	* %x74 / ; t tab U+0009
	80	* %x75 4HEXDIG ) ; uXXXX U+XXXX
	81	* escape = %x5C ; \
	82	* quotation-mark = %x22 ; "
	83	* unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
b2da4a4d MA	84	* [This lexer accepts any non-control character after escape, and
b2da4a4d MA	85	* leaves rejecting invalid ones to the parser.]
eddc0a7f MA	86	*
	87	*
	88	* Extensions over RFC 8259:
	89	* - Extra escape sequence in strings:
	90	* 0x27 (apostrophe) is recognized after escape, too
	91	* - Single-quoted strings:
	92	* Like double-quoted strings, except they're delimited by %x27
	93	* (apostrophe) instead of %x22 (quotation mark), and can't contain
	94	* unescaped apostrophe, but can contain unescaped quotation mark.
2cbd15aa	95	* - Interpolation, if enabled:
f7617d45 MA	96	* The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid
f7617d45 MA	97	* ones to the parser.
eddc0a7f MA	98	*
eddc0a7f MA	99	* Note:
4b1c0cd7	100	* - Input must be encoded in modified UTF-8.
eddc0a7f	101	* - Decoding and validating is left to the parser.
5ab8558d AL	102	*/
	103
	104	enum json_lexer_state {
b8d3b1da	105	IN_ERROR = 0, /* must really be 0, see json_lexer[] */
5ab8558d AL	106	IN_DQ_STRING_ESCAPE,
5ab8558d AL	107	IN_DQ_STRING,
5ab8558d AL	108	IN_SQ_STRING_ESCAPE,
	109	IN_SQ_STRING,
	110	IN_ZERO,
4d400661 MA	111	IN_EXP_DIGITS,
4d400661 MA	112	IN_EXP_SIGN,
5ab8558d AL	113	IN_EXP_E,
	114	IN_MANTISSA,
	115	IN_MANTISSA_DIGITS,
4d400661 MA	116	IN_DIGITS,
4d400661 MA	117	IN_SIGN,
5ab8558d	118	IN_KEYWORD,
61030280	119	IN_INTERP,
5ab8558d	120	IN_WHITESPACE,
5ab8558d	121	IN_START,
2cbd15aa	122	IN_START_INTERP, /* must be IN_START + 1 */
5ab8558d AL	123	};
5ab8558d AL	124
2cbd15aa MA	125	QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP);
2cbd15aa MA	126	QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
b8d3b1da	127
5ab8558d AL	128	#define TERMINAL(state) [0 ... 0x7F] = (state)
5ab8558d AL	129
f7c05274 PB	130	/* Return whether TERMINAL is a terminal state and the transition to it
	131	from OLD_STATE required lookahead. This happens whenever the table
	132	below uses the TERMINAL macro. */
	133	#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
a2ec6be7	134	(terminal != IN_ERROR && json_lexer[(old_state)][0] == (terminal))
f7c05274	135
5ab8558d	136	static const uint8_t json_lexer[][256] = {
b8d3b1da MA	137	/* Relies on default initialization to IN_ERROR! */
b8d3b1da MA	138
5ab8558d	139	/* double quote string */
5ab8558d	140	[IN_DQ_STRING_ESCAPE] = {
b2da4a4d	141	[0x20 ... 0xFD] = IN_DQ_STRING,
5ab8558d AL	142	},
5ab8558d AL	143	[IN_DQ_STRING] = {
de930f45	144	[0x20 ... 0xFD] = IN_DQ_STRING,
5ab8558d	145	['\\'] = IN_DQ_STRING_ESCAPE,
28e91a68	146	['"'] = JSON_STRING,
5ab8558d AL	147	},
	148
	149	/* single quote string */
5ab8558d	150	[IN_SQ_STRING_ESCAPE] = {
b2da4a4d	151	[0x20 ... 0xFD] = IN_SQ_STRING,
5ab8558d AL	152	},
5ab8558d AL	153	[IN_SQ_STRING] = {
de930f45	154	[0x20 ... 0xFD] = IN_SQ_STRING,
5ab8558d	155	['\\'] = IN_SQ_STRING_ESCAPE,
28e91a68	156	['\''] = JSON_STRING,
5ab8558d AL	157	},
	158
	159	/* Zero */
	160	[IN_ZERO] = {
	161	TERMINAL(JSON_INTEGER),
33d05394	162	['0' ... '9'] = IN_ERROR,
5ab8558d AL	163	['.'] = IN_MANTISSA,
	164	},
	165
	166	/* Float */
4d400661	167	[IN_EXP_DIGITS] = {
5ab8558d	168	TERMINAL(JSON_FLOAT),
4d400661	169	['0' ... '9'] = IN_EXP_DIGITS,
5ab8558d AL	170	},
5ab8558d AL	171
4d400661 MA	172	[IN_EXP_SIGN] = {
4d400661 MA	173	['0' ... '9'] = IN_EXP_DIGITS,
5ab8558d AL	174	},
	175
	176	[IN_EXP_E] = {
4d400661 MA	177	['-'] = IN_EXP_SIGN,
	178	['+'] = IN_EXP_SIGN,
	179	['0' ... '9'] = IN_EXP_DIGITS,
5ab8558d AL	180	},
	181
	182	[IN_MANTISSA_DIGITS] = {
	183	TERMINAL(JSON_FLOAT),
	184	['0' ... '9'] = IN_MANTISSA_DIGITS,
	185	['e'] = IN_EXP_E,
	186	['E'] = IN_EXP_E,
	187	},
	188
	189	[IN_MANTISSA] = {
	190	['0' ... '9'] = IN_MANTISSA_DIGITS,
	191	},
	192
	193	/* Number */
4d400661	194	[IN_DIGITS] = {
5ab8558d	195	TERMINAL(JSON_INTEGER),
4d400661	196	['0' ... '9'] = IN_DIGITS,
5ab8558d AL	197	['e'] = IN_EXP_E,
	198	['E'] = IN_EXP_E,
	199	['.'] = IN_MANTISSA,
	200	},
	201
4d400661	202	[IN_SIGN] = {
5ab8558d	203	['0'] = IN_ZERO,
4d400661	204	['1' ... '9'] = IN_DIGITS,
5ab8558d AL	205	},
	206
	207	/* keywords */
	208	[IN_KEYWORD] = {
	209	TERMINAL(JSON_KEYWORD),
	210	['a' ... 'z'] = IN_KEYWORD,
	211	},
	212
	213	/* whitespace */
	214	[IN_WHITESPACE] = {
	215	TERMINAL(JSON_SKIP),
	216	[' '] = IN_WHITESPACE,
	217	['\t'] = IN_WHITESPACE,
	218	['\r'] = IN_WHITESPACE,
	219	['\n'] = IN_WHITESPACE,
ff5394ad	220	},
5ab8558d	221
61030280	222	/* interpolation */
61030280	223	[IN_INTERP] = {
f7617d45 MA	224	TERMINAL(JSON_INTERP),
	225	['A' ... 'Z'] = IN_INTERP,
	226	['a' ... 'z'] = IN_INTERP,
	227	['0' ... '9'] = IN_INTERP,
5ab8558d AL	228	},
5ab8558d AL	229
2cbd15aa MA	230	/*
	231	* Two start states:
	232	* - IN_START recognizes JSON tokens with our string extensions
	233	* - IN_START_INTERP additionally recognizes interpolation.
	234	*/
	235	[IN_START ... IN_START_INTERP] = {
5ab8558d AL	236	['"'] = IN_DQ_STRING,
	237	['\''] = IN_SQ_STRING,
	238	['0'] = IN_ZERO,
4d400661 MA	239	['1' ... '9'] = IN_DIGITS,
4d400661 MA	240	['-'] = IN_SIGN,
c5461660 MA	241	['{'] = JSON_LCURLY,
	242	['}'] = JSON_RCURLY,
	243	['['] = JSON_LSQUARE,
	244	[']'] = JSON_RSQUARE,
	245	[','] = JSON_COMMA,
	246	[':'] = JSON_COLON,
5ab8558d	247	['a' ... 'z'] = IN_KEYWORD,
5ab8558d AL	248	[' '] = IN_WHITESPACE,
	249	['\t'] = IN_WHITESPACE,
	250	['\r'] = IN_WHITESPACE,
	251	['\n'] = IN_WHITESPACE,
	252	},
2cbd15aa	253	[IN_START_INTERP]['%'] = IN_INTERP,
5ab8558d AL	254	};
5ab8558d AL	255
2cbd15aa	256	void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
5ab8558d	257	{
2cbd15aa MA	258	lexer->start_state = lexer->state = enable_interpolation
2cbd15aa MA	259	? IN_START_INTERP : IN_START;
d2ca7c0b	260	lexer->token = g_string_sized_new(3);
03308f6c	261	lexer->x = lexer->y = 0;
5ab8558d AL	262	}
5ab8558d AL	263
7c1e1d54	264	static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
5ab8558d	265	{
f7c05274 PB	266	int char_consumed, new_state;
f7c05274 PB	267
5ab8558d AL	268	lexer->x++;
	269	if (ch == '\n') {
	270	lexer->x = 0;
	271	lexer->y++;
	272	}
	273
f7c05274	274	do {
b8d3b1da	275	assert(lexer->state <= ARRAY_SIZE(json_lexer));
f7c05274 PB	276	new_state = json_lexer[lexer->state][(uint8_t)ch];
f7c05274 PB	277	char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
a2ec6be7	278	if (char_consumed && !flush) {
d2ca7c0b	279	g_string_append_c(lexer->token, ch);
f7c05274	280	}
5ab8558d	281
f7c05274	282	switch (new_state) {
c5461660 MA	283	case JSON_LCURLY:
	284	case JSON_RCURLY:
	285	case JSON_LSQUARE:
	286	case JSON_RSQUARE:
	287	case JSON_COLON:
	288	case JSON_COMMA:
61030280	289	case JSON_INTERP:
f7c05274 PB	290	case JSON_INTEGER:
	291	case JSON_FLOAT:
	292	case JSON_KEYWORD:
	293	case JSON_STRING:
037f2440 MA	294	json_message_process_token(lexer, lexer->token, new_state,
037f2440 MA	295	lexer->x, lexer->y);
0b0404bf	296	/* fall through */
f7c05274	297	case JSON_SKIP:
d2ca7c0b	298	g_string_truncate(lexer->token, 0);
2cbd15aa	299	new_state = lexer->start_state;
f7c05274	300	break;
33d05394	301	case IN_ERROR:
b011f619 MR	302	/* XXX: To avoid having previous bad input leaving the parser in an
	303	* unresponsive state where we consume unpredictable amounts of
	304	* subsequent "good" input, percolate this error state up to the
84a56f38	305	* parser by emitting a JSON_ERROR token, then reset lexer state.
b011f619 MR	306	*
	307	* Also note that this handling is required for reliable channel
	308	* negotiation between QMP and the guest agent, since chr(0xFF)
	309	* is placed at the beginning of certain events to ensure proper
	310	* delivery when the channel is in an unknown state. chr(0xFF) is
	311	* never a valid ASCII/UTF-8 sequence, so this should reliably
	312	* induce an error/flush state.
	313	*/
037f2440 MA	314	json_message_process_token(lexer, lexer->token, JSON_ERROR,
037f2440 MA	315	lexer->x, lexer->y);
d2ca7c0b	316	g_string_truncate(lexer->token, 0);
2cbd15aa	317	lexer->state = lexer->start_state;
7c1e1d54	318	return;
f7c05274 PB	319	default:
	320	break;
	321	}
	322	lexer->state = new_state;
bd3924a3	323	} while (!char_consumed && !flush);
325601b4 AL	324
	325	/* Do not let a single token grow to an arbitrarily large size,
	326	* this is a security consideration.
	327	*/
d2ca7c0b	328	if (lexer->token->len > MAX_TOKEN_SIZE) {
037f2440 MA	329	json_message_process_token(lexer, lexer->token, lexer->state,
037f2440 MA	330	lexer->x, lexer->y);
d2ca7c0b	331	g_string_truncate(lexer->token, 0);
2cbd15aa	332	lexer->state = lexer->start_state;
325601b4	333	}
5ab8558d AL	334	}
5ab8558d AL	335
7c1e1d54	336	void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
5ab8558d AL	337	{
	338	size_t i;
	339
	340	for (i = 0; i < size; i++) {
7c1e1d54	341	json_lexer_feed_char(lexer, buffer[i], false);
5ab8558d	342	}
5ab8558d AL	343	}
5ab8558d AL	344
7c1e1d54	345	void json_lexer_flush(JSONLexer *lexer)
5ab8558d	346	{
2cbd15aa	347	if (lexer->state != lexer->start_state) {
7c1e1d54 MAL	348	json_lexer_feed_char(lexer, 0, true);
7c1e1d54 MAL	349	}
5ab8558d AL	350	}
	351
	352	void json_lexer_destroy(JSONLexer *lexer)
	353	{
d2ca7c0b	354	g_string_free(lexer->token, true);
5ab8558d	355	}