[mirror_qemu.git] / qobject / json-lexer.c

/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qapi/qmp/json-lexer.h"

#define MAX_TOKEN_SIZE (64ULL << 20)

/*
 * From RFC 8259 "The JavaScript Object Notation (JSON) Data
 * Interchange Format", with [comments in brackets]:
 *
 * The set of tokens includes six structural characters, strings,
 * numbers, and three literal names.
 *
 * These are the six structural characters:
 *
 *    begin-array     = ws %x5B ws  ; [ left square bracket
 *    begin-object    = ws %x7B ws  ; { left curly bracket
 *    end-array       = ws %x5D ws  ; ] right square bracket
 *    end-object      = ws %x7D ws  ; } right curly bracket
 *    name-separator  = ws %x3A ws  ; : colon
 *    value-separator = ws %x2C ws  ; , comma
 *
 * Insignificant whitespace is allowed before or after any of the six
 * structural characters.
 * [This lexer accepts it before or after any token, which is actually
 * the same, as the grammar always has structural characters between
 * other tokens.]
 *
 *    ws = *(
 *           %x20 /              ; Space
 *           %x09 /              ; Horizontal tab
 *           %x0A /              ; Line feed or New line
 *           %x0D )              ; Carriage return
 *
 * [...] three literal names:
 *    false null true
 *  [This lexer accepts [a-z]+, and leaves rejecting unknown literal
 *  names to the parser.]
 *
 * [Numbers:]
 *
 *    number = [ minus ] int [ frac ] [ exp ]
 *    decimal-point = %x2E       ; .
 *    digit1-9 = %x31-39         ; 1-9
 *    e = %x65 / %x45            ; e E
 *    exp = e [ minus / plus ] 1*DIGIT
 *    frac = decimal-point 1*DIGIT
 *    int = zero / ( digit1-9 *DIGIT )
 *    minus = %x2D               ; -
 *    plus = %x2B                ; +
 *    zero = %x30                ; 0
 *
 * [Strings:]
 *    string = quotation-mark *char quotation-mark
 *
 *    char = unescaped /
 *        escape (
 *            %x22 /          ; "    quotation mark  U+0022
 *            %x5C /          ; \    reverse solidus U+005C
 *            %x2F /          ; /    solidus         U+002F
 *            %x62 /          ; b    backspace       U+0008
 *            %x66 /          ; f    form feed       U+000C
 *            %x6E /          ; n    line feed       U+000A
 *            %x72 /          ; r    carriage return U+000D
 *            %x74 /          ; t    tab             U+0009
 *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
 *    escape = %x5C              ; \
 *    quotation-mark = %x22      ; "
 *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
 *    [This lexer accepts any non-control character after escape, and
 *    leaves rejecting invalid ones to the parser.]
 *
 *
 * Extensions over RFC 8259:
 * - Extra escape sequence in strings:
 *   0x27 (apostrophe) is recognized after escape, too
 * - Single-quoted strings:
 *   Like double-quoted strings, except they're delimited by %x27
 *   (apostrophe) instead of %x22 (quotation mark), and can't contain
 *   unescaped apostrophe, but can contain unescaped quotation mark.
 * - Interpolation:
 *   interpolation = %((l|ll|I64)[du]|[ipsf])
 *
 * Note:
 * - Input must be encoded in modified UTF-8.
 * - Decoding and validating is left to the parser.
 */

/*
 * States of the lexer's state machine.  Each value selects one row of
 * json_lexer[] below.  IN_START is the last enumerator and the highest
 * row designator used in that table.
 */
enum json_lexer_state {
    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
    IN_DQ_STRING_ESCAPE,        /* in "..." string, right after a backslash */
    IN_DQ_STRING,               /* inside a "..." string */
    IN_SQ_STRING_ESCAPE,        /* in '...' string, right after a backslash */
    IN_SQ_STRING,               /* inside a '...' string */
    IN_ZERO,                    /* integer part is a single '0' */
    IN_DIGITS,                  /* in the digits of an exponent */
    IN_DIGIT,                   /* after the exponent's sign, digit required */
    IN_EXP_E,                   /* after 'e' or 'E', sign or digit required */
    IN_MANTISSA,                /* after '.', digit required */
    IN_MANTISSA_DIGITS,         /* in the digits following '.' */
    IN_NONZERO_NUMBER,          /* in an integer part that began with 1-9 */
    IN_NEG_NONZERO_NUMBER,      /* after '-', digit (including '0') required */
    IN_KEYWORD,                 /* in a run of [a-z] */
    IN_ESCAPE,                  /* after '%' (interpolation) */
    IN_ESCAPE_L,                /* after "%l" */
    IN_ESCAPE_LL,               /* after "%ll" */
    IN_ESCAPE_I,                /* after "%I" */
    IN_ESCAPE_I6,               /* after "%I6" */
    IN_ESCAPE_I64,              /* after "%I64" */
    IN_WHITESPACE,              /* in a run of whitespace */
    IN_START,                   /* between tokens; initial state */
};

/*
 * json_lexer[] entries hold either an IN_* state or a JSON_* token type
 * in the same uint8_t, so the two value ranges must not overlap.
 * JSON_MIN is declared outside this file; presumably it is the smallest
 * token type value, and the build is meant to fail unless it lies above
 * IN_START -- confirm against json-lexer.h.
 */
QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);

/* Table-row helper: map every 7-bit input byte (0x00 ... 0x7F) to STATE.
   Designators that follow it in the same row override individual bytes.  */
#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro: such rows hold the terminal in column 0,
   which is what this macro tests.
   Both arguments are parenthesized so that expression arguments expand
   correctly.  TERMINAL is evaluated twice; pass a side-effect-free
   expression.  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
    ((terminal) != IN_ERROR && json_lexer[(old_state)][0] == (terminal))

/*
 * State transition table, indexed as json_lexer[current state][input byte].
 * Each entry is either the next lexer state (an IN_* value) or the type of
 * a completed token (a JSON_* value).
 *
 * Within a row, a later designator overrides an earlier one that covers
 * the same byte, so the order of the initializers inside a row matters:
 * the broad range (or TERMINAL()) comes first, the exceptions after it.
 * Rows built with TERMINAL() only cover bytes 0x00 ... 0x7F; bytes
 * 0x80 ... 0xFF stay IN_ERROR there unless set explicitly.
 */
static const uint8_t json_lexer[][256] =  {
    /* Relies on default initialization to IN_ERROR! */

    /* double quote string */
    /* After a backslash any byte 0x20 ... 0xFD returns to the string. */
    [IN_DQ_STRING_ESCAPE] = {
        [0x20 ... 0xFD] = IN_DQ_STRING,
    },
    /* Bytes below 0x20 and the bytes 0xFE, 0xFF remain IN_ERROR. */
    [IN_DQ_STRING] = {
        [0x20 ... 0xFD] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING_ESCAPE,
        ['"'] = JSON_STRING,
    },

    /* single quote string */
    /* Same shape as the double quote rows, with '\'' as the delimiter. */
    [IN_SQ_STRING_ESCAPE] = {
        [0x20 ... 0xFD] = IN_SQ_STRING,
    },
    [IN_SQ_STRING] = {
        [0x20 ... 0xFD] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING_ESCAPE,
        ['\''] = JSON_STRING,
    },

    /* Zero */
    /*
     * A digit right after the leading zero is an error; '.' starts the
     * fraction; any other 7-bit byte ends the token as JSON_INTEGER.
     * NOTE(review): 'e' and 'E' are not special here, so input such as
     * "0e1" ends the integer "0" and then starts a keyword at 'e'.  The
     * grammar quoted at the top of the file allows an exponent after
     * zero -- confirm whether this is intended.
     */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_ERROR,
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    /* Exponent digits; entered from IN_EXP_E or IN_DIGIT. */
    [IN_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_DIGITS,
    },

    /* After the exponent's sign: at least one digit must follow. */
    [IN_DIGIT] = {
        ['0' ... '9'] = IN_DIGITS,
    },

    /* After 'e' / 'E': optional sign, then digits. */
    [IN_EXP_E] = {
        ['-'] = IN_DIGIT,
        ['+'] = IN_DIGIT,
        ['0' ... '9'] = IN_DIGITS,
    },

    /* Digits of the fraction; may continue into an exponent. */
    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    /* After '.': at least one digit must follow. */
    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    /* Integer part that began with 1-9. */
    [IN_NONZERO_NUMBER] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_NONZERO_NUMBER,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    /* After '-': a digit must follow; '0' is accepted too. */
    [IN_NEG_NONZERO_NUMBER] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
    },

    /* keywords */
    /* Any run of [a-z]; the first other 7-bit byte ends the token. */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    /* A run of whitespace ends as JSON_SKIP at the first other 7-bit byte. */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },

    /* escape */
    /* Interpolation: %((l|ll|I64)[du]|[ipsf]) plus plain %d and %u. */
    [IN_ESCAPE_LL] = {
        ['d'] = JSON_ESCAPE,
        ['u'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_L] = {
        ['d'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_LL,
        ['u'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_I64] = {
        ['d'] = JSON_ESCAPE,
        ['u'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_I6] = {
        ['4'] = IN_ESCAPE_I64,
    },

    [IN_ESCAPE_I] = {
        ['6'] = IN_ESCAPE_I6,
    },

    [IN_ESCAPE] = {
        ['d'] = JSON_ESCAPE,
        ['i'] = JSON_ESCAPE,
        ['p'] = JSON_ESCAPE,
        ['s'] = JSON_ESCAPE,
        ['u'] = JSON_ESCAPE,
        ['f'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_L,
        ['I'] = IN_ESCAPE_I,
    },

    /* top level rule */
    /*
     * Dispatch on the first byte of a token.  The six structural
     * characters map straight to their token type; everything not
     * listed is IN_ERROR.
     */
    [IN_START] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
        ['-'] = IN_NEG_NONZERO_NUMBER,
        ['{'] = JSON_LCURLY,
        ['}'] = JSON_RCURLY,
        ['['] = JSON_LSQUARE,
        [']'] = JSON_RSQUARE,
        [','] = JSON_COMMA,
        [':'] = JSON_COLON,
        ['a' ... 'z'] = IN_KEYWORD,
        ['%'] = IN_ESCAPE,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
};

/*
 * Prepare LEXER for use: remember the token callback FUNC, allocate an
 * empty token buffer, reset the position counters and enter the start
 * state.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    /* Position counters start at the origin. */
    lexer->y = 0;
    lexer->x = 0;

    /* Small initial capacity; the buffer grows as bytes are appended. */
    lexer->token = g_string_sized_new(3);

    lexer->state = IN_START;
    lexer->emit = func;
}

/*
 * Advance the lexer by one input byte.
 *
 * @lexer: lexer state; its position counters, token buffer and state are
 *         updated in place
 * @ch:    the input byte
 * @flush: when true, @ch is not real input (json_lexer_flush() passes 0):
 *         it is never appended to the token and the state machine is
 *         stepped exactly once
 *
 * Looks up the transition for (current state, @ch) in json_lexer[].  If a
 * token is completed, lexer->emit() is called with the accumulated text
 * and the token type, and the lexer returns to IN_START.  When the token
 * was terminated by lookahead (rows built with TERMINAL()), @ch is not
 * part of that token and is fed again from IN_START, unless @flush.
 * A transition to IN_ERROR emits JSON_ERROR and resets the lexer.
 */
static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
{
    int char_consumed, new_state;

    /* Track the position; a newline starts the next line at column 0. */
    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        /* lexer->state selects a row of json_lexer[], so it must be
         * strictly below the number of rows. */
        assert(lexer->state < ARRAY_SIZE(json_lexer));
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed && !flush) {
            g_string_append_c(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_LCURLY:
        case JSON_RCURLY:
        case JSON_LSQUARE:
        case JSON_RSQUARE:
        case JSON_COLON:
        case JSON_COMMA:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            /* Token finished (or whitespace dropped): start afresh. */
            g_string_truncate(lexer->token, 0);
            new_state = IN_START;
            break;
        case IN_ERROR:
            /* XXX: To avoid having previous bad input leaving the parser in an
             * unresponsive state where we consume unpredictable amounts of
             * subsequent "good" input, percolate this error state up to the
             * tokenizer/parser by forcing a NULL object to be emitted, then
             * reset state.
             *
             * Also note that this handling is required for reliable channel
             * negotiation between QMP and the guest agent, since chr(0xFF)
             * is placed at the beginning of certain events to ensure proper
             * delivery when the channel is in an unknown state. chr(0xFF) is
             * never a valid ASCII/UTF-8 sequence, so this should reliably
             * induce an error/flush state.
             */
            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
            g_string_truncate(lexer->token, 0);
            new_state = IN_START;
            lexer->state = new_state;
            return;
        default:
            /* Still inside a token: just move to the next state. */
            break;
        }
        lexer->state = new_state;
        /* Re-run on the same byte only if it was pure lookahead. */
    } while (!char_consumed && !flush);

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     *
     * NOTE(review): the type passed to emit() here is the current lexer
     * state (an IN_* value), not a JSON_* token type; presumably the
     * emitter treats that as an invalid token -- confirm.
     */
    if (lexer->token->len > MAX_TOKEN_SIZE) {
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        g_string_truncate(lexer->token, 0);
        lexer->state = IN_START;
    }
}

/*
 * Feed SIZE bytes starting at BUFFER to LEXER, one byte at a time and in
 * order.  Nothing is read from BUFFER when SIZE is zero.
 */
void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    const char *cursor = buffer;
    size_t remaining = size;

    while (remaining > 0) {
        json_lexer_feed_char(lexer, *cursor, false);
        cursor++;
        remaining--;
    }
}

/*
 * Signal end of input to LEXER.  If a token is in progress, step the
 * state machine once in flush mode with a zero byte so the pending token
 * (or an error) is emitted; between tokens there is nothing to do.
 */
void json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == IN_START) {
        return;
    }

    json_lexer_feed_char(lexer, 0, true);
}

/*
 * Release the token buffer owned by LEXER, including its character data.
 * LEXER itself is not freed.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    GString *pending = lexer->token;

    g_string_free(pending, true);
}
Commit	Line	Data
5ab8558d AL	1	/*
	2	* JSON lexer
	3	*
	4	* Copyright IBM, Corp. 2009
	5	*
	6	* Authors:
	7	* Anthony Liguori <aliguori@us.ibm.com>
	8	*
	9	* This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
	10	* See the COPYING.LIB file in the top-level directory.
	11	*
	12	*/
	13
f2ad72b3	14	#include "qemu/osdep.h"
5ab8558d	15	#include "qemu-common.h"
7b1b5d19	16	#include "qapi/qmp/json-lexer.h"
5ab8558d	17
325601b4 AL	18	#define MAX_TOKEN_SIZE (64ULL << 20)
325601b4 AL	19
5ab8558d	20	/*
eddc0a7f MA	21	* From RFC 8259 "The JavaScript Object Notation (JSON) Data
eddc0a7f MA	22	* Interchange Format", with [comments in brackets]:
ff5394ad	23	*
eddc0a7f MA	24	* The set of tokens includes six structural characters, strings,
eddc0a7f MA	25	* numbers, and three literal names.
ff5394ad	26	*
eddc0a7f	27	* These are the six structural characters:
ff5394ad	28	*
eddc0a7f MA	29	* begin-array = ws %x5B ws ; [ left square bracket
	30	* begin-object = ws %x7B ws ; { left curly bracket
	31	* end-array = ws %x5D ws ; ] right square bracket
	32	* end-object = ws %x7D ws ; } right curly bracket
	33	* name-separator = ws %x3A ws ; : colon
	34	* value-separator = ws %x2C ws ; , comma
ff5394ad	35	*
eddc0a7f MA	36	* Insignificant whitespace is allowed before or after any of the six
	37	* structural characters.
	38	* [This lexer accepts it before or after any token, which is actually
	39	* the same, as the grammar always has structural characters between
	40	* other tokens.]
ff5394ad	41	*
eddc0a7f MA	42	* ws = *(
	43	* %x20 / ; Space
	44	* %x09 / ; Horizontal tab
	45	* %x0A / ; Line feed or New line
	46	* %x0D ) ; Carriage return
5ab8558d	47	*
eddc0a7f MA	48	* [...] three literal names:
	49	* false null true
	50	* [This lexer accepts [a-z]+, and leaves rejecting unknown literal
	51	* names to the parser.]
	52	*
	53	* [Numbers:]
	54	*
	55	* number = [ minus ] int [ frac ] [ exp ]
	56	* decimal-point = %x2E ; .
	57	* digit1-9 = %x31-39 ; 1-9
	58	* e = %x65 / %x45 ; e E
	59	* exp = e [ minus / plus ] 1*DIGIT
	60	* frac = decimal-point 1*DIGIT
	61	* int = zero / ( digit1-9 *DIGIT )
	62	* minus = %x2D ; -
	63	* plus = %x2B ; +
	64	* zero = %x30 ; 0
	65	*
	66	* [Strings:]
	67	* string = quotation-mark *char quotation-mark
	68	*
	69	* char = unescaped /
	70	* escape (
	71	* %x22 / ; " quotation mark U+0022
	72	* %x5C / ; \ reverse solidus U+005C
	73	* %x2F / ; / solidus U+002F
	74	* %x62 / ; b backspace U+0008
	75	* %x66 / ; f form feed U+000C
	76	* %x6E / ; n line feed U+000A
	77	* %x72 / ; r carriage return U+000D
	78	* %x74 / ; t tab U+0009
	79	* %x75 4HEXDIG ) ; uXXXX U+XXXX
	80	* escape = %x5C ; \
	81	* quotation-mark = %x22 ; "
	82	* unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
b2da4a4d MA	83	* [This lexer accepts any non-control character after escape, and
b2da4a4d MA	84	* leaves rejecting invalid ones to the parser.]
eddc0a7f MA	85	*
	86	*
	87	* Extensions over RFC 8259:
	88	* - Extra escape sequence in strings:
	89	* 0x27 (apostrophe) is recognized after escape, too
	90	* - Single-quoted strings:
	91	* Like double-quoted strings, except they're delimited by %x27
	92	* (apostrophe) instead of %x22 (quotation mark), and can't contain
	93	* unescaped apostrophe, but can contain unescaped quotation mark.
	94	* - Interpolation:
	95	* interpolation = %((l|ll|I64)[du]|[ipsf])
	96	*
	97	* Note:
4b1c0cd7	98	* - Input must be encoded in modified UTF-8.
eddc0a7f	99	* - Decoding and validating is left to the parser.
5ab8558d AL	100	*/
	101
	102	enum json_lexer_state {
b8d3b1da	103	IN_ERROR = 0, /* must really be 0, see json_lexer[] */
5ab8558d AL	104	IN_DQ_STRING_ESCAPE,
5ab8558d AL	105	IN_DQ_STRING,
5ab8558d AL	106	IN_SQ_STRING_ESCAPE,
	107	IN_SQ_STRING,
	108	IN_ZERO,
	109	IN_DIGITS,
	110	IN_DIGIT,
	111	IN_EXP_E,
	112	IN_MANTISSA,
	113	IN_MANTISSA_DIGITS,
	114	IN_NONZERO_NUMBER,
	115	IN_NEG_NONZERO_NUMBER,
	116	IN_KEYWORD,
	117	IN_ESCAPE,
	118	IN_ESCAPE_L,
	119	IN_ESCAPE_LL,
2c0d4b36 RT	120	IN_ESCAPE_I,
	121	IN_ESCAPE_I6,
	122	IN_ESCAPE_I64,
5ab8558d	123	IN_WHITESPACE,
5ab8558d AL	124	IN_START,
	125	};
	126
b8d3b1da MA	127	QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START);
b8d3b1da MA	128
5ab8558d AL	129	#define TERMINAL(state) [0 ... 0x7F] = (state)
5ab8558d AL	130
f7c05274 PB	131	/* Return whether TERMINAL is a terminal state and the transition to it
	132	from OLD_STATE required lookahead. This happens whenever the table
	133	below uses the TERMINAL macro. */
	134	#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
a2ec6be7	135	(terminal != IN_ERROR && json_lexer[(old_state)][0] == (terminal))
f7c05274	136
5ab8558d	137	static const uint8_t json_lexer[][256] = {
b8d3b1da MA	138	/* Relies on default initialization to IN_ERROR! */
b8d3b1da MA	139
5ab8558d	140	/* double quote string */
5ab8558d	141	[IN_DQ_STRING_ESCAPE] = {
b2da4a4d	142	[0x20 ... 0xFD] = IN_DQ_STRING,
5ab8558d AL	143	},
5ab8558d AL	144	[IN_DQ_STRING] = {
de930f45	145	[0x20 ... 0xFD] = IN_DQ_STRING,
5ab8558d	146	['\\'] = IN_DQ_STRING_ESCAPE,
28e91a68	147	['"'] = JSON_STRING,
5ab8558d AL	148	},
	149
	150	/* single quote string */
5ab8558d	151	[IN_SQ_STRING_ESCAPE] = {
b2da4a4d	152	[0x20 ... 0xFD] = IN_SQ_STRING,
5ab8558d AL	153	},
5ab8558d AL	154	[IN_SQ_STRING] = {
de930f45	155	[0x20 ... 0xFD] = IN_SQ_STRING,
5ab8558d	156	['\\'] = IN_SQ_STRING_ESCAPE,
28e91a68	157	['\''] = JSON_STRING,
5ab8558d AL	158	},
	159
	160	/* Zero */
	161	[IN_ZERO] = {
	162	TERMINAL(JSON_INTEGER),
33d05394	163	['0' ... '9'] = IN_ERROR,
5ab8558d AL	164	['.'] = IN_MANTISSA,
	165	},
	166
	167	/* Float */
	168	[IN_DIGITS] = {
	169	TERMINAL(JSON_FLOAT),
	170	['0' ... '9'] = IN_DIGITS,
	171	},
	172
	173	[IN_DIGIT] = {
	174	['0' ... '9'] = IN_DIGITS,
	175	},
	176
	177	[IN_EXP_E] = {
	178	['-'] = IN_DIGIT,
	179	['+'] = IN_DIGIT,
	180	['0' ... '9'] = IN_DIGITS,
	181	},
	182
	183	[IN_MANTISSA_DIGITS] = {
	184	TERMINAL(JSON_FLOAT),
	185	['0' ... '9'] = IN_MANTISSA_DIGITS,
	186	['e'] = IN_EXP_E,
	187	['E'] = IN_EXP_E,
	188	},
	189
	190	[IN_MANTISSA] = {
	191	['0' ... '9'] = IN_MANTISSA_DIGITS,
	192	},
	193
	194	/* Number */
	195	[IN_NONZERO_NUMBER] = {
	196	TERMINAL(JSON_INTEGER),
	197	['0' ... '9'] = IN_NONZERO_NUMBER,
	198	['e'] = IN_EXP_E,
	199	['E'] = IN_EXP_E,
	200	['.'] = IN_MANTISSA,
	201	},
	202
	203	[IN_NEG_NONZERO_NUMBER] = {
	204	['0'] = IN_ZERO,
	205	['1' ... '9'] = IN_NONZERO_NUMBER,
	206	},
	207
	208	/* keywords */
	209	[IN_KEYWORD] = {
	210	TERMINAL(JSON_KEYWORD),
	211	['a' ... 'z'] = IN_KEYWORD,
	212	},
	213
	214	/* whitespace */
	215	[IN_WHITESPACE] = {
	216	TERMINAL(JSON_SKIP),
	217	[' '] = IN_WHITESPACE,
	218	['\t'] = IN_WHITESPACE,
	219	['\r'] = IN_WHITESPACE,
	220	['\n'] = IN_WHITESPACE,
ff5394ad	221	},
5ab8558d	222
5ab8558d	223	/* escape */
5ab8558d	224	[IN_ESCAPE_LL] = {
28e91a68	225	['d'] = JSON_ESCAPE,
2bc7cfea	226	['u'] = JSON_ESCAPE,
5ab8558d AL	227	},
	228
	229	[IN_ESCAPE_L] = {
28e91a68	230	['d'] = JSON_ESCAPE,
5ab8558d	231	['l'] = IN_ESCAPE_LL,
2bc7cfea	232	['u'] = JSON_ESCAPE,
5ab8558d AL	233	},
5ab8558d AL	234
2c0d4b36	235	[IN_ESCAPE_I64] = {
28e91a68	236	['d'] = JSON_ESCAPE,
2bc7cfea	237	['u'] = JSON_ESCAPE,
2c0d4b36 RT	238	},
	239
	240	[IN_ESCAPE_I6] = {
	241	['4'] = IN_ESCAPE_I64,
	242	},
	243
	244	[IN_ESCAPE_I] = {
	245	['6'] = IN_ESCAPE_I6,
	246	},
	247
5ab8558d	248	[IN_ESCAPE] = {
28e91a68 PB	249	['d'] = JSON_ESCAPE,
	250	['i'] = JSON_ESCAPE,
	251	['p'] = JSON_ESCAPE,
	252	['s'] = JSON_ESCAPE,
2bc7cfea	253	['u'] = JSON_ESCAPE,
28e91a68	254	['f'] = JSON_ESCAPE,
5ab8558d	255	['l'] = IN_ESCAPE_L,
2c0d4b36	256	['I'] = IN_ESCAPE_I,
5ab8558d AL	257	},
	258
	259	/* top level rule */
	260	[IN_START] = {
	261	['"'] = IN_DQ_STRING,
	262	['\''] = IN_SQ_STRING,
	263	['0'] = IN_ZERO,
	264	['1' ... '9'] = IN_NONZERO_NUMBER,
	265	['-'] = IN_NEG_NONZERO_NUMBER,
c5461660 MA	266	['{'] = JSON_LCURLY,
	267	['}'] = JSON_RCURLY,
	268	['['] = JSON_LSQUARE,
	269	[']'] = JSON_RSQUARE,
	270	[','] = JSON_COMMA,
	271	[':'] = JSON_COLON,
5ab8558d AL	272	['a' ... 'z'] = IN_KEYWORD,
	273	['%'] = IN_ESCAPE,
	274	[' '] = IN_WHITESPACE,
	275	['\t'] = IN_WHITESPACE,
	276	['\r'] = IN_WHITESPACE,
	277	['\n'] = IN_WHITESPACE,
	278	},
	279	};
	280
	281	void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
	282	{
	283	lexer->emit = func;
	284	lexer->state = IN_START;
d2ca7c0b	285	lexer->token = g_string_sized_new(3);
03308f6c	286	lexer->x = lexer->y = 0;
5ab8558d AL	287	}
5ab8558d AL	288
7c1e1d54	289	static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
5ab8558d	290	{
f7c05274 PB	291	int char_consumed, new_state;
f7c05274 PB	292
5ab8558d AL	293	lexer->x++;
	294	if (ch == '\n') {
	295	lexer->x = 0;
	296	lexer->y++;
	297	}
	298
f7c05274	299	do {
b8d3b1da	300	assert(lexer->state <= ARRAY_SIZE(json_lexer));
f7c05274 PB	301	new_state = json_lexer[lexer->state][(uint8_t)ch];
f7c05274 PB	302	char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
a2ec6be7	303	if (char_consumed && !flush) {
d2ca7c0b	304	g_string_append_c(lexer->token, ch);
f7c05274	305	}
5ab8558d	306
f7c05274	307	switch (new_state) {
c5461660 MA	308	case JSON_LCURLY:
	309	case JSON_RCURLY:
	310	case JSON_LSQUARE:
	311	case JSON_RSQUARE:
	312	case JSON_COLON:
	313	case JSON_COMMA:
f7c05274 PB	314	case JSON_ESCAPE:
	315	case JSON_INTEGER:
	316	case JSON_FLOAT:
	317	case JSON_KEYWORD:
	318	case JSON_STRING:
	319	lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
0b0404bf	320	/* fall through */
f7c05274	321	case JSON_SKIP:
d2ca7c0b	322	g_string_truncate(lexer->token, 0);
f7c05274 PB	323	new_state = IN_START;
f7c05274 PB	324	break;
33d05394	325	case IN_ERROR:
b011f619 MR	326	/* XXX: To avoid having previous bad input leaving the parser in an
	327	* unresponsive state where we consume unpredictable amounts of
	328	* subsequent "good" input, percolate this error state up to the
	329	* tokenizer/parser by forcing a NULL object to be emitted, then
	330	* reset state.
	331	*
	332	* Also note that this handling is required for reliable channel
	333	* negotiation between QMP and the guest agent, since chr(0xFF)
	334	* is placed at the beginning of certain events to ensure proper
	335	* delivery when the channel is in an unknown state. chr(0xFF) is
	336	* never a valid ASCII/UTF-8 sequence, so this should reliably
	337	* induce an error/flush state.
	338	*/
	339	lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
d2ca7c0b	340	g_string_truncate(lexer->token, 0);
529a0ef5	341	new_state = IN_START;
b011f619	342	lexer->state = new_state;
7c1e1d54	343	return;
f7c05274 PB	344	default:
	345	break;
	346	}
	347	lexer->state = new_state;
bd3924a3	348	} while (!char_consumed && !flush);
325601b4 AL	349
	350	/* Do not let a single token grow to an arbitrarily large size,
	351	* this is a security consideration.
	352	*/
d2ca7c0b	353	if (lexer->token->len > MAX_TOKEN_SIZE) {
325601b4	354	lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
d2ca7c0b	355	g_string_truncate(lexer->token, 0);
325601b4 AL	356	lexer->state = IN_START;
325601b4 AL	357	}
5ab8558d AL	358	}
5ab8558d AL	359
7c1e1d54	360	void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
5ab8558d AL	361	{
	362	size_t i;
	363
	364	for (i = 0; i < size; i++) {
7c1e1d54	365	json_lexer_feed_char(lexer, buffer[i], false);
5ab8558d	366	}
5ab8558d AL	367	}
5ab8558d AL	368
7c1e1d54	369	void json_lexer_flush(JSONLexer *lexer)
5ab8558d	370	{
7c1e1d54 MAL	371	if (lexer->state != IN_START) {
	372	json_lexer_feed_char(lexer, 0, true);
	373	}
5ab8558d AL	374	}
	375
	376	void json_lexer_destroy(JSONLexer *lexer)
	377	{
d2ca7c0b	378	g_string_free(lexer->token, true);
5ab8558d	379	}