[ovs.git] / ovn / lib / lex.c

/*
 * Copyright (c) 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "lex.h"
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include "dynamic-string.h"
#include "json.h"
#include "packets.h"
#include "util.h"
\f
/* Maps 'format' to a short human-readable name for that format.  The result
 * is a string literal, so the caller must not modify or free it.  Aborts if
 * 'format' is not one of the LEX_F_* values handled below. */
const char *
lex_format_to_string(enum lex_format format)
{
    const char *name;

    switch (format) {
    case LEX_F_DECIMAL:
        name = "decimal";
        break;
    case LEX_F_HEXADECIMAL:
        name = "hexadecimal";
        break;
    case LEX_F_IPV4:
        name = "IPv4";
        break;
    case LEX_F_IPV6:
        name = "IPv6";
        break;
    case LEX_F_ETHERNET:
        name = "Ethernet";
        break;
    default:
        abort();
    }
    return name;
}
\f
/* Puts 'token' into its initial state: type LEX_T_END with no string
 * attached.  Whatever 'token' held before is overwritten, not freed. */
void
lex_token_init(struct lex_token *token)
{
    token->s = NULL;
    token->type = LEX_T_END;
}

/* Frees memory owned by 'token'.
 *
 * 'token->s' is reset to NULL afterward so that destroying the same token a
 * second time, or reading 'token->s' after destruction, does not touch freed
 * memory. */
void
lex_token_destroy(struct lex_token *token)
{
    free(token->s);
    token->s = NULL;
}

/* Exchanges 'a' and 'b'. */
void
lex_token_swap(struct lex_token *a, struct lex_token *b)
{
    struct lex_token tmp = *a;
    *a = *b;
    *b = tmp;
}
\f
/* lex_token_format(). */

/* Returns the offset, within union mf_subvalue, of the member that holds a
 * value of the given 'format' (0 for hexadecimal).  lex_token_get_format()
 * uses this as the number of leading bytes that must be zero for a value to
 * be printable in 'format'. */
static size_t
lex_token_n_zeros(enum lex_format format)
{
    size_t n_zeros;

    switch (format) {
    case LEX_F_HEXADECIMAL:
        n_zeros = 0;
        break;
    case LEX_F_DECIMAL:
        n_zeros = offsetof(union mf_subvalue, integer);
        break;
    case LEX_F_IPV4:
        n_zeros = offsetof(union mf_subvalue, ipv4);
        break;
    case LEX_F_IPV6:
        n_zeros = offsetof(union mf_subvalue, ipv6);
        break;
    case LEX_F_ETHERNET:
        n_zeros = offsetof(union mf_subvalue, mac);
        break;
    default:
        OVS_NOT_REACHED();
    }
    return n_zeros;
}

/* Determines the format in which 'token' should really be printed.
 *
 * Normally that is just 'token->format'.  However, nothing stops a caller
 * from building a token whose value (or, for a masked integer, whose mask)
 * has nonzero bytes outside the field that 'token->format' covers, e.g. a
 * value wider than 32 bits labeled LEX_F_IPV4.  Such a token is reported as
 * LEX_F_HEXADECIMAL instead.  (The lexer itself does not produce tokens like
 * that; this guards against confusion in the future.) */
static enum lex_format
lex_token_get_format(const struct lex_token *token)
{
    const size_t n_zeros = lex_token_n_zeros(token->format);

    if (!is_all_zeros(&token->value, n_zeros)) {
        return LEX_F_HEXADECIMAL;
    }
    if (token->type == LEX_T_MASKED_INTEGER
        && !is_all_zeros(&token->mask, n_zeros)) {
        return LEX_F_HEXADECIMAL;
    }
    return token->format;
}

/* Appends to 's' the text for 'value' rendered in the given 'format'.  The
 * caller is responsible for choosing a 'format' that fits 'value'; only the
 * union member that corresponds to 'format' is examined. */
static void
lex_token_format_value(const union mf_subvalue *value,
                       enum lex_format format, struct ds *s)
{
    switch (format) {
    case LEX_F_HEXADECIMAL:
        mf_format_subvalue(value, s);
        return;

    case LEX_F_DECIMAL:
        ds_put_format(s, "%"PRIu64, ntohll(value->integer));
        return;

    case LEX_F_ETHERNET:
        ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
        return;

    case LEX_F_IPV4:
        ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
        return;

    case LEX_F_IPV6:
        ipv6_format_addr(&value->ipv6, s);
        return;

    default:
        OVS_NOT_REACHED();
    }
}

/* Appends to 's' the text for masked-integer 'token', as "value/mask".
 *
 * Both halves are printed in the token's effective format, as computed by
 * lex_token_get_format().  When that format is IPv4 or IPv6 and the mask is a
 * CIDR mask, the mask is abbreviated to its prefix length. */
static void
lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
{
    enum lex_format format = lex_token_get_format(token);

    lex_token_format_value(&token->value, format, s);
    ds_put_char(s, '/');

    /* Both CIDR checks must test the effective 'format' rather than
     * 'token->format'.  If the value was demoted to hexadecimal, printing a
     * prefix length would pair a hexadecimal value with a decimal mask, which
     * lex_parse_mask() rejects as incompatible formats. */
    const union mf_subvalue *mask = &token->mask;
    if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
        ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
    } else if (format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
        ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
    } else {
        lex_token_format_value(&token->mask, format, s);
    }
}

/* Appends a string representation of 'token' to 's', in a format that can be
 * losslessly parsed back by the lexer.  (LEX_T_END and LEX_T_ERROR can't be
 * parsed back.) */
void
lex_token_format(const struct lex_token *token, struct ds *s)
{
    switch (token->type) {
    case LEX_T_END:
        ds_put_cstr(s, "$");
        break;

    case LEX_T_ID:
        ds_put_cstr(s, token->s);
        break;

    case LEX_T_ERROR:
        ds_put_cstr(s, "error(");
        json_string_escape(token->s, s);
        ds_put_char(s, ')');
        break;

    case LEX_T_STRING:
        json_string_escape(token->s, s);
        break;

    case LEX_T_INTEGER:
        lex_token_format_value(&token->value, lex_token_get_format(token), s);
        break;

    case LEX_T_MASKED_INTEGER:
        lex_token_format_masked_integer(token, s);
        break;

    case LEX_T_LPAREN:
        ds_put_cstr(s, "(");
        break;
    case LEX_T_RPAREN:
        ds_put_cstr(s, ")");
        break;
    case LEX_T_LCURLY:
        ds_put_cstr(s, "{");
        break;
    case LEX_T_RCURLY:
        ds_put_cstr(s, "}");
        break;
    case LEX_T_LSQUARE:
        ds_put_cstr(s, "[");
        break;
    case LEX_T_RSQUARE:
        ds_put_cstr(s, "]");
        break;
    case LEX_T_EQ:
        ds_put_cstr(s, "==");
        break;
    case LEX_T_NE:
        ds_put_cstr(s, "!=");
        break;
    case LEX_T_LT:
        ds_put_cstr(s, "<");
        break;
    case LEX_T_LE:
        ds_put_cstr(s, "<=");
        break;
    case LEX_T_GT:
        ds_put_cstr(s, ">");
        break;
    case LEX_T_GE:
        ds_put_cstr(s, ">=");
        break;
    case LEX_T_LOG_NOT:
        ds_put_cstr(s, "!");
        break;
    case LEX_T_LOG_AND:
        ds_put_cstr(s, "&&");
        break;
    case LEX_T_LOG_OR:
        ds_put_cstr(s, "||");
        break;
    case LEX_T_ELLIPSIS:
        ds_put_cstr(s, "..");
        break;
    case LEX_T_COMMA:
        ds_put_cstr(s, ",");
        break;
    case LEX_T_SEMICOLON:
        ds_put_cstr(s, ";");
        break;
    case LEX_T_EQUALS:
        ds_put_cstr(s, "=");
        break;
    case LEX_T_EXCHANGE:
        ds_put_cstr(s, "<->");
        break;
    case LEX_T_DECREMENT:
        ds_put_cstr(s, "--");
        break;
    default:
        OVS_NOT_REACHED();
    }

}
\f
/* lex_token_parse(). */

/* Turns 'token' into a LEX_T_ERROR token whose string is the printf()-style
 * 'message' formatted with the remaining arguments.  'token' must not already
 * own a string. */
static void OVS_PRINTF_FORMAT(2, 3)
lex_error(struct lex_token *token, const char *message, ...)
{
    va_list ap;

    ovs_assert(!token->s);

    va_start(ap, message);
    char *text = xvasprintf(message, ap);
    va_end(ap);

    token->s = text;
    token->type = LEX_T_ERROR;
}

/* Parses the 'len' hexadecimal digits that begin at 'start' into
 * 'token->value' and sets 'token->format' to LEX_F_HEXADECIMAL.
 *
 * Digits are consumed from the last one backward, so the least significant
 * digit lands in the low nibble of the last byte of 'token->value.u8'.  The
 * caller must have zeroed 'token->value', because digits are ORed into it.
 *
 * On a non-hexadecimal character, or a nonzero digit that does not fit in
 * 'token->value', 'token' becomes a LEX_T_ERROR token instead. */
static void
lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
{
    const size_t n_bytes = sizeof token->value.u8;

    for (size_t i = 0; i < len; i++) {
        int hexit = hexit_value(start[len - 1 - i]);
        if (hexit < 0) {
            lex_error(token, "Invalid syntax in hexadecimal constant.");
            return;
        }

        size_t byte = i / 2;    /* Distance from the last byte of the value. */
        if (byte >= n_bytes) {
            if (hexit) {
                lex_error(token, "Hexadecimal constant requires more than "
                          "%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
                return;
            }

            /* A leading zero beyond the width of the value is harmless, but
             * it must not be stored: there is no byte for it, and writing
             * one would land outside 'token->value.u8'. */
            continue;
        }
        token->value.u8[n_bytes - 1 - byte] |= i % 2 ? hexit << 4 : hexit;
    }
    token->format = LEX_F_HEXADECIMAL;
}

/* Parses the integer constant at the start of 'p' into 'token', without
 * considering any mask that might follow it.
 *
 * The constant extends over letters, digits, and colons, plus any period
 * that is not immediately followed by a second period.  The text is then
 * tried, in this order, as an Ethernet address, a decimal number, a
 * hexadecimal number introduced by "0x" or "0X", an IPv4 address, and an
 * IPv6 address.
 *
 * On return 'token' has type LEX_T_INTEGER, with 'value' and 'format' set, or
 * type LEX_T_ERROR.  Returns the position just past the constant. */
static const char *
lex_parse_integer__(const char *p, struct lex_token *token)
{
    lex_token_init(token);
    token->type = LEX_T_INTEGER;
    memset(&token->value, 0, sizeof token->value);

    /* Find where the constant ends. */
    const char *end = p;
    for (;;) {
        unsigned char c = *end;
        bool part_of_constant = (isalnum(c) || c == ':'
                                 || (c == '.' && end[1] != '.'));
        if (!part_of_constant) {
            break;
        }
        end++;
    }
    const size_t len = end - p;

    struct eth_addr mac;
    int n;

    if (len == 0) {
        lex_error(token, "Integer constant expected.");
    } else if (len == 17
               && ovs_scan(p, ETH_ADDR_SCAN_FMT"%n",
                           ETH_ADDR_SCAN_ARGS(mac), &n)
               && n == len) {
        token->format = LEX_F_ETHERNET;
        token->value.mac = mac;
    } else if (strspn(p, "0123456789") == len) {
        /* Nothing but decimal digits. */
        if (p[0] != '0' || len <= 1) {
            char *tail;

            errno = 0;
            unsigned long long int integer = strtoull(p, &tail, 10);
            if (tail == end && errno != ERANGE) {
                token->format = LEX_F_DECIMAL;
                token->value.integer = htonll(integer);
            } else {
                lex_error(token, "Decimal constants must be less than 2**64.");
            }
        } else {
            lex_error(token, "Decimal constants must not have leading zeros.");
        }
    } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
        if (len <= 2) {
            lex_error(token, "Hex digits expected following 0%c.", p[1]);
        } else {
            lex_parse_hex_integer(p + 2, len - 2, token);
        }
    } else {
        /* The only remaining possibilities are IPv4 and IPv6 addresses,
         * which are parsed from a null-terminated copy of the text. */
        bool parsed = false;

        if (len < INET6_ADDRSTRLEN) {
            char copy[INET6_ADDRSTRLEN];

            memcpy(copy, p, len);
            copy[len] = '\0';

            if (ip_parse(copy, &token->value.ipv4)) {
                token->format = LEX_F_IPV4;
                parsed = true;
            } else if (ipv6_parse(copy, &token->value.ipv6)) {
                token->format = LEX_F_IPV6;
                parsed = true;
            }
        }
        if (!parsed) {
            lex_error(token, "Invalid numeric constant.");
        }
    }

    ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
    return end;
}

/* Parses the mask that follows an integer constant.  On entry, 'p' points to
 * the '/' after the value and 'token' is the LEX_T_INTEGER token that was
 * parsed for the value.
 *
 * On success, 'token' becomes a LEX_T_MASKED_INTEGER token.  If the mask
 * cannot be parsed, is incompatible with the value, or leaves 1-bits of the
 * value unmasked, 'token' becomes a LEX_T_ERROR token.  Either way, returns
 * the position just past the mask text. */
static const char *
lex_parse_mask(const char *p, struct lex_token *token)
{
    struct lex_token mask;

    /* Parse just past the '/' as a second integer.  Handle errors. */
    p = lex_parse_integer__(p + 1, &mask);
    if (mask.type == LEX_T_ERROR) {
        lex_token_swap(&mask, token);
        lex_token_destroy(&mask);
        return p;
    }
    ovs_assert(mask.type == LEX_T_INTEGER);

    /* Now convert the value and mask into a masked integer token.
     * We have a few special cases. */
    token->type = LEX_T_MASKED_INTEGER;
    memset(&token->mask, 0, sizeof token->mask);

    /* Keep all 64 bits of a decimal mask.  Narrowing to 32 bits before the
     * range checks below would let an oversized prefix length such as
     * 4294967328 (2**32 + 32) pass for 32. */
    uint64_t prefix_bits = ntohll(mask.value.integer);
    if (token->format == mask.format) {
        /* Same format value and mask is always OK. */
        token->mask = mask.value;
    } else if (token->format == LEX_F_IPV4
               && mask.format == LEX_F_DECIMAL
               && prefix_bits <= 32) {
        /* IPv4 address with decimal mask is a CIDR prefix. */
        ovs_be32 ipv4_mask = be32_prefix_mask((int) prefix_bits);
        token->mask.integer = htonll(ntohl(ipv4_mask));
    } else if (token->format == LEX_F_IPV6
               && mask.format == LEX_F_DECIMAL
               && prefix_bits <= 128) {
        /* IPv6 address with decimal mask is a CIDR prefix. */
        token->mask.ipv6 = ipv6_create_mask((int) prefix_bits);
    } else if (token->format == LEX_F_DECIMAL
               && mask.format == LEX_F_HEXADECIMAL
               && token->value.integer == 0) {
        /* Special case for e.g. 0/0x1234. */
        token->format = LEX_F_HEXADECIMAL;
        token->mask = mask.value;
    } else {
        lex_error(token, "Value and mask have incompatible formats.");
        return p;
    }

    /* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
     * mask. */
    for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
        ovs_be32 v = token->value.be32[i];
        ovs_be32 m = token->mask.be32[i];

        if (v & ~m) {
            lex_error(token, "Value contains unmasked 1-bits.");
            break;
        }
    }

    /* Done! */
    lex_token_destroy(&mask);
    return p;
}

/* Parses an integer constant, together with a "/mask" suffix if one follows
 * directly, from the start of 'p' into 'token'.  Returns the position just
 * past the text that was consumed. */
static const char *
lex_parse_integer(const char *p, struct lex_token *token)
{
    const char *next = lex_parse_integer__(p, token);

    if (token->type != LEX_T_INTEGER || *next != '/') {
        /* Either the value failed to parse or there is no mask. */
        return next;
    }
    return lex_parse_mask(next, token);
}

/* Parses the quoted string whose opening double quote is at 'p' into 'token'.
 *
 * If the closing quote is found, the text between the quotes is handed to
 * json_string_unescape() and 'token' becomes LEX_T_STRING or LEX_T_ERROR
 * according to its result; the return value is the position just past the
 * closing quote.  If the input ends first, 'token' becomes a LEX_T_ERROR
 * token and the return value points to the null terminator. */
static const char *
lex_parse_string(const char *p, struct lex_token *token)
{
    const char *start = p + 1;
    const char *q = start;

    while (*q != '"') {
        if (*q == '\0') {
            lex_error(token, "Input ends inside quoted string.");
            return q;
        }

        /* A backslash also swallows the character after it, so that an
         * escaped quote does not end the string.  A backslash at the very
         * end of the input is skipped alone. */
        if (*q == '\\' && q[1] != '\0') {
            q += 2;
        } else {
            q++;
        }
    }

    bool ok = json_string_unescape(start, q - start, &token->s);
    token->type = ok ? LEX_T_STRING : LEX_T_ERROR;
    return q + 1;
}

/* Returns true if 'c' may be the first character of an identifier, that is,
 * if it is a letter in a-z or A-Z, an underscore, or a period. */
static bool
lex_is_id1(unsigned char c)
{
    switch (c) {
    case '_':
    case '.':
        return true;
    default:
        break;
    }

    bool is_lower = 'a' <= c && c <= 'z';
    bool is_upper = 'A' <= c && c <= 'Z';
    return is_lower || is_upper;
}

/* Returns true if 'c' may appear in an identifier after its first character:
 * anything that lex_is_id1() accepts, or a decimal digit. */
static bool
lex_is_idn(unsigned char c)
{
    bool is_digit = '0' <= c && c <= '9';

    return is_digit || lex_is_id1(c);
}

/* Parses the identifier at the start of 'p' into 'token', which becomes a
 * LEX_T_ID token that owns a copy of the identifier's text.  The first
 * character is consumed without being checked; the identifier then extends
 * over every following character that lex_is_idn() accepts.  Returns the
 * position just past the identifier. */
static const char *
lex_parse_id(const char *p, struct lex_token *token)
{
    size_t len = 1;

    while (lex_is_idn(p[len])) {
        len++;
    }

    token->s = xmemdup0(p, len);
    token->type = LEX_T_ID;
    return p + len;
}

/* Initializes 'token' and parses the first token from the beginning of
 * null-terminated string 'p' into 'token'.  Stores a pointer to the start of
 * the token (after skipping white space and comments, if any) into '*startp'.
 * Returns the character position at which to begin parsing the next token.
 *
 * Input that cannot be tokenized yields a LEX_T_ERROR token whose 's' member
 * holds the error message.  At the end of the input, 'token' is a LEX_T_END
 * token and the return value points to the null terminator. */
const char *
lex_token_parse(struct lex_token *token, const char *p, const char **startp)
{
    lex_token_init(token);

next:
    *startp = p;
    switch (*p) {
    case '\0':
        token->type = LEX_T_END;
        return p;

    case ' ': case '\t': case '\n': case '\r':
        p++;
        goto next;

    case '/':
        p++;
        if (*p == '/') {
            /* Line comment: skip to the end of the line or of the input. */
            do {
                p++;
            } while (*p != '\0' && *p != '\n');
            goto next;
        } else if (*p == '*') {
            /* Block comment: it must be closed before the end of the line.
             * Every path out of this loop jumps or returns. */
            p++;
            for (;;) {
                if (*p == '*' && p[1] == '/') {
                    p += 2;
                    goto next;
                } else if (*p == '\0' || *p == '\n') {
                    lex_error(token, "`/*' without matching `*/'.");
                    return p;
                } else {
                    p++;
                }
            }
        } else {
            lex_error(token,
                      "`/' is only valid as part of `//' or `/*'.");
        }
        break;

    case '(':
        token->type = LEX_T_LPAREN;
        p++;
        break;

    case ')':
        token->type = LEX_T_RPAREN;
        p++;
        break;

    case '{':
        token->type = LEX_T_LCURLY;
        p++;
        break;

    case '}':
        token->type = LEX_T_RCURLY;
        p++;
        break;

    case '[':
        token->type = LEX_T_LSQUARE;
        p++;
        break;

    case ']':
        token->type = LEX_T_RSQUARE;
        p++;
        break;

    case '=':
        p++;
        if (*p == '=') {
            token->type = LEX_T_EQ;
            p++;
        } else {
            token->type = LEX_T_EQUALS;
        }
        break;

    case '!':
        p++;
        if (*p == '=') {
            token->type = LEX_T_NE;
            p++;
        } else {
            token->type = LEX_T_LOG_NOT;
        }
        break;

    case '&':
        p++;
        if (*p == '&') {
            token->type = LEX_T_LOG_AND;
            p++;
        } else {
            lex_error(token, "`&' is only valid as part of `&&'.");
        }
        break;

    case '|':
        p++;
        if (*p == '|') {
            token->type = LEX_T_LOG_OR;
            p++;
        } else {
            lex_error(token, "`|' is only valid as part of `||'.");
        }
        break;

    case '<':
        p++;
        if (*p == '=') {
            token->type = LEX_T_LE;
            p++;
        } else if (*p == '-' && p[1] == '>') {
            token->type = LEX_T_EXCHANGE;
            p += 2;
        } else {
            token->type = LEX_T_LT;
        }
        break;

    case '>':
        p++;
        if (*p == '=') {
            token->type = LEX_T_GE;
            p++;
        } else {
            token->type = LEX_T_GT;
        }
        break;

    case '.':
        p++;
        if (*p == '.') {
            token->type = LEX_T_ELLIPSIS;
            p++;
        } else {
            lex_error(token, "`.' is only valid as part of `..' or a number.");
        }
        break;

    case ',':
        p++;
        token->type = LEX_T_COMMA;
        break;

    case ';':
        p++;
        token->type = LEX_T_SEMICOLON;
        break;

    case '-':
        p++;
        if (*p == '-') {
            token->type = LEX_T_DECREMENT;
            p++;
        } else {
            lex_error(token, "`-' is only valid as part of `--'.");
        }
        break;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case ':':
        p = lex_parse_integer(p, token);
        break;

    case '"':
        p = lex_parse_string(p, token);
        break;

    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        /* We need to distinguish an Ethernet address or IPv6 address from an
         * identifier.  Fortunately, Ethernet addresses and IPv6 addresses that
         * are ambiguous based on the first character, always start with hex
         * digits followed by a colon, but identifiers never do. */
        p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
             ? lex_parse_integer(p, token)
             : lex_parse_id(p, token));
        break;

    default:
        if (lex_is_id1(*p)) {
            p = lex_parse_id(p, token);
        } else {
            if (isprint((unsigned char) *p)) {
                lex_error(token, "Invalid character `%c' in input.", *p);
            } else {
                /* Report the byte in hexadecimal, to match the "0x" prefix,
                 * and as an unsigned value, so that bytes above 0x7f do not
                 * print as negative numbers where 'char' is signed. */
                lex_error(token, "Invalid byte 0x%02x in input.",
                          (unsigned int) (unsigned char) *p);
            }
            p++;
        }
        break;
    }

    return p;
}
\f
/* Prepares 'lexer' to tokenize the string 'input'.
 *
 * 'lexer' keeps a pointer into 'input' rather than a copy, so 'input' must
 * stay valid for as long as 'lexer' is in use.  Ownership of 'input' remains
 * with the caller.
 *
 * No token is read here: the caller must call lexer_get() to obtain the
 * first one. */
void
lexer_init(struct lexer *lexer, const char *input)
{
    lex_token_init(&lexer->token);
    lexer->start = NULL;
    lexer->input = input;
}

/* Releases the memory owned by the current token of 'lexer'.  The input
 * string that 'lexer' was reading is not touched. */
void
lexer_destroy(struct lexer *lexer)
{
    struct lex_token *current = &lexer->token;

    lex_token_destroy(current);
}

/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
 * token's type.  The caller may examine 'lexer->token' directly to obtain full
 * information about the token. */
enum lex_type
lexer_get(struct lexer *lexer)
{
    lex_token_destroy(&lexer->token);
    lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
    return lexer->token.type;
}

/* Returns the type of the next token that will be fetched by lexer_get(),
 * without advancing 'lexer->token' to that token. */
enum lex_type
lexer_lookahead(const struct lexer *lexer)
{
    struct lex_token next;
    enum lex_type type;
    const char *start;

    lex_token_parse(&next, lexer->input, &start);
    type = next.type;
    lex_token_destroy(&next);
    return type;
}

/* Consumes the current token of 'lexer' if it has the given 'type'.  Returns
 * true, after advancing to the next token, on a match; returns false and
 * leaves 'lexer' alone otherwise. */
bool
lexer_match(struct lexer *lexer, enum lex_type type)
{
    if (lexer->token.type != type) {
        return false;
    }

    lexer_get(lexer);
    return true;
}

/* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
 * to the next token and returns true.  Otherwise returns false.  */
bool
lexer_match_id(struct lexer *lexer, const char *id)
{
    if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
        lexer_get(lexer);
        return true;
    } else {
        return false;
    }
}

bool
lexer_is_int(const struct lexer *lexer)
{
    return (lexer->token.type == LEX_T_INTEGER
            && lexer->token.format == LEX_F_DECIMAL
            && ntohll(lexer->token.value.integer) <= INT_MAX);
}

/* If the current token of 'lexer' satisfies lexer_is_int(), stores its value
 * in '*value', advances 'lexer' to the next token, and returns true.
 * Otherwise stores 0 in '*value', leaves 'lexer' alone, and returns false. */
bool
lexer_get_int(struct lexer *lexer, int *value)
{
    if (!lexer_is_int(lexer)) {
        *value = 0;
        return false;
    }

    *value = ntohll(lexer->token.value.integer);
    lexer_get(lexer);
    return true;
}
Commit	Line	Data
10b1662b BP	1	/*
	2	* Copyright (c) 2015 Nicira, Inc.
	3	*
	4	* Licensed under the Apache License, Version 2.0 (the "License");
	5	* you may not use this file except in compliance with the License.
	6	* You may obtain a copy of the License at:
	7	*
	8	* http://www.apache.org/licenses/LICENSE-2.0
	9	*
	10	* Unless required by applicable law or agreed to in writing, software
	11	* distributed under the License is distributed on an "AS IS" BASIS,
	12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	13	* See the License for the specific language governing permissions and
	14	* limitations under the License.
	15	*/
	16
	17	#include <config.h>
	18	#include "lex.h"
	19	#include <ctype.h>
	20	#include <errno.h>
	21	#include <stdarg.h>
	22	#include "dynamic-string.h"
	23	#include "json.h"
e7695092	24	#include "packets.h"
10b1662b	25	#include "util.h"
363b5330 BP	26	\f
	27	/* Returns a string that represents 'format'. */
	28	const char *
	29	lex_format_to_string(enum lex_format format)
	30	{
	31	switch (format) {
	32	case LEX_F_DECIMAL:
	33	return "decimal";
	34	case LEX_F_HEXADECIMAL:
	35	return "hexadecimal";
	36	case LEX_F_IPV4:
	37	return "IPv4";
	38	case LEX_F_IPV6:
	39	return "IPv6";
	40	case LEX_F_ETHERNET:
	41	return "Ethernet";
	42	default:
	43	abort();
	44	}
	45	}
	46	\f
10b1662b BP	47	/* Initializes 'token'. */
	48	void
	49	lex_token_init(struct lex_token *token)
	50	{
	51	token->type = LEX_T_END;
	52	token->s = NULL;
	53	}
	54
	55	/* Frees memory owned by 'token'. */
	56	void
	57	lex_token_destroy(struct lex_token *token)
	58	{
	59	free(token->s);
	60	}
	61
	62	/* Exchanges 'a' and 'b'. */
	63	void
	64	lex_token_swap(struct lex_token a, struct lex_token b)
	65	{
	66	struct lex_token tmp = *a;
	67	a = b;
	68	*b = tmp;
	69	}
	70	\f
	71	/* lex_token_format(). */
	72
	73	static size_t
	74	lex_token_n_zeros(enum lex_format format)
	75	{
	76	switch (format) {
	77	case LEX_F_DECIMAL: return offsetof(union mf_subvalue, integer);
	78	case LEX_F_HEXADECIMAL: return 0;
	79	case LEX_F_IPV4: return offsetof(union mf_subvalue, ipv4);
	80	case LEX_F_IPV6: return offsetof(union mf_subvalue, ipv6);
	81	case LEX_F_ETHERNET: return offsetof(union mf_subvalue, mac);
	82	default: OVS_NOT_REACHED();
	83	}
	84	}
	85
	86	/* Returns the effective format for 'token', that is, the format in which it
	87	* should actually be printed. This is ordinarily the same as 'token->format',
	88	* but it's always possible that someone sets up a token with a format that
	89	* won't work for a value, e.g. 'token->value' is wider than 32 bits but the
	90	* format is LEX_F_IPV4. (The lexer itself won't do that; this is an attempt
	91	* to avoid confusion in the future.) */
	92	static enum lex_format
	93	lex_token_get_format(const struct lex_token *token)
	94	{
	95	size_t n_zeros = lex_token_n_zeros(token->format);
	96	return (is_all_zeros(&token->value, n_zeros)
	97	&& (token->type != LEX_T_MASKED_INTEGER
	98	\|\| is_all_zeros(&token->mask, n_zeros))
	99	? token->format
	100	: LEX_F_HEXADECIMAL);
	101	}
	102
	103	static void
	104	lex_token_format_value(const union mf_subvalue *value,
	105	enum lex_format format, struct ds *s)
	106	{
	107	switch (format) {
	108	case LEX_F_DECIMAL:
	109	ds_put_format(s, "%"PRIu64, ntohll(value->integer));
	110	break;
111
112	case LEX_F_HEXADECIMAL:
113	mf_format_subvalue(value, s);
114	break;
115
116	case LEX_F_IPV4:
117	ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
118	break;
119
120	case LEX_F_IPV6:
ac6d120f	121	ipv6_format_addr(&value->ipv6, s);
10b1662b BP	122	break;
	123
	124	case LEX_F_ETHERNET:
	125	ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
	126	break;
	127
	128	default:
	129	OVS_NOT_REACHED();
	130	}
	131
	132	}
	133
	134	static void
	135	lex_token_format_masked_integer(const struct lex_token token, struct ds s)
	136	{
	137	enum lex_format format = lex_token_get_format(token);
	138
	139	lex_token_format_value(&token->value, format, s);
	140	ds_put_char(s, '/');
	141
	142	const union mf_subvalue *mask = &token->mask;
	143	if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
	144	ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
	145	} else if (token->format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
	146	ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
	147	} else {
	148	lex_token_format_value(&token->mask, format, s);
	149	}
	150	}
	151
10b1662b BP	152	/* Appends a string representation of 'token' to 's', in a format that can be
	153	* losslessly parsed back by the lexer. (LEX_T_END and LEX_T_ERROR can't be
	154	* parsed back.) */
	155	void
3d611299	156	lex_token_format(const struct lex_token token, struct ds s)
10b1662b BP	157	{
	158	switch (token->type) {
	159	case LEX_T_END:
	160	ds_put_cstr(s, "$");
	161	break;
	162
	163	case LEX_T_ID:
	164	ds_put_cstr(s, token->s);
	165	break;
	166
	167	case LEX_T_ERROR:
	168	ds_put_cstr(s, "error(");
3b626771	169	json_string_escape(token->s, s);
10b1662b BP	170	ds_put_char(s, ')');
	171	break;
	172
	173	case LEX_T_STRING:
3b626771	174	json_string_escape(token->s, s);
10b1662b BP	175	break;
	176
	177	case LEX_T_INTEGER:
	178	lex_token_format_value(&token->value, lex_token_get_format(token), s);
	179	break;
	180
	181	case LEX_T_MASKED_INTEGER:
	182	lex_token_format_masked_integer(token, s);
	183	break;
	184
	185	case LEX_T_LPAREN:
	186	ds_put_cstr(s, "(");
	187	break;
	188	case LEX_T_RPAREN:
	189	ds_put_cstr(s, ")");
	190	break;
	191	case LEX_T_LCURLY:
	192	ds_put_cstr(s, "{");
	193	break;
	194	case LEX_T_RCURLY:
	195	ds_put_cstr(s, "}");
	196	break;
	197	case LEX_T_LSQUARE:
	198	ds_put_cstr(s, "[");
	199	break;
	200	case LEX_T_RSQUARE:
	201	ds_put_cstr(s, "]");
	202	break;
	203	case LEX_T_EQ:
	204	ds_put_cstr(s, "==");
	205	break;
	206	case LEX_T_NE:
	207	ds_put_cstr(s, "!=");
	208	break;
	209	case LEX_T_LT:
	210	ds_put_cstr(s, "<");
	211	break;
	212	case LEX_T_LE:
	213	ds_put_cstr(s, "<=");
	214	break;
	215	case LEX_T_GT:
	216	ds_put_cstr(s, ">");
	217	break;
	218	case LEX_T_GE:
	219	ds_put_cstr(s, ">=");
	220	break;
	221	case LEX_T_LOG_NOT:
	222	ds_put_cstr(s, "!");
	223	break;
	224	case LEX_T_LOG_AND:
	225	ds_put_cstr(s, "&&");
	226	break;
	227	case LEX_T_LOG_OR:
	228	ds_put_cstr(s, "\|\|");
	229	break;
	230	case LEX_T_ELLIPSIS:
	231	ds_put_cstr(s, "..");
	232	break;
	233	case LEX_T_COMMA:
	234	ds_put_cstr(s, ",");
	235	break;
	236	case LEX_T_SEMICOLON:
	237	ds_put_cstr(s, ";");
	238	break;
239	case LEX_T_EQUALS:
240	ds_put_cstr(s, "=");
241	break;
a20c96c6 BP	242	case LEX_T_EXCHANGE:
	243	ds_put_cstr(s, "<->");
	244	break;
56091efe BP	245	case LEX_T_DECREMENT:
	246	ds_put_cstr(s, "--");
	247	break;
10b1662b BP	248	default:
	249	OVS_NOT_REACHED();
	250	}
	251
	252	}
	253	\f
	254	/* lex_token_parse(). */
	255
	256	static void OVS_PRINTF_FORMAT(2, 3)
	257	lex_error(struct lex_token token, const char message, ...)
	258	{
	259	ovs_assert(!token->s);
	260	token->type = LEX_T_ERROR;
	261
	262	va_list args;
	263	va_start(args, message);
	264	token->s = xvasprintf(message, args);
	265	va_end(args);
	266	}
	267
	268	static void
	269	lex_parse_hex_integer(const char start, size_t len, struct lex_token token)
	270	{
	271	const char *in = start + (len - 1);
	272	uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
	273
	274	for (int i = 0; i < len; i++) {
	275	int hexit = hexit_value(in[-i]);
	276	if (hexit < 0) {
	277	lex_error(token, "Invalid syntax in hexadecimal constant.");
	278	return;
	279	}
	280	if (hexit && i / 2 >= sizeof token->value.u8) {
	281	lex_error(token, "Hexadecimal constant requires more than "
	282	"%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
	283	return;
	284	}
	285	out[-(i / 2)] \|= i % 2 ? hexit << 4 : hexit;
	286	}
	287	token->format = LEX_F_HEXADECIMAL;
	288	}
	289
	290	static const char *
	291	lex_parse_integer__(const char p, struct lex_token token)
	292	{
	293	lex_token_init(token);
	294	token->type = LEX_T_INTEGER;
	295	memset(&token->value, 0, sizeof token->value);
	296	const char *start = p;
	297	const char *end = start;
	298	while (isalnum((unsigned char) end) \|\| end == ':'
	299	\|\| (*end == '.' && end[1] != '.')) {
	300	end++;
	301	}
	302	size_t len = end - start;
	303
	304	int n;
74ff3298	305	struct eth_addr mac;
10b1662b BP	306
	307	if (!len) {
	308	lex_error(token, "Integer constant expected.");
	309	} else if (len == 17
	310	&& ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
	311	ETH_ADDR_SCAN_ARGS(mac), &n)
	312	&& n == len) {
74ff3298	313	token->value.mac = mac;
10b1662b BP	314	token->format = LEX_F_ETHERNET;
	315	} else if (start + strspn(start, "0123456789") == end) {
	316	if (p[0] == '0' && len > 1) {
	317	lex_error(token, "Decimal constants must not have leading zeros.");
	318	} else {
	319	unsigned long long int integer;
	320	char *tail;
	321
	322	errno = 0;
	323	integer = strtoull(p, &tail, 10);
	324	if (tail != end \|\| errno == ERANGE) {
	325	lex_error(token, "Decimal constants must be less than 2**64.");
	326	} else {
	327	token->value.integer = htonll(integer);
	328	token->format = LEX_F_DECIMAL;
	329	}
	330	}
	331	} else if (p[0] == '0' && (p[1] == 'x' \|\| p[1] == 'X')) {
	332	if (len > 2) {
	333	lex_parse_hex_integer(start + 2, len - 2, token);
	334	} else {
	335	lex_error(token, "Hex digits expected following 0%c.", p[1]);
	336	}
	337	} else if (len < INET6_ADDRSTRLEN) {
	338	char copy[INET6_ADDRSTRLEN];
	339	memcpy(copy, p, len);
	340	copy[len] = '\0';
	341
e7695092	342	if (ip_parse(copy, &token->value.ipv4)) {
10b1662b	343	token->format = LEX_F_IPV4;
e7695092	344	} else if (ipv6_parse(copy, &token->value.ipv6)) {
10b1662b BP	345	token->format = LEX_F_IPV6;
	346	} else {
	347	lex_error(token, "Invalid numeric constant.");
	348	}
	349	} else {
	350	lex_error(token, "Invalid numeric constant.");
	351	}
	352
	353	ovs_assert(token->type == LEX_T_INTEGER \|\| token->type == LEX_T_ERROR);
	354	return end;
	355	}
	356
	357	static const char *
	358	lex_parse_mask(const char p, struct lex_token token)
	359	{
	360	struct lex_token mask;
	361
	362	/* Parse just past the '/' as a second integer. Handle errors. */
	363	p = lex_parse_integer__(p + 1, &mask);
	364	if (mask.type == LEX_T_ERROR) {
	365	lex_token_swap(&mask, token);
	366	lex_token_destroy(&mask);
	367	return p;
	368	}
	369	ovs_assert(mask.type == LEX_T_INTEGER);
	370
	371	/* Now convert the value and mask into a masked integer token.
	372	* We have a few special cases. */
	373	token->type = LEX_T_MASKED_INTEGER;
	374	memset(&token->mask, 0, sizeof token->mask);
	375	uint32_t prefix_bits = ntohll(mask.value.integer);
	376	if (token->format == mask.format) {
	377	/* Same format value and mask is always OK. */
	378	token->mask = mask.value;
	379	} else if (token->format == LEX_F_IPV4
	380	&& mask.format == LEX_F_DECIMAL
	381	&& prefix_bits <= 32) {
	382	/* IPv4 address with decimal mask is a CIDR prefix. */
	383	token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
	384	} else if (token->format == LEX_F_IPV6
	385	&& mask.format == LEX_F_DECIMAL
	386	&& prefix_bits <= 128) {
	387	/* IPv6 address with decimal mask is a CIDR prefix. */
	388	token->mask.ipv6 = ipv6_create_mask(prefix_bits);
	389	} else if (token->format == LEX_F_DECIMAL
	390	&& mask.format == LEX_F_HEXADECIMAL
	391	&& token->value.integer == 0) {
	392	/* Special case for e.g. 0/0x1234. */
	393	token->format = LEX_F_HEXADECIMAL;
	394	token->mask = mask.value;
	395	} else {
	396	lex_error(token, "Value and mask have incompatible formats.");
	397	return p;
	398	}
	399
	400	/* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
	401	* mask. */
	402	for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
	403	ovs_be32 v = token->value.be32[i];
	404	ovs_be32 m = token->mask.be32[i];
	405
	406	if (v & ~m) {
	407	lex_error(token, "Value contains unmasked 1-bits.");
	408	break;
409	}
410	}
411
412	/* Done! */
413	lex_token_destroy(&mask);
414	return p;
415	}
416
417	static const char *
418	lex_parse_integer(const char p, struct lex_token token)
419	{
420	p = lex_parse_integer__(p, token);
421	if (token->type == LEX_T_INTEGER && *p == '/') {
422	p = lex_parse_mask(p, token);
423	}
424	return p;
425	}
426
427	static const char *
428	lex_parse_string(const char p, struct lex_token token)
429	{
430	const char *start = ++p;
431	for (;;) {
432	switch (*p) {
433	case '\0':
434	lex_error(token, "Input ends inside quoted string.");
435	return p;
436
437	case '"':
438	token->type = (json_string_unescape(start, p - start, &token->s)
439	? LEX_T_STRING : LEX_T_ERROR);
440	return p + 1;
441
442	case '\\':
443	p++;
444	if (*p) {
445	p++;
446	}
447	break;
448
449	default:
450	p++;
451	break;
452	}
453	}
454	}
455
/* Returns true if 'c' may begin an identifier: an ASCII letter, '_' or '.'. */
static bool
lex_is_id1(unsigned char c)
{
    return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
            || c == '_' || c == '.');
}
462
/* Returns true if 'c' may appear after the first character of an identifier:
 * anything lex_is_id1() accepts, or an ASCII digit. */
static bool
lex_is_idn(unsigned char c)
{
    return lex_is_id1(c) || (c >= '0' && c <= '9');
}
468
469	static const char *
470	lex_parse_id(const char p, struct lex_token token)
471	{
472	const char *start = p;
473
474	do {
475	p++;
476	} while (lex_is_idn(*p));
477
478	token->type = LEX_T_ID;
479	token->s = xmemdup0(start, p - start);
480	return p;
481	}
482
483	/* Initializes 'token' and parses the first token from the beginning of
484	* null-terminated string 'p' into 'token'. Stores a pointer to the start of
485	* the token (after skipping white space and comments, if any) into '*startp'.
486	* Returns the character position at which to begin parsing the next token. */
487	const char *
488	lex_token_parse(struct lex_token token, const char p, const char **startp)
489	{
490	lex_token_init(token);
491
492	next:
493	*startp = p;
494	switch (*p) {
495	case '\0':
496	token->type = LEX_T_END;
497	return p;
498
499	case ' ': case '\t': case '\n': case '\r':
500	p++;
501	goto next;
502
503	case '/':
504	p++;
505	if (*p == '/') {
506	do {
507	p++;
508	} while (p != '\0' && p != '\n');
509	goto next;
510	} else if (p == '') {
511	p++;
512	for (;;) {
513	if (p == '' && p[1] == '/') {
514	p += 2;
515	goto next;
516	} else if (p == '\0' \|\| p == '\n') {
517	lex_error(token, "`/' without matching `/'.");
518	return p;
519	} else {
520	p++;
521	}
522	}
523	goto next;
524	} else {
525	lex_error(token,
526	"`/' is only valid as part of `//' or `/*'.");
527	}
528	break;
529
530	case '(':
531	token->type = LEX_T_LPAREN;
532	p++;
533	break;
534
535	case ')':
536	token->type = LEX_T_RPAREN;
537	p++;
538	break;
539
540	case '{':
541	token->type = LEX_T_LCURLY;
542	p++;
543	break;
544
545	case '}':
546	token->type = LEX_T_RCURLY;
547	p++;
548	break;
549
550	case '[':
551	token->type = LEX_T_LSQUARE;
552	p++;
553	break;
554
555	case ']':
556	token->type = LEX_T_RSQUARE;
557	p++;
558	break;
559
560	case '=':
561	p++;
562	if (*p == '=') {
563	token->type = LEX_T_EQ;
564	p++;
565	} else {
566	token->type = LEX_T_EQUALS;
567	}
568	break;
569
570	case '!':
571	p++;
572	if (*p == '=') {
573	token->type = LEX_T_NE;
574	p++;
575	} else {
576	token->type = LEX_T_LOG_NOT;
577	}
578	break;
579
580	case '&':
581	p++;
582	if (*p == '&') {
583	token->type = LEX_T_LOG_AND;
584	p++;
585	} else {
586	lex_error(token, "`&' is only valid as part of `&&'.");
587	}
588	break;
589
590	case '\|':
591	p++;
592	if (*p == '\|') {
593	token->type = LEX_T_LOG_OR;
594	p++;
595	} else {
596	lex_error(token, "`\|' is only valid as part of `\|\|'.");
597	}
598	break;
599
600	case '<':
601	p++;
602	if (*p == '=') {
603	token->type = LEX_T_LE;
604	p++;
a20c96c6 BP	605	} else if (*p == '-' && p[1] == '>') {
	606	token->type = LEX_T_EXCHANGE;
	607	p += 2;
10b1662b BP	608	} else {
	609	token->type = LEX_T_LT;
	610	}
	611	break;
	612
	613	case '>':
	614	p++;
	615	if (*p == '=') {
	616	token->type = LEX_T_GE;
	617	p++;
	618	} else {
	619	token->type = LEX_T_GT;
	620	}
	621	break;
	622
	623	case '.':
	624	p++;
	625	if (*p == '.') {
	626	token->type = LEX_T_ELLIPSIS;
	627	p++;
	628	} else {
	629	lex_error(token, "`.' is only valid as part of `..' or a number.");
	630	}
	631	break;
	632
	633	case ',':
	634	p++;
	635	token->type = LEX_T_COMMA;
	636	break;
	637
	638	case ';':
	639	p++;
	640	token->type = LEX_T_SEMICOLON;
	641	break;
	642
56091efe BP	643	case '-':
	644	p++;
	645	if (*p == '-') {
	646	token->type = LEX_T_DECREMENT;
	647	p++;
	648	} else {
	649	lex_error(token, "`-' is only valid as part of `--'.");
	650	}
	651	break;
	652
10b1662b BP	653	case '0': case '1': case '2': case '3': case '4':
	654	case '5': case '6': case '7': case '8': case '9':
	655	case ':':
	656	p = lex_parse_integer(p, token);
	657	break;
	658
	659	case '"':
	660	p = lex_parse_string(p, token);
	661	break;
	662
	663	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	664	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	665	/* We need to distinguish an Ethernet address or IPv6 address from an
	666	* identifier. Fortunately, Ethernet addresses and IPv6 addresses that
	667	* are ambiguous based on the first character, always start with hex
	668	* digits followed by a colon, but identifiers never do. */
	669	p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
	670	? lex_parse_integer(p, token)
	671	: lex_parse_id(p, token));
	672	break;
	673
	674	default:
	675	if (lex_is_id1(*p)) {
	676	p = lex_parse_id(p, token);
	677	} else {
	678	if (isprint((unsigned char) *p)) {
	679	lex_error(token, "Invalid character `%c' in input.", *p);
	680	} else {
	681	lex_error(token, "Invalid byte 0x%d in input.", *p);
	682	}
	683	p++;
	684	}
	685	break;
	686	}
	687
	688	return p;
	689	}
	690	\f
	691	/* Initializes 'lexer' for parsing 'input'.
	692	*
	693	* While the lexer is in use, 'input' must remain available, but the caller
	694	* otherwise retains ownership of 'input'.
	695	*
	696	* The caller must call lexer_get() to obtain the first token. */
	697	void
	698	lexer_init(struct lexer lexer, const char input)
	699	{
	700	lexer->input = input;
	701	lexer->start = NULL;
	702	lex_token_init(&lexer->token);
	703	}
	704
	705	/* Frees storage associated with 'lexer'. */
	706	void
	707	lexer_destroy(struct lexer *lexer)
	708	{
	709	lex_token_destroy(&lexer->token);
	710	}
	711
	712	/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
	713	* token's type. The caller may examine 'lexer->token' directly to obtain full
	714	* information about the token. */
	715	enum lex_type
	716	lexer_get(struct lexer *lexer)
717	{
718	lex_token_destroy(&lexer->token);
719	lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
720	return lexer->token.type;
721	}
722
27912fdb BP	723	/* Returns the type of the next token that will be fetched by lexer_get(),
	724	* without advancing 'lexer->token' to that token. */
	725	enum lex_type
	726	lexer_lookahead(const struct lexer *lexer)
	727	{
	728	struct lex_token next;
	729	enum lex_type type;
	730	const char *start;
	731
	732	lex_token_parse(&next, lexer->input, &start);
	733	type = next.type;
	734	lex_token_destroy(&next);
	735	return type;
	736	}
	737
10b1662b BP	738	/* If 'lexer''s current token has the given 'type', advances 'lexer' to the
	739	* next token and returns true. Otherwise returns false. */
	740	bool
	741	lexer_match(struct lexer *lexer, enum lex_type type)
	742	{
	743	if (lexer->token.type == type) {
	744	lexer_get(lexer);
	745	return true;
	746	} else {
	747	return false;
	748	}
	749	}
27912fdb BP	750
	751	/* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
	752	* to the next token and returns true. Otherwise returns false. */
	753	bool
	754	lexer_match_id(struct lexer lexer, const char id)
	755	{
	756	if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
	757	lexer_get(lexer);
	758	return true;
	759	} else {
	760	return false;
	761	}
	762	}
558ec83d BP	763
	764	bool
	765	lexer_is_int(const struct lexer *lexer)
	766	{
	767	return (lexer->token.type == LEX_T_INTEGER
	768	&& lexer->token.format == LEX_F_DECIMAL
	769	&& ntohll(lexer->token.value.integer) <= INT_MAX);
	770	}
	771
	772	bool
	773	lexer_get_int(struct lexer lexer, int value)
	774	{
	775	if (lexer_is_int(lexer)) {
	776	*value = ntohll(lexer->token.value.integer);
	777	lexer_get(lexer);
	778	return true;
	779	} else {
	780	*value = 0;
	781	return false;
	782	}
	783	}