[ovs.git] / ovn / lib / lex.c

/*
 * Copyright (c) 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "lex.h"
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include "dynamic-string.h"
#include "json.h"
#include "util.h"
\f
/* Returns a human-readable, statically allocated name for 'format'.  Aborts
 * if 'format' is not one of the known LEX_F_* values. */
const char *
lex_format_to_string(enum lex_format format)
{
    const char *name;

    switch (format) {
    case LEX_F_DECIMAL:
        name = "decimal";
        break;
    case LEX_F_HEXADECIMAL:
        name = "hexadecimal";
        break;
    case LEX_F_IPV4:
        name = "IPv4";
        break;
    case LEX_F_IPV6:
        name = "IPv6";
        break;
    case LEX_F_ETHERNET:
        name = "Ethernet";
        break;
    default:
        abort();
    }
    return name;
}
\f
/* Puts 'token' into its initial state: an end-of-input token that owns no
 * string. */
void
lex_token_init(struct lex_token *token)
{
    token->s = NULL;
    token->type = LEX_T_END;
}

/* Releases the string, if any, that 'token' owns.  'token' itself is not
 * freed. */
void
lex_token_destroy(struct lex_token *token)
{
    char *owned = token->s;

    free(owned);
}

/* Exchanges 'a' and 'b'. */
void
lex_token_swap(struct lex_token *a, struct lex_token *b)
{
    struct lex_token tmp = *a;
    *a = *b;
    *b = tmp;
}
\f
/* lex_token_format(). */

/* Returns the number of leading bytes of a union mf_subvalue that lie before
 * the member used to hold a value in the given 'format', that is, the bytes
 * that must be zero for the value to be representable in 'format'. */
static size_t
lex_token_n_zeros(enum lex_format format)
{
    size_t n_leading;

    switch (format) {
    case LEX_F_HEXADECIMAL:
        n_leading = 0;
        break;
    case LEX_F_DECIMAL:
        n_leading = offsetof(union mf_subvalue, integer);
        break;
    case LEX_F_IPV4:
        n_leading = offsetof(union mf_subvalue, ipv4);
        break;
    case LEX_F_IPV6:
        n_leading = offsetof(union mf_subvalue, ipv6);
        break;
    case LEX_F_ETHERNET:
        n_leading = offsetof(union mf_subvalue, mac);
        break;
    default:
        OVS_NOT_REACHED();
    }
    return n_leading;
}

/* Returns the format in which 'token' should actually be printed.
 *
 * Normally this is just 'token->format'.  If the token's value (or, for a
 * masked integer, its mask) has nonzero bytes ahead of the field that
 * 'token->format' uses, the value does not fit that format and
 * LEX_F_HEXADECIMAL is returned instead.  The lexer itself never builds such
 * a token; this guards against tokens constructed by other code. */
static enum lex_format
lex_token_get_format(const struct lex_token *token)
{
    size_t n_zeros = lex_token_n_zeros(token->format);

    if (!is_all_zeros(&token->value, n_zeros)) {
        return LEX_F_HEXADECIMAL;
    }
    if (token->type == LEX_T_MASKED_INTEGER
        && !is_all_zeros(&token->mask, n_zeros)) {
        return LEX_F_HEXADECIMAL;
    }
    return token->format;
}

/* Appends to 's' the textual form of 'value' interpreted according to
 * 'format'. */
static void
lex_token_format_value(const union mf_subvalue *value,
                       enum lex_format format, struct ds *s)
{
    switch (format) {
    case LEX_F_HEXADECIMAL:
        mf_format_subvalue(value, s);
        return;

    case LEX_F_DECIMAL: {
        /* The integer is stored in network byte order. */
        uint64_t host_order = ntohll(value->integer);
        ds_put_format(s, "%"PRIu64, host_order);
        return;
    }

    case LEX_F_ETHERNET:
        ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
        return;

    case LEX_F_IPV4:
        ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
        return;

    case LEX_F_IPV6:
        print_ipv6_addr(s, &value->ipv6);
        return;

    default:
        OVS_NOT_REACHED();
    }
}

/* Appends to 's' the "value/mask" form of masked integer 'token'.
 *
 * Both halves are printed in the token's effective format (see
 * lex_token_get_format()).  When that format is IPv4 or IPv6 and the mask is
 * a CIDR mask, the mask is abbreviated to its prefix length, e.g. "/24".
 *
 * The CIDR abbreviation is keyed on the effective format rather than on
 * 'token->format': if the value had to fall back to hexadecimal, the mask
 * must be printed in hexadecimal too, so that value and mask stay in
 * matching formats. */
static void
lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
{
    enum lex_format format = lex_token_get_format(token);

    lex_token_format_value(&token->value, format, s);
    ds_put_char(s, '/');

    const union mf_subvalue *mask = &token->mask;
    if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
        ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
    } else if (format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
        ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
    } else {
        lex_token_format_value(mask, format, s);
    }
}

/* Appends a string representation of 'token' to 's', in a format that can be
 * losslessly parsed back by the lexer.  (LEX_T_END and LEX_T_ERROR can't be
 * parsed back.) */
void
lex_token_format(const struct lex_token *token, struct ds *s)
{
    switch (token->type) {
    case LEX_T_END:
        ds_put_cstr(s, "$");
        break;

    case LEX_T_ID:
        ds_put_cstr(s, token->s);
        break;

    case LEX_T_ERROR:
        ds_put_cstr(s, "error(");
        json_string_escape(token->s, s);
        ds_put_char(s, ')');
        break;

    case LEX_T_STRING:
        json_string_escape(token->s, s);
        break;

    case LEX_T_INTEGER:
        lex_token_format_value(&token->value, lex_token_get_format(token), s);
        break;

    case LEX_T_MASKED_INTEGER:
        lex_token_format_masked_integer(token, s);
        break;

    case LEX_T_LPAREN:
        ds_put_cstr(s, "(");
        break;
    case LEX_T_RPAREN:
        ds_put_cstr(s, ")");
        break;
    case LEX_T_LCURLY:
        ds_put_cstr(s, "{");
        break;
    case LEX_T_RCURLY:
        ds_put_cstr(s, "}");
        break;
    case LEX_T_LSQUARE:
        ds_put_cstr(s, "[");
        break;
    case LEX_T_RSQUARE:
        ds_put_cstr(s, "]");
        break;
    case LEX_T_EQ:
        ds_put_cstr(s, "==");
        break;
    case LEX_T_NE:
        ds_put_cstr(s, "!=");
        break;
    case LEX_T_LT:
        ds_put_cstr(s, "<");
        break;
    case LEX_T_LE:
        ds_put_cstr(s, "<=");
        break;
    case LEX_T_GT:
        ds_put_cstr(s, ">");
        break;
    case LEX_T_GE:
        ds_put_cstr(s, ">=");
        break;
    case LEX_T_LOG_NOT:
        ds_put_cstr(s, "!");
        break;
    case LEX_T_LOG_AND:
        ds_put_cstr(s, "&&");
        break;
    case LEX_T_LOG_OR:
        ds_put_cstr(s, "||");
        break;
    case LEX_T_ELLIPSIS:
        ds_put_cstr(s, "..");
        break;
    case LEX_T_COMMA:
        ds_put_cstr(s, ",");
        break;
    case LEX_T_SEMICOLON:
        ds_put_cstr(s, ";");
        break;
    case LEX_T_EQUALS:
        ds_put_cstr(s, "=");
        break;
    case LEX_T_EXCHANGE:
        ds_put_cstr(s, "<->");
        break;
    case LEX_T_DECREMENT:
        ds_put_cstr(s, "--");
        break;
    default:
        OVS_NOT_REACHED();
    }

}
\f
/* lex_token_parse(). */

/* Turns 'token' into a LEX_T_ERROR token whose string is the printf-style
 * 'message' formatted with the remaining arguments.  'token' must not
 * already own a string. */
static void OVS_PRINTF_FORMAT(2, 3)
lex_error(struct lex_token *token, const char *message, ...)
{
    va_list ap;

    ovs_assert(!token->s);

    va_start(ap, message);
    char *text = xvasprintf(message, ap);
    va_end(ap);

    token->type = LEX_T_ERROR;
    token->s = text;
}

/* Parses the 'len' hexadecimal digits beginning at 'start' (the part of a
 * constant that follows "0x") into 'token->value' and sets 'token->format'
 * to LEX_F_HEXADECIMAL.
 *
 * Digits are consumed right to left, so that the last digit lands in the low
 * nibble of the last byte of 'token->value.u8'.  Nibbles are OR'd into place,
 * so the value bytes must be zero on entry.
 *
 * On a non-hex character, or if a nonzero digit would fall beyond the first
 * byte of 'token->value.u8', 'token' becomes an error token instead.  Leading
 * zero digits that extend beyond the value are accepted and ignored. */
static void
lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
{
    const size_t n_bytes = sizeof token->value.u8;

    for (size_t i = 0; i < len; i++) {
        int hexit = hexit_value(start[len - 1 - i]);
        if (hexit < 0) {
            lex_error(token, "Invalid syntax in hexadecimal constant.");
            return;
        }

        size_t byte = i / 2;        /* Distance from the last value byte. */
        if (byte >= n_bytes) {
            if (hexit) {
                lex_error(token, "Hexadecimal constant requires more than "
                          "%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
                return;
            }
            /* A leading zero past the start of the value: there is nothing
             * to store, and touching the byte would be out of bounds. */
            continue;
        }
        token->value.u8[n_bytes - 1 - byte] |= i % 2 ? hexit << 4 : hexit;
    }
    token->format = LEX_F_HEXADECIMAL;
}

/* Initializes 'token' and parses an unmasked numeric constant from the start
 * of 'p' into it.
 *
 * The constant extends over the longest run of alphanumerics, ':' and '.'
 * characters, except that a '.' immediately followed by another '.' ends the
 * run.  That text is then tried, in order, as an Ethernet address, a decimal
 * integer, a "0x" hexadecimal integer, an IPv4 address, and an IPv6 address.
 *
 * On success 'token' is a LEX_T_INTEGER with its value and format set; on
 * failure it is a LEX_T_ERROR.  Either way, returns a pointer just past the
 * run of characters examined. */
static const char *
lex_parse_integer__(const char *p, struct lex_token *token)
{
    lex_token_init(token);
    token->type = LEX_T_INTEGER;
    memset(&token->value, 0, sizeof token->value);

    /* Find the end of the candidate constant. */
    const char *end;
    for (end = p; ; end++) {
        unsigned char c = *end;
        bool in_constant = (isalnum(c) || c == ':'
                            || (c == '.' && end[1] != '.'));
        if (!in_constant) {
            break;
        }
    }
    size_t len = end - p;

    struct eth_addr mac;
    int n;

    if (len == 0) {
        lex_error(token, "Integer constant expected.");
    } else if (len == 17
               && ovs_scan(p, ETH_ADDR_SCAN_FMT"%n",
                           ETH_ADDR_SCAN_ARGS(mac), &n)
               && n == len) {
        token->format = LEX_F_ETHERNET;
        token->value.mac = mac;
    } else if (p + strspn(p, "0123456789") == end) {
        /* Nothing but decimal digits. */
        if (len > 1 && p[0] == '0') {
            lex_error(token, "Decimal constants must not have leading zeros.");
        } else {
            char *tail;

            errno = 0;
            unsigned long long int integer = strtoull(p, &tail, 10);
            if (tail == end && errno != ERANGE) {
                token->format = LEX_F_DECIMAL;
                token->value.integer = htonll(integer);
            } else {
                lex_error(token, "Decimal constants must be less than 2**64.");
            }
        }
    } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
        if (len <= 2) {
            lex_error(token, "Hex digits expected following 0%c.", p[1]);
        } else {
            lex_parse_hex_integer(p + 2, len - 2, token);
        }
    } else {
        bool parsed = false;

        if (len < INET6_ADDRSTRLEN) {
            /* inet_pton() needs a null-terminated string. */
            char copy[INET6_ADDRSTRLEN];
            struct in6_addr ipv6;
            struct in_addr ipv4;

            memcpy(copy, p, len);
            copy[len] = '\0';

            if (inet_pton(AF_INET, copy, &ipv4) == 1) {
                token->format = LEX_F_IPV4;
                token->value.ipv4 = ipv4.s_addr;
                parsed = true;
            } else if (inet_pton(AF_INET6, copy, &ipv6) == 1) {
                token->format = LEX_F_IPV6;
                token->value.ipv6 = ipv6;
                parsed = true;
            }
        }
        if (!parsed) {
            lex_error(token, "Invalid numeric constant.");
        }
    }

    ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
    return end;
}

/* Parses the mask part of a masked integer.  On entry 'p' points to the '/'
 * that follows an integer constant already parsed into 'token' (a
 * LEX_T_INTEGER).
 *
 * On success 'token' becomes a LEX_T_MASKED_INTEGER with 'token->mask' set.
 * If the mask cannot be parsed, if its format is incompatible with the
 * value's format, or if the value has 1-bits where the mask has 0-bits,
 * 'token' becomes a LEX_T_ERROR instead.
 *
 * Returns a pointer just past the mask text. */
static const char *
lex_parse_mask(const char *p, struct lex_token *token)
{
    struct lex_token mask;

    /* Parse just past the '/' as a second integer.  On error, hand the error
     * token to the caller and discard the value parsed so far. */
    p = lex_parse_integer__(p + 1, &mask);
    if (mask.type == LEX_T_ERROR) {
        lex_token_swap(&mask, token);
        lex_token_destroy(&mask);
        return p;
    }
    ovs_assert(mask.type == LEX_T_INTEGER);

    /* Now convert the value and mask into a masked integer token.
     * We have a few special cases. */
    token->type = LEX_T_MASKED_INTEGER;
    memset(&token->mask, 0, sizeof token->mask);

    /* Keep the full 64 bits: narrowing here would let an oversized prefix
     * length such as 2**32 + 24 pass the range checks below as 24.  The value
     * is only meaningful when 'mask.format' is LEX_F_DECIMAL. */
    uint64_t prefix_bits = ntohll(mask.value.integer);
    if (token->format == mask.format) {
        /* Same format value and mask is always OK. */
        token->mask = mask.value;
    } else if (token->format == LEX_F_IPV4
               && mask.format == LEX_F_DECIMAL
               && prefix_bits <= 32) {
        /* IPv4 address with decimal mask is a CIDR prefix. */
        token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
    } else if (token->format == LEX_F_IPV6
               && mask.format == LEX_F_DECIMAL
               && prefix_bits <= 128) {
        /* IPv6 address with decimal mask is a CIDR prefix. */
        token->mask.ipv6 = ipv6_create_mask(prefix_bits);
    } else if (token->format == LEX_F_DECIMAL
               && mask.format == LEX_F_HEXADECIMAL
               && token->value.integer == 0) {
        /* Special case for e.g. 0/0x1234. */
        token->format = LEX_F_HEXADECIMAL;
        token->mask = mask.value;
    } else {
        lex_error(token, "Value and mask have incompatible formats.");
        lex_token_destroy(&mask);
        return p;
    }

    /* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
     * mask. */
    for (size_t i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
        ovs_be32 v = token->value.be32[i];
        ovs_be32 m = token->mask.be32[i];

        if (v & ~m) {
            lex_error(token, "Value contains unmasked 1-bits.");
            break;
        }
    }

    /* Done! */
    lex_token_destroy(&mask);
    return p;
}

/* Parses a numeric constant, optionally followed by "/mask", from the start
 * of 'p' into 'token'.  Returns a pointer just past the text consumed. */
static const char *
lex_parse_integer(const char *p, struct lex_token *token)
{
    const char *next = lex_parse_integer__(p, token);

    if (token->type != LEX_T_INTEGER || *next != '/') {
        return next;
    }
    return lex_parse_mask(next, token);
}

/* Parses a double-quoted string whose opening quote is at 'p' into 'token'.
 *
 * The text between the quotes is passed to json_string_unescape(); 'token'
 * becomes a LEX_T_STRING if that succeeds and a LEX_T_ERROR otherwise.  If
 * the input ends before the closing quote, 'token' becomes a LEX_T_ERROR and
 * the return value points to the terminating null.  Otherwise, returns a
 * pointer just past the closing quote. */
static const char *
lex_parse_string(const char *p, struct lex_token *token)
{
    const char *content = p + 1;
    const char *q = content;

    while (*q != '"') {
        if (*q == '\0') {
            lex_error(token, "Input ends inside quoted string.");
            return q;
        }

        /* Step over a backslash together with the character it escapes, so
         * that an escaped quote does not end the string. */
        if (*q == '\\' && q[1] != '\0') {
            q += 2;
        } else {
            q++;
        }
    }

    bool ok = json_string_unescape(content, q - content, &token->s);
    token->type = ok ? LEX_T_STRING : LEX_T_ERROR;
    return q + 1;
}

/* Returns true if 'c' is a letter, '_' or '.', the characters this lexer
 * accepts at the start of an identifier. */
static bool
lex_is_id1(unsigned char c)
{
    bool lowercase = c >= 'a' && c <= 'z';
    bool uppercase = c >= 'A' && c <= 'Z';

    return lowercase || uppercase || c == '_' || c == '.';
}

/* Returns true if 'c' may appear in an identifier after its first character:
 * a decimal digit or anything lex_is_id1() accepts. */
static bool
lex_is_idn(unsigned char c)
{
    if (c >= '0' && c <= '9') {
        return true;
    }
    return lex_is_id1(c);
}

/* Parses an identifier starting at 'p' into 'token', which becomes a LEX_T_ID
 * owning a copy of the identifier text.  The first character is accepted
 * unconditionally; the identifier then extends as far as lex_is_idn() allows.
 * Returns a pointer just past the identifier. */
static const char *
lex_parse_id(const char *p, struct lex_token *token)
{
    const char *end = p + 1;

    while (lex_is_idn(*end)) {
        end++;
    }

    token->s = xmemdup0(p, end - p);
    token->type = LEX_T_ID;
    return end;
}

/* Initializes 'token' and parses the first token from the beginning of
 * null-terminated string 'p' into 'token'.  Stores a pointer to the start of
 * the token (after skipping white space and comments, if any) into '*startp'.
 * Returns the character position at which to begin parsing the next token.
 *
 * Lexical errors are reported in-band: 'token' becomes a LEX_T_ERROR whose
 * string describes the problem.  At end of input 'token' is a LEX_T_END and
 * the return value points to the terminating null. */
const char *
lex_token_parse(struct lex_token *token, const char *p, const char **startp)
{
    lex_token_init(token);

next:
    *startp = p;
    switch (*p) {
    case '\0':
        token->type = LEX_T_END;
        return p;

    case ' ': case '\t': case '\n': case '\r':
        p++;
        goto next;

    case '/':
        p++;
        if (*p == '/') {
            /* Line comment: skip to end of line or end of input. */
            do {
                p++;
            } while (*p != '\0' && *p != '\n');
            goto next;
        } else if (*p == '*') {
            /* Block comment.  It must be closed before the end of the line
             * on which it was opened. */
            p++;
            for (;;) {
                if (*p == '*' && p[1] == '/') {
                    p += 2;
                    goto next;
                } else if (*p == '\0' || *p == '\n') {
                    lex_error(token, "`/*' without matching `*/'.");
                    return p;
                } else {
                    p++;
                }
            }
        } else {
            lex_error(token,
                      "`/' is only valid as part of `//' or `/*'.");
        }
        break;

    case '(':
        token->type = LEX_T_LPAREN;
        p++;
        break;

    case ')':
        token->type = LEX_T_RPAREN;
        p++;
        break;

    case '{':
        token->type = LEX_T_LCURLY;
        p++;
        break;

    case '}':
        token->type = LEX_T_RCURLY;
        p++;
        break;

    case '[':
        token->type = LEX_T_LSQUARE;
        p++;
        break;

    case ']':
        token->type = LEX_T_RSQUARE;
        p++;
        break;

    case '=':
        p++;
        if (*p == '=') {
            token->type = LEX_T_EQ;
            p++;
        } else {
            token->type = LEX_T_EQUALS;
        }
        break;

    case '!':
        p++;
        if (*p == '=') {
            token->type = LEX_T_NE;
            p++;
        } else {
            token->type = LEX_T_LOG_NOT;
        }
        break;

    case '&':
        p++;
        if (*p == '&') {
            token->type = LEX_T_LOG_AND;
            p++;
        } else {
            lex_error(token, "`&' is only valid as part of `&&'.");
        }
        break;

    case '|':
        p++;
        if (*p == '|') {
            token->type = LEX_T_LOG_OR;
            p++;
        } else {
            lex_error(token, "`|' is only valid as part of `||'.");
        }
        break;

    case '<':
        p++;
        if (*p == '=') {
            token->type = LEX_T_LE;
            p++;
        } else if (*p == '-' && p[1] == '>') {
            token->type = LEX_T_EXCHANGE;
            p += 2;
        } else {
            token->type = LEX_T_LT;
        }
        break;

    case '>':
        p++;
        if (*p == '=') {
            token->type = LEX_T_GE;
            p++;
        } else {
            token->type = LEX_T_GT;
        }
        break;

    case '.':
        p++;
        if (*p == '.') {
            token->type = LEX_T_ELLIPSIS;
            p++;
        } else {
            lex_error(token, "`.' is only valid as part of `..' or a number.");
        }
        break;

    case ',':
        p++;
        token->type = LEX_T_COMMA;
        break;

    case ';':
        p++;
        token->type = LEX_T_SEMICOLON;
        break;

    case '-':
        p++;
        if (*p == '-') {
            token->type = LEX_T_DECREMENT;
            p++;
        } else {
            lex_error(token, "`-' is only valid as part of `--'.");
        }
        break;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case ':':
        p = lex_parse_integer(p, token);
        break;

    case '"':
        p = lex_parse_string(p, token);
        break;

    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        /* We need to distinguish an Ethernet address or IPv6 address from an
         * identifier.  Fortunately, Ethernet addresses and IPv6 addresses that
         * are ambiguous based on the first character, always start with hex
         * digits followed by a colon, but identifiers never do. */
        p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
             ? lex_parse_integer(p, token)
             : lex_parse_id(p, token));
        break;

    default:
        if (lex_is_id1(*p)) {
            p = lex_parse_id(p, token);
        } else {
            if (isprint((unsigned char) *p)) {
                lex_error(token, "Invalid character `%c' in input.", *p);
            } else {
                /* Report the byte in hexadecimal, to match the "0x" prefix,
                 * and as unsigned, so that bytes above 0x7f do not print as
                 * negative numbers where 'char' is signed. */
                lex_error(token, "Invalid byte 0x%02x in input.",
                          (unsigned int) (unsigned char) *p);
            }
            p++;
        }
        break;
    }

    return p;
}
\f
/* Prepares 'lexer' to tokenize 'input'.
 *
 * 'lexer' keeps a pointer to 'input' rather than a copy, so 'input' must
 * remain valid for as long as 'lexer' is in use.  The caller retains
 * ownership of 'input'.
 *
 * No token is read here: the caller must call lexer_get() to obtain the
 * first token. */
void
lexer_init(struct lexer *lexer, const char *input)
{
    lex_token_init(&lexer->token);
    lexer->start = NULL;
    lexer->input = input;
}

/* Releases the storage held by 'lexer', namely its current token.  'lexer'
 * itself and the input string are not freed. */
void
lexer_destroy(struct lexer *lexer)
{
    struct lex_token *current = &lexer->token;

    lex_token_destroy(current);
}

/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
 * token's type.  The caller may examine 'lexer->token' directly to obtain full
 * information about the token. */
enum lex_type
lexer_get(struct lexer *lexer)
{
    lex_token_destroy(&lexer->token);
    lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
    return lexer->token.type;
}

/* Returns the type of the next token that will be fetched by lexer_get(),
 * without advancing 'lexer->token' to that token. */
enum lex_type
lexer_lookahead(const struct lexer *lexer)
{
    struct lex_token next;
    enum lex_type type;
    const char *start;

    lex_token_parse(&next, lexer->input, &start);
    type = next.type;
    lex_token_destroy(&next);
    return type;
}

/* Consumes the current token of 'lexer' if it has the given 'type'.  Returns
 * true, after advancing to the next token, on a match; returns false and
 * leaves 'lexer' unchanged otherwise. */
bool
lexer_match(struct lexer *lexer, enum lex_type type)
{
    if (lexer->token.type != type) {
        return false;
    }

    lexer_get(lexer);
    return true;
}

/* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
 * to the next token and returns true.  Otherwise returns false.  */
bool
lexer_match_id(struct lexer *lexer, const char *id)
{
    if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
        lexer_get(lexer);
        return true;
    } else {
        return false;
    }
}

bool
lexer_is_int(const struct lexer *lexer)
{
    return (lexer->token.type == LEX_T_INTEGER
            && lexer->token.format == LEX_F_DECIMAL
            && ntohll(lexer->token.value.integer) <= INT_MAX);
}

/* If the current token of 'lexer' satisfies lexer_is_int(), stores its value
 * in '*value', advances 'lexer' to the next token, and returns true.
 * Otherwise stores 0 in '*value', leaves 'lexer' unchanged, and returns
 * false. */
bool
lexer_get_int(struct lexer *lexer, int *value)
{
    if (!lexer_is_int(lexer)) {
        *value = 0;
        return false;
    }

    *value = ntohll(lexer->token.value.integer);
    lexer_get(lexer);
    return true;
}
Commit	Line	Data
10b1662b BP	1	/*
	2	* Copyright (c) 2015 Nicira, Inc.
	3	*
	4	* Licensed under the Apache License, Version 2.0 (the "License");
	5	* you may not use this file except in compliance with the License.
	6	* You may obtain a copy of the License at:
	7	*
	8	* http://www.apache.org/licenses/LICENSE-2.0
	9	*
	10	* Unless required by applicable law or agreed to in writing, software
	11	* distributed under the License is distributed on an "AS IS" BASIS,
	12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	13	* See the License for the specific language governing permissions and
	14	* limitations under the License.
	15	*/
	16
	17	#include <config.h>
	18	#include "lex.h"
	19	#include <ctype.h>
	20	#include <errno.h>
	21	#include <stdarg.h>
	22	#include "dynamic-string.h"
	23	#include "json.h"
	24	#include "util.h"
363b5330 BP	25	\f
	26	/* Returns a string that represents 'format'. */
	27	const char *
	28	lex_format_to_string(enum lex_format format)
	29	{
	30	switch (format) {
	31	case LEX_F_DECIMAL:
	32	return "decimal";
	33	case LEX_F_HEXADECIMAL:
	34	return "hexadecimal";
	35	case LEX_F_IPV4:
	36	return "IPv4";
	37	case LEX_F_IPV6:
	38	return "IPv6";
	39	case LEX_F_ETHERNET:
	40	return "Ethernet";
	41	default:
	42	abort();
	43	}
	44	}
	45	\f
10b1662b BP	46	/* Initializes 'token'. */
	47	void
	48	lex_token_init(struct lex_token *token)
	49	{
	50	token->type = LEX_T_END;
	51	token->s = NULL;
	52	}
	53
	54	/* Frees memory owned by 'token'. */
	55	void
	56	lex_token_destroy(struct lex_token *token)
	57	{
	58	free(token->s);
	59	}
	60
	61	/* Exchanges 'a' and 'b'. */
	62	void
	63	lex_token_swap(struct lex_token a, struct lex_token b)
	64	{
	65	struct lex_token tmp = *a;
	66	a = b;
	67	*b = tmp;
	68	}
	69	\f
	70	/* lex_token_format(). */
	71
	72	static size_t
	73	lex_token_n_zeros(enum lex_format format)
	74	{
	75	switch (format) {
	76	case LEX_F_DECIMAL: return offsetof(union mf_subvalue, integer);
	77	case LEX_F_HEXADECIMAL: return 0;
	78	case LEX_F_IPV4: return offsetof(union mf_subvalue, ipv4);
	79	case LEX_F_IPV6: return offsetof(union mf_subvalue, ipv6);
	80	case LEX_F_ETHERNET: return offsetof(union mf_subvalue, mac);
	81	default: OVS_NOT_REACHED();
	82	}
	83	}
	84
	85	/* Returns the effective format for 'token', that is, the format in which it
	86	* should actually be printed. This is ordinarily the same as 'token->format',
	87	* but it's always possible that someone sets up a token with a format that
	88	* won't work for a value, e.g. 'token->value' is wider than 32 bits but the
	89	* format is LEX_F_IPV4. (The lexer itself won't do that; this is an attempt
	90	* to avoid confusion in the future.) */
	91	static enum lex_format
	92	lex_token_get_format(const struct lex_token *token)
	93	{
	94	size_t n_zeros = lex_token_n_zeros(token->format);
	95	return (is_all_zeros(&token->value, n_zeros)
	96	&& (token->type != LEX_T_MASKED_INTEGER
	97	\|\| is_all_zeros(&token->mask, n_zeros))
	98	? token->format
	99	: LEX_F_HEXADECIMAL);
	100	}
	101
	102	static void
	103	lex_token_format_value(const union mf_subvalue *value,
	104	enum lex_format format, struct ds *s)
	105	{
	106	switch (format) {
	107	case LEX_F_DECIMAL:
	108	ds_put_format(s, "%"PRIu64, ntohll(value->integer));
	109	break;
110
111	case LEX_F_HEXADECIMAL:
112	mf_format_subvalue(value, s);
113	break;
114
115	case LEX_F_IPV4:
116	ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
117	break;
118
119	case LEX_F_IPV6:
120	print_ipv6_addr(s, &value->ipv6);
121	break;
122
123	case LEX_F_ETHERNET:
124	ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
125	break;
126
127	default:
128	OVS_NOT_REACHED();
129	}
130
131	}
132
133	static void
134	lex_token_format_masked_integer(const struct lex_token token, struct ds s)
135	{
136	enum lex_format format = lex_token_get_format(token);
137
138	lex_token_format_value(&token->value, format, s);
139	ds_put_char(s, '/');
140
141	const union mf_subvalue *mask = &token->mask;
142	if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
143	ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
144	} else if (token->format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
145	ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
146	} else {
147	lex_token_format_value(&token->mask, format, s);
148	}
149	}
150
10b1662b BP	151	/* Appends a string representation of 'token' to 's', in a format that can be
	152	* losslessly parsed back by the lexer. (LEX_T_END and LEX_T_ERROR can't be
	153	* parsed back.) */
	154	void
3d611299	155	lex_token_format(const struct lex_token token, struct ds s)
10b1662b BP	156	{
	157	switch (token->type) {
	158	case LEX_T_END:
	159	ds_put_cstr(s, "$");
	160	break;
	161
	162	case LEX_T_ID:
	163	ds_put_cstr(s, token->s);
	164	break;
	165
	166	case LEX_T_ERROR:
	167	ds_put_cstr(s, "error(");
3b626771	168	json_string_escape(token->s, s);
10b1662b BP	169	ds_put_char(s, ')');
	170	break;
	171
	172	case LEX_T_STRING:
3b626771	173	json_string_escape(token->s, s);
10b1662b BP	174	break;
	175
	176	case LEX_T_INTEGER:
	177	lex_token_format_value(&token->value, lex_token_get_format(token), s);
	178	break;
	179
	180	case LEX_T_MASKED_INTEGER:
	181	lex_token_format_masked_integer(token, s);
	182	break;
	183
	184	case LEX_T_LPAREN:
	185	ds_put_cstr(s, "(");
	186	break;
	187	case LEX_T_RPAREN:
	188	ds_put_cstr(s, ")");
	189	break;
	190	case LEX_T_LCURLY:
	191	ds_put_cstr(s, "{");
	192	break;
	193	case LEX_T_RCURLY:
	194	ds_put_cstr(s, "}");
	195	break;
	196	case LEX_T_LSQUARE:
	197	ds_put_cstr(s, "[");
	198	break;
	199	case LEX_T_RSQUARE:
	200	ds_put_cstr(s, "]");
	201	break;
	202	case LEX_T_EQ:
	203	ds_put_cstr(s, "==");
	204	break;
	205	case LEX_T_NE:
	206	ds_put_cstr(s, "!=");
	207	break;
	208	case LEX_T_LT:
	209	ds_put_cstr(s, "<");
	210	break;
	211	case LEX_T_LE:
	212	ds_put_cstr(s, "<=");
	213	break;
	214	case LEX_T_GT:
	215	ds_put_cstr(s, ">");
	216	break;
	217	case LEX_T_GE:
	218	ds_put_cstr(s, ">=");
	219	break;
	220	case LEX_T_LOG_NOT:
	221	ds_put_cstr(s, "!");
	222	break;
	223	case LEX_T_LOG_AND:
	224	ds_put_cstr(s, "&&");
	225	break;
	226	case LEX_T_LOG_OR:
	227	ds_put_cstr(s, "\|\|");
	228	break;
	229	case LEX_T_ELLIPSIS:
	230	ds_put_cstr(s, "..");
	231	break;
	232	case LEX_T_COMMA:
	233	ds_put_cstr(s, ",");
	234	break;
	235	case LEX_T_SEMICOLON:
	236	ds_put_cstr(s, ";");
	237	break;
238	case LEX_T_EQUALS:
239	ds_put_cstr(s, "=");
240	break;
a20c96c6 BP	241	case LEX_T_EXCHANGE:
	242	ds_put_cstr(s, "<->");
	243	break;
56091efe BP	244	case LEX_T_DECREMENT:
	245	ds_put_cstr(s, "--");
	246	break;
10b1662b BP	247	default:
	248	OVS_NOT_REACHED();
	249	}
	250
	251	}
	252	\f
	253	/* lex_token_parse(). */
	254
	255	static void OVS_PRINTF_FORMAT(2, 3)
	256	lex_error(struct lex_token token, const char message, ...)
	257	{
	258	ovs_assert(!token->s);
	259	token->type = LEX_T_ERROR;
	260
	261	va_list args;
	262	va_start(args, message);
	263	token->s = xvasprintf(message, args);
	264	va_end(args);
	265	}
	266
	267	static void
	268	lex_parse_hex_integer(const char start, size_t len, struct lex_token token)
	269	{
	270	const char *in = start + (len - 1);
	271	uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
	272
	273	for (int i = 0; i < len; i++) {
	274	int hexit = hexit_value(in[-i]);
	275	if (hexit < 0) {
	276	lex_error(token, "Invalid syntax in hexadecimal constant.");
	277	return;
	278	}
	279	if (hexit && i / 2 >= sizeof token->value.u8) {
	280	lex_error(token, "Hexadecimal constant requires more than "
	281	"%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
	282	return;
	283	}
	284	out[-(i / 2)] \|= i % 2 ? hexit << 4 : hexit;
	285	}
	286	token->format = LEX_F_HEXADECIMAL;
	287	}
	288
	289	static const char *
	290	lex_parse_integer__(const char p, struct lex_token token)
	291	{
	292	lex_token_init(token);
	293	token->type = LEX_T_INTEGER;
	294	memset(&token->value, 0, sizeof token->value);
	295	const char *start = p;
	296	const char *end = start;
	297	while (isalnum((unsigned char) end) \|\| end == ':'
	298	\|\| (*end == '.' && end[1] != '.')) {
	299	end++;
	300	}
	301	size_t len = end - start;
	302
	303	int n;
74ff3298	304	struct eth_addr mac;
10b1662b BP	305
	306	if (!len) {
	307	lex_error(token, "Integer constant expected.");
	308	} else if (len == 17
	309	&& ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
	310	ETH_ADDR_SCAN_ARGS(mac), &n)
	311	&& n == len) {
74ff3298	312	token->value.mac = mac;
10b1662b BP	313	token->format = LEX_F_ETHERNET;
	314	} else if (start + strspn(start, "0123456789") == end) {
	315	if (p[0] == '0' && len > 1) {
	316	lex_error(token, "Decimal constants must not have leading zeros.");
	317	} else {
	318	unsigned long long int integer;
	319	char *tail;
	320
	321	errno = 0;
	322	integer = strtoull(p, &tail, 10);
	323	if (tail != end \|\| errno == ERANGE) {
	324	lex_error(token, "Decimal constants must be less than 2**64.");
	325	} else {
	326	token->value.integer = htonll(integer);
	327	token->format = LEX_F_DECIMAL;
	328	}
	329	}
	330	} else if (p[0] == '0' && (p[1] == 'x' \|\| p[1] == 'X')) {
	331	if (len > 2) {
	332	lex_parse_hex_integer(start + 2, len - 2, token);
	333	} else {
	334	lex_error(token, "Hex digits expected following 0%c.", p[1]);
	335	}
	336	} else if (len < INET6_ADDRSTRLEN) {
	337	char copy[INET6_ADDRSTRLEN];
	338	memcpy(copy, p, len);
	339	copy[len] = '\0';
	340
	341	struct in_addr ipv4;
	342	struct in6_addr ipv6;
	343	if (inet_pton(AF_INET, copy, &ipv4) == 1) {
	344	token->value.ipv4 = ipv4.s_addr;
	345	token->format = LEX_F_IPV4;
	346	} else if (inet_pton(AF_INET6, copy, &ipv6) == 1) {
	347	token->value.ipv6 = ipv6;
	348	token->format = LEX_F_IPV6;
	349	} else {
	350	lex_error(token, "Invalid numeric constant.");
	351	}
	352	} else {
	353	lex_error(token, "Invalid numeric constant.");
	354	}
	355
	356	ovs_assert(token->type == LEX_T_INTEGER \|\| token->type == LEX_T_ERROR);
	357	return end;
	358	}
	359
	360	static const char *
	361	lex_parse_mask(const char p, struct lex_token token)
	362	{
	363	struct lex_token mask;
	364
	365	/* Parse just past the '/' as a second integer. Handle errors. */
	366	p = lex_parse_integer__(p + 1, &mask);
	367	if (mask.type == LEX_T_ERROR) {
	368	lex_token_swap(&mask, token);
	369	lex_token_destroy(&mask);
	370	return p;
	371	}
	372	ovs_assert(mask.type == LEX_T_INTEGER);
	373
	374	/* Now convert the value and mask into a masked integer token.
	375	* We have a few special cases. */
	376	token->type = LEX_T_MASKED_INTEGER;
377	memset(&token->mask, 0, sizeof token->mask);
378	uint32_t prefix_bits = ntohll(mask.value.integer);
379	if (token->format == mask.format) {
380	/* Same format value and mask is always OK. */
381	token->mask = mask.value;
382	} else if (token->format == LEX_F_IPV4
383	&& mask.format == LEX_F_DECIMAL
384	&& prefix_bits <= 32) {
385	/* IPv4 address with decimal mask is a CIDR prefix. */
386	token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
387	} else if (token->format == LEX_F_IPV6
388	&& mask.format == LEX_F_DECIMAL
389	&& prefix_bits <= 128) {
390	/* IPv6 address with decimal mask is a CIDR prefix. */
391	token->mask.ipv6 = ipv6_create_mask(prefix_bits);
392	} else if (token->format == LEX_F_DECIMAL
393	&& mask.format == LEX_F_HEXADECIMAL
394	&& token->value.integer == 0) {
395	/* Special case for e.g. 0/0x1234. */
396	token->format = LEX_F_HEXADECIMAL;
397	token->mask = mask.value;
398	} else {
399	lex_error(token, "Value and mask have incompatible formats.");
400	return p;
401	}
402
403	/* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
404	* mask. */
405	for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
406	ovs_be32 v = token->value.be32[i];
407	ovs_be32 m = token->mask.be32[i];
408
409	if (v & ~m) {
410	lex_error(token, "Value contains unmasked 1-bits.");
411	break;
412	}
413	}
414
415	/* Done! */
416	lex_token_destroy(&mask);
417	return p;
418	}
419
420	static const char *
421	lex_parse_integer(const char p, struct lex_token token)
422	{
423	p = lex_parse_integer__(p, token);
424	if (token->type == LEX_T_INTEGER && *p == '/') {
425	p = lex_parse_mask(p, token);
426	}
427	return p;
428	}
429
430	static const char *
431	lex_parse_string(const char p, struct lex_token token)
432	{
433	const char *start = ++p;
434	for (;;) {
435	switch (*p) {
436	case '\0':
437	lex_error(token, "Input ends inside quoted string.");
438	return p;
439
440	case '"':
441	token->type = (json_string_unescape(start, p - start, &token->s)
442	? LEX_T_STRING : LEX_T_ERROR);
443	return p + 1;
444
445	case '\\':
446	p++;
447	if (*p) {
448	p++;
449	}
450	break;
451
452	default:
453	p++;
454	break;
455	}
456	}
457	}
458
/* Returns true if 'c' may be the first character of an identifier: an ASCII
 * letter, '_', or '.'. */
static bool
lex_is_id1(unsigned char c)
{
    return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
            || c == '_' || c == '.');
}
465
/* Returns true if 'c' may appear in an identifier after its first character:
 * anything accepted by lex_is_id1(), or an ASCII digit. */
static bool
lex_is_idn(unsigned char c)
{
    return lex_is_id1(c) || (c >= '0' && c <= '9');
}
471
472	static const char *
473	lex_parse_id(const char p, struct lex_token token)
474	{
475	const char *start = p;
476
477	do {
478	p++;
479	} while (lex_is_idn(*p));
480
481	token->type = LEX_T_ID;
482	token->s = xmemdup0(start, p - start);
483	return p;
484	}
485
486	/* Initializes 'token' and parses the first token from the beginning of
487	* null-terminated string 'p' into 'token'. Stores a pointer to the start of
488	* the token (after skipping white space and comments, if any) into '*startp'.
489	* Returns the character position at which to begin parsing the next token. */
490	const char *
491	lex_token_parse(struct lex_token token, const char p, const char **startp)
492	{
493	lex_token_init(token);
494
495	next:
496	*startp = p;
497	switch (*p) {
498	case '\0':
499	token->type = LEX_T_END;
500	return p;
501
502	case ' ': case '\t': case '\n': case '\r':
503	p++;
504	goto next;
505
506	case '/':
507	p++;
508	if (*p == '/') {
509	do {
510	p++;
511	} while (p != '\0' && p != '\n');
512	goto next;
513	} else if (p == '') {
514	p++;
515	for (;;) {
516	if (p == '' && p[1] == '/') {
517	p += 2;
518	goto next;
519	} else if (p == '\0' \|\| p == '\n') {
520	lex_error(token, "`/' without matching `/'.");
521	return p;
522	} else {
523	p++;
524	}
525	}
526	goto next;
527	} else {
528	lex_error(token,
529	"`/' is only valid as part of `//' or `/*'.");
530	}
531	break;
532
533	case '(':
534	token->type = LEX_T_LPAREN;
535	p++;
536	break;
537
538	case ')':
539	token->type = LEX_T_RPAREN;
540	p++;
541	break;
542
543	case '{':
544	token->type = LEX_T_LCURLY;
545	p++;
546	break;
547
548	case '}':
549	token->type = LEX_T_RCURLY;
550	p++;
551	break;
552
553	case '[':
554	token->type = LEX_T_LSQUARE;
555	p++;
556	break;
557
558	case ']':
559	token->type = LEX_T_RSQUARE;
560	p++;
561	break;
562
563	case '=':
564	p++;
565	if (*p == '=') {
566	token->type = LEX_T_EQ;
567	p++;
568	} else {
569	token->type = LEX_T_EQUALS;
570	}
571	break;
572
573	case '!':
574	p++;
575	if (*p == '=') {
576	token->type = LEX_T_NE;
577	p++;
578	} else {
579	token->type = LEX_T_LOG_NOT;
580	}
581	break;
582
583	case '&':
584	p++;
585	if (*p == '&') {
586	token->type = LEX_T_LOG_AND;
587	p++;
588	} else {
589	lex_error(token, "`&' is only valid as part of `&&'.");
590	}
591	break;
592
593	case '\|':
594	p++;
595	if (*p == '\|') {
596	token->type = LEX_T_LOG_OR;
597	p++;
598	} else {
599	lex_error(token, "`\|' is only valid as part of `\|\|'.");
600	}
601	break;
602
603	case '<':
604	p++;
605	if (*p == '=') {
606	token->type = LEX_T_LE;
607	p++;
a20c96c6 BP	608	} else if (*p == '-' && p[1] == '>') {
	609	token->type = LEX_T_EXCHANGE;
	610	p += 2;
10b1662b BP	611	} else {
	612	token->type = LEX_T_LT;
	613	}
	614	break;
	615
	616	case '>':
	617	p++;
	618	if (*p == '=') {
	619	token->type = LEX_T_GE;
	620	p++;
	621	} else {
	622	token->type = LEX_T_GT;
	623	}
	624	break;
	625
	626	case '.':
	627	p++;
	628	if (*p == '.') {
	629	token->type = LEX_T_ELLIPSIS;
	630	p++;
	631	} else {
	632	lex_error(token, "`.' is only valid as part of `..' or a number.");
	633	}
	634	break;
	635
	636	case ',':
	637	p++;
	638	token->type = LEX_T_COMMA;
	639	break;
	640
	641	case ';':
	642	p++;
	643	token->type = LEX_T_SEMICOLON;
	644	break;
	645
56091efe BP	646	case '-':
	647	p++;
	648	if (*p == '-') {
	649	token->type = LEX_T_DECREMENT;
	650	p++;
	651	} else {
	652	lex_error(token, "`-' is only valid as part of `--'.");
	653	}
	654	break;
	655
10b1662b BP	656	case '0': case '1': case '2': case '3': case '4':
	657	case '5': case '6': case '7': case '8': case '9':
	658	case ':':
	659	p = lex_parse_integer(p, token);
	660	break;
	661
	662	case '"':
	663	p = lex_parse_string(p, token);
	664	break;
	665
	666	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	667	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	668	/* We need to distinguish an Ethernet address or IPv6 address from an
	669	* identifier. Fortunately, Ethernet addresses and IPv6 addresses that
	670	* are ambiguous based on the first character, always start with hex
	671	* digits followed by a colon, but identifiers never do. */
	672	p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
	673	? lex_parse_integer(p, token)
	674	: lex_parse_id(p, token));
	675	break;
	676
	677	default:
	678	if (lex_is_id1(*p)) {
	679	p = lex_parse_id(p, token);
	680	} else {
	681	if (isprint((unsigned char) *p)) {
	682	lex_error(token, "Invalid character `%c' in input.", *p);
	683	} else {
	684	lex_error(token, "Invalid byte 0x%d in input.", *p);
	685	}
	686	p++;
	687	}
	688	break;
	689	}
	690
	691	return p;
	692	}
	693	\f
	694	/* Initializes 'lexer' for parsing 'input'.
	695	*
	696	* While the lexer is in use, 'input' must remain available, but the caller
	697	* otherwise retains ownership of 'input'.
	698	*
	699	* The caller must call lexer_get() to obtain the first token. */
	700	void
	701	lexer_init(struct lexer lexer, const char input)
	702	{
	703	lexer->input = input;
	704	lexer->start = NULL;
	705	lex_token_init(&lexer->token);
	706	}
	707
	708	/* Frees storage associated with 'lexer'. */
	709	void
	710	lexer_destroy(struct lexer *lexer)
	711	{
	712	lex_token_destroy(&lexer->token);
	713	}
	714
	715	/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
	716	* token's type. The caller may examine 'lexer->token' directly to obtain full
	717	* information about the token. */
	718	enum lex_type
	719	lexer_get(struct lexer *lexer)
720	{
721	lex_token_destroy(&lexer->token);
722	lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
723	return lexer->token.type;
724	}
725
27912fdb BP	726	/* Returns the type of the next token that will be fetched by lexer_get(),
	727	* without advancing 'lexer->token' to that token. */
	728	enum lex_type
	729	lexer_lookahead(const struct lexer *lexer)
	730	{
	731	struct lex_token next;
	732	enum lex_type type;
	733	const char *start;
	734
	735	lex_token_parse(&next, lexer->input, &start);
	736	type = next.type;
	737	lex_token_destroy(&next);
	738	return type;
	739	}
	740
10b1662b BP	741	/* If 'lexer''s current token has the given 'type', advances 'lexer' to the
	742	* next token and returns true. Otherwise returns false. */
	743	bool
	744	lexer_match(struct lexer *lexer, enum lex_type type)
	745	{
	746	if (lexer->token.type == type) {
	747	lexer_get(lexer);
	748	return true;
	749	} else {
	750	return false;
	751	}
	752	}
27912fdb BP	753
	754	/* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
	755	* to the next token and returns true. Otherwise returns false. */
	756	bool
	757	lexer_match_id(struct lexer lexer, const char id)
	758	{
	759	if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
	760	lexer_get(lexer);
	761	return true;
	762	} else {
	763	return false;
	764	}
	765	}
558ec83d BP	766
	767	bool
	768	lexer_is_int(const struct lexer *lexer)
	769	{
	770	return (lexer->token.type == LEX_T_INTEGER
	771	&& lexer->token.format == LEX_F_DECIMAL
	772	&& ntohll(lexer->token.value.integer) <= INT_MAX);
	773	}
	774
	775	bool
	776	lexer_get_int(struct lexer lexer, int value)
	777	{
	778	if (lexer_is_int(lexer)) {
	779	*value = ntohll(lexer->token.value.integer);
	780	lexer_get(lexer);
	781	return true;
	782	} else {
	783	*value = 0;
	784	return false;
	785	}
	786	}