[ovs.git] / ovn / lib / lex.c

/*
 * Copyright (c) 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "lex.h"
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include "dynamic-string.h"
#include "json.h"
#include "util.h"
\f
/* Maps 'format' to a human-readable name.  The result is a string literal, so
 * the caller must neither modify nor free it.  Aborts if 'format' is not one
 * of the LEX_F_* values listed below. */
const char *
lex_format_to_string(enum lex_format format)
{
    const char *name;

    switch (format) {
    case LEX_F_DECIMAL:
        name = "decimal";
        break;
    case LEX_F_HEXADECIMAL:
        name = "hexadecimal";
        break;
    case LEX_F_IPV4:
        name = "IPv4";
        break;
    case LEX_F_IPV6:
        name = "IPv6";
        break;
    case LEX_F_ETHERNET:
        name = "Ethernet";
        break;
    default:
        abort();
    }
    return name;
}
\f
/* Puts 'token' into its initial state: type LEX_T_END with no string
 * attached.  No other member of 'token' is written. */
void
lex_token_init(struct lex_token *token)
{
    token->s = NULL;
    token->type = LEX_T_END;
}

/* Frees memory owned by 'token'. */
void
lex_token_destroy(struct lex_token *token)
{
    free(token->s);
}

/* Exchanges 'a' and 'b'. */
void
lex_token_swap(struct lex_token *a, struct lex_token *b)
{
    struct lex_token tmp = *a;
    *a = *b;
    *b = tmp;
}
\f
/* lex_token_format(). */

/* Returns the byte offset, within union mf_subvalue, of the member that holds
 * a value of the given 'format'.  lex_token_get_format() requires that many
 * leading bytes to be zero before it will print a token in 'format'. */
static size_t
lex_token_n_zeros(enum lex_format format)
{
    size_t n_zeros;

    switch (format) {
    case LEX_F_DECIMAL:
        n_zeros = offsetof(union mf_subvalue, integer);
        break;
    case LEX_F_HEXADECIMAL:
        n_zeros = 0;
        break;
    case LEX_F_IPV4:
        n_zeros = offsetof(union mf_subvalue, ipv4);
        break;
    case LEX_F_IPV6:
        n_zeros = offsetof(union mf_subvalue, ipv6);
        break;
    case LEX_F_ETHERNET:
        n_zeros = offsetof(union mf_subvalue, mac);
        break;
    default:
        OVS_NOT_REACHED();
    }
    return n_zeros;
}

/* Returns the format in which 'token' should actually be printed.
 *
 * Normally that is just 'token->format'.  However, if the value (or, for a
 * LEX_T_MASKED_INTEGER, the mask) has nonzero bytes ahead of the member that
 * the format uses, e.g. a value wider than 32 bits labeled LEX_F_IPV4, the
 * format cannot represent it and LEX_F_HEXADECIMAL is returned instead.  (The
 * lexer itself won't produce such a token; this guards against tokens set up
 * by other code.) */
static enum lex_format
lex_token_get_format(const struct lex_token *token)
{
    size_t n_zeros = lex_token_n_zeros(token->format);

    if (!is_all_zeros(&token->value, n_zeros)) {
        return LEX_F_HEXADECIMAL;
    }
    if (token->type == LEX_T_MASKED_INTEGER
        && !is_all_zeros(&token->mask, n_zeros)) {
        return LEX_F_HEXADECIMAL;
    }
    return token->format;
}

/* Appends 'value' to 's', rendered according to 'format'.  Each format reads
 * only the member of 'value' that corresponds to it, except
 * LEX_F_HEXADECIMAL, which hands the whole subvalue to
 * mf_format_subvalue(). */
static void
lex_token_format_value(const union mf_subvalue *value,
                       enum lex_format format, struct ds *s)
{
    switch (format) {
    case LEX_F_DECIMAL:
        /* The integer is stored in network byte order. */
        ds_put_format(s, "%"PRIu64, ntohll(value->integer));
        return;

    case LEX_F_HEXADECIMAL:
        mf_format_subvalue(value, s);
        return;

    case LEX_F_IPV4:
        ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
        return;

    case LEX_F_IPV6:
        print_ipv6_addr(s, &value->ipv6);
        return;

    case LEX_F_ETHERNET:
        ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
        return;

    default:
        OVS_NOT_REACHED();
    }
}

/* Appends masked-integer 'token' to 's' as "value/mask".
 *
 * The value is printed in the token's effective format (see
 * lex_token_get_format()).  The mask is printed as a CIDR prefix length when
 * the effective format is IPv4 or IPv6 and the mask is a CIDR mask; otherwise
 * it is printed in the same format as the value.
 *
 * Both CIDR checks use the effective format, not 'token->format'.  If the
 * token's nominal format is IPv6 but it has to be printed as hexadecimal,
 * emitting a prefix length would yield a hexadecimal value with a decimal
 * mask, which the lexer rejects unless the value is zero. */
static void
lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
{
    enum lex_format format = lex_token_get_format(token);

    lex_token_format_value(&token->value, format, s);
    ds_put_char(s, '/');

    const union mf_subvalue *mask = &token->mask;
    if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
        ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
    } else if (format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
        ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
    } else {
        lex_token_format_value(mask, format, s);
    }
}

/* Returns the fixed spelling of 'token' if its type is always written the same
 * way (punctuation, operators, and the "$" end marker).  Returns NULL for
 * every other type, whose text depends on the token's contents. */
static const char *
lex_token_fixed_text(const struct lex_token *token)
{
    switch (token->type) {
    case LEX_T_END:       return "$";
    case LEX_T_LPAREN:    return "(";
    case LEX_T_RPAREN:    return ")";
    case LEX_T_LCURLY:    return "{";
    case LEX_T_RCURLY:    return "}";
    case LEX_T_LSQUARE:   return "[";
    case LEX_T_RSQUARE:   return "]";
    case LEX_T_EQ:        return "==";
    case LEX_T_NE:        return "!=";
    case LEX_T_LT:        return "<";
    case LEX_T_LE:        return "<=";
    case LEX_T_GT:        return ">";
    case LEX_T_GE:        return ">=";
    case LEX_T_LOG_NOT:   return "!";
    case LEX_T_LOG_AND:   return "&&";
    case LEX_T_LOG_OR:    return "||";
    case LEX_T_ELLIPSIS:  return "..";
    case LEX_T_COMMA:     return ",";
    case LEX_T_SEMICOLON: return ";";
    case LEX_T_EQUALS:    return "=";
    default:              return NULL;
    }
}

/* Appends to 's' a textual form of 'token' that the lexer can parse back
 * without loss.  (The forms written for LEX_T_END and LEX_T_ERROR are the
 * exceptions: they can't be parsed back.) */
void
lex_token_format(const struct lex_token *token, struct ds *s)
{
    const char *fixed = lex_token_fixed_text(token);
    if (fixed) {
        ds_put_cstr(s, fixed);
        return;
    }

    switch (token->type) {
    case LEX_T_ID:
        ds_put_cstr(s, token->s);
        return;

    case LEX_T_ERROR:
        ds_put_cstr(s, "error(");
        json_string_escape(token->s, s);
        ds_put_char(s, ')');
        return;

    case LEX_T_STRING:
        json_string_escape(token->s, s);
        return;

    case LEX_T_INTEGER:
        lex_token_format_value(&token->value, lex_token_get_format(token), s);
        return;

    case LEX_T_MASKED_INTEGER:
        lex_token_format_masked_integer(token, s);
        return;

    default:
        OVS_NOT_REACHED();
    }
}
\f
/* lex_token_parse(). */

/* Turns 'token' into a LEX_T_ERROR token whose string is the printf-style
 * 'message' formatted with the remaining arguments.  'token' must not already
 * own a string. */
static void OVS_PRINTF_FORMAT(2, 3)
lex_error(struct lex_token *token, const char *message, ...)
{
    va_list ap;

    ovs_assert(!token->s);

    va_start(ap, message);
    token->s = xvasprintf(message, ap);
    va_end(ap);

    token->type = LEX_T_ERROR;
}

/* Parses the 'len' hexadecimal digits at 'start' (the text following "0x")
 * into 'token->value', least-significant digit into the last byte of
 * 'token->value.u8'.  Digits are ORed in, so the caller must have zeroed
 * 'token->value' beforehand.
 *
 * On success, sets 'token->format' to LEX_F_HEXADECIMAL.  If a character is
 * not a hex digit, or a nonzero digit does not fit in the value, turns
 * 'token' into an error token instead.
 *
 * Leading zeros beyond the width of the value are accepted and skipped; they
 * must not be stored, because their byte position lies before the start of
 * 'token->value.u8'. */
static void
lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
{
    const size_t n_bytes = sizeof token->value.u8;

    for (size_t i = 0; i < len; i++) {
        /* Walk the digits from least to most significant. */
        int hexit = hexit_value(start[len - 1 - i]);
        if (hexit < 0) {
            lex_error(token, "Invalid syntax in hexadecimal constant.");
            return;
        }

        size_t byte = i / 2;
        if (byte >= n_bytes) {
            if (hexit) {
                lex_error(token, "Hexadecimal constant requires more than "
                          "%"PRIuSIZE" bits.", 8 * n_bytes);
                return;
            }
            /* Excess leading zero: nothing to store. */
            continue;
        }

        /* Even positions fill the low nibble, odd positions the high one. */
        token->value.u8[n_bytes - 1 - byte] |= i % 2 ? hexit << 4 : hexit;
    }
    token->format = LEX_F_HEXADECIMAL;
}

/* Parses a single numeric constant (with no mask) starting at 'p' into
 * 'token', which is initialized here.
 *
 * The constant extends over the longest run of alphanumerics, ':' and '.',
 * where a '.' directly followed by another '.' ends the run.  It is then
 * tried, in order, as an Ethernet address, a decimal integer, a "0x"
 * hexadecimal integer, and an IPv4 or IPv6 address.  On success 'token' is a
 * LEX_T_INTEGER with 'value' and 'format' set; otherwise it is a LEX_T_ERROR.
 *
 * Returns a pointer just past the run of characters, in either case. */
static const char *
lex_parse_integer__(const char *p, struct lex_token *token)
{
    lex_token_init(token);
    token->type = LEX_T_INTEGER;
    memset(&token->value, 0, sizeof token->value);

    /* Find the end of the candidate constant. */
    const char *end = p;
    for (;;) {
        unsigned char c = *end;
        bool in_constant = (isalnum(c) || c == ':'
                            || (c == '.' && end[1] != '.'));
        if (!in_constant) {
            break;
        }
        end++;
    }
    size_t len = end - p;

    int n_scanned;
    uint8_t mac[ETH_ADDR_LEN];

    if (len == 0) {
        lex_error(token, "Integer constant expected.");
    } else if (len == 17
               && ovs_scan(p, ETH_ADDR_SCAN_FMT"%n",
                           ETH_ADDR_SCAN_ARGS(mac), &n_scanned)
               && n_scanned == len) {
        /* Exactly 17 characters that scan as an Ethernet address. */
        memcpy(token->value.mac, mac, sizeof token->value.mac);
        token->format = LEX_F_ETHERNET;
    } else if (p + strspn(p, "0123456789") == end) {
        /* Nothing but decimal digits. */
        if (len > 1 && p[0] == '0') {
            lex_error(token, "Decimal constants must not have leading zeros.");
        } else {
            char *tail;

            errno = 0;
            unsigned long long int parsed = strtoull(p, &tail, 10);
            if (errno == ERANGE || tail != end) {
                lex_error(token, "Decimal constants must be less than 2**64.");
            } else {
                token->value.integer = htonll(parsed);
                token->format = LEX_F_DECIMAL;
            }
        }
    } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
        if (len <= 2) {
            lex_error(token, "Hex digits expected following 0%c.", p[1]);
        } else {
            lex_parse_hex_integer(p + 2, len - 2, token);
        }
    } else if (len < INET6_ADDRSTRLEN) {
        /* inet_pton() needs a null-terminated string, so make a copy. */
        char text[INET6_ADDRSTRLEN];
        memcpy(text, p, len);
        text[len] = '\0';

        struct in_addr ipv4;
        struct in6_addr ipv6;
        if (inet_pton(AF_INET, text, &ipv4) == 1) {
            token->value.ipv4 = ipv4.s_addr;
            token->format = LEX_F_IPV4;
        } else if (inet_pton(AF_INET6, text, &ipv6) == 1) {
            token->value.ipv6 = ipv6;
            token->format = LEX_F_IPV6;
        } else {
            lex_error(token, "Invalid numeric constant.");
        }
    } else {
        lex_error(token, "Invalid numeric constant.");
    }

    ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
    return end;
}

/* Parses the mask that follows an integer constant.  'p' points to the '/'
 * that separates value and mask, and 'token' is the LEX_T_INTEGER already
 * parsed for the value.
 *
 * On success, 'token' becomes a LEX_T_MASKED_INTEGER.  On failure (the mask
 * is not a valid constant, value and mask formats cannot be combined, or the
 * value has 1-bits outside the mask) 'token' becomes a LEX_T_ERROR.
 *
 * Returns a pointer just past the mask text. */
static const char *
lex_parse_mask(const char *p, struct lex_token *token)
{
    struct lex_token mask;

    /* Parse just past the '/' as a second integer.  On error, hand the error
     * token over to the caller's token. */
    p = lex_parse_integer__(p + 1, &mask);
    if (mask.type == LEX_T_ERROR) {
        lex_token_swap(&mask, token);
        lex_token_destroy(&mask);
        return p;
    }
    ovs_assert(mask.type == LEX_T_INTEGER);

    /* Now convert the value and mask into a masked integer token.
     * We have a few special cases. */
    token->type = LEX_T_MASKED_INTEGER;
    memset(&token->mask, 0, sizeof token->mask);

    /* Only meaningful when the mask is decimal.  Kept at full 64-bit width so
     * that an oversized prefix length such as 4294967296 is rejected by the
     * range checks below instead of wrapping around to a small number. */
    uint64_t prefix_bits = ntohll(mask.value.integer);
    if (token->format == mask.format) {
        /* Same format value and mask is always OK. */
        token->mask = mask.value;
    } else if (token->format == LEX_F_IPV4
               && mask.format == LEX_F_DECIMAL
               && prefix_bits <= 32) {
        /* IPv4 address with decimal mask is a CIDR prefix. */
        token->mask.integer
            = htonll(ntohl(be32_prefix_mask((int) prefix_bits)));
    } else if (token->format == LEX_F_IPV6
               && mask.format == LEX_F_DECIMAL
               && prefix_bits <= 128) {
        /* IPv6 address with decimal mask is a CIDR prefix. */
        token->mask.ipv6 = ipv6_create_mask((int) prefix_bits);
    } else if (token->format == LEX_F_DECIMAL
               && mask.format == LEX_F_HEXADECIMAL
               && token->value.integer == 0) {
        /* Special case for e.g. 0/0x1234. */
        token->format = LEX_F_HEXADECIMAL;
        token->mask = mask.value;
    } else {
        lex_error(token, "Value and mask have incompatible formats.");
        lex_token_destroy(&mask);
        return p;
    }

    /* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
     * mask. */
    for (size_t i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
        ovs_be32 v = token->value.be32[i];
        ovs_be32 m = token->mask.be32[i];

        if (v & ~m) {
            lex_error(token, "Value contains unmasked 1-bits.");
            break;
        }
    }

    /* Done! */
    lex_token_destroy(&mask);
    return p;
}

/* Parses a numeric constant at 'p' into 'token', including a "/mask" suffix
 * when one directly follows a successfully parsed value.  Returns a pointer
 * to the first character after what was parsed. */
static const char *
lex_parse_integer(const char *p, struct lex_token *token)
{
    const char *next = lex_parse_integer__(p, token);

    if (*next != '/' || token->type != LEX_T_INTEGER) {
        return next;
    }
    return lex_parse_mask(next, token);
}

/* Parses the quoted string whose opening '"' is at 'p' into 'token'.
 *
 * Scans for the closing '"', treating a backslash plus the character after
 * it as a unit so that an escaped quote does not end the string.  The text
 * between the quotes is passed to json_string_unescape(); 'token' becomes a
 * LEX_T_STRING if that succeeds and a LEX_T_ERROR otherwise.  Input that ends
 * before the closing quote also yields a LEX_T_ERROR.
 *
 * Returns a pointer just past the closing quote, or to the terminating null
 * if the string was never closed. */
static const char *
lex_parse_string(const char *p, struct lex_token *token)
{
    const char *body = p + 1;
    const char *cur = body;

    while (*cur != '"') {
        if (*cur == '\0') {
            lex_error(token, "Input ends inside quoted string.");
            return cur;
        }

        if (*cur == '\\' && cur[1] != '\0') {
            /* Skip the backslash and the character it escapes. */
            cur += 2;
        } else {
            cur++;
        }
    }

    bool ok = json_string_unescape(body, cur - body, &token->s);
    token->type = ok ? LEX_T_STRING : LEX_T_ERROR;
    return cur + 1;
}

/* Returns true if 'c' may begin an identifier: an ASCII letter, '_', or
 * '.'. */
static bool
lex_is_id1(unsigned char c)
{
    if (c == '_' || c == '.') {
        return true;
    }
    if (c >= 'A' && c <= 'Z') {
        return true;
    }
    return c >= 'a' && c <= 'z';
}

/* Returns true if 'c' may appear in an identifier after its first character:
 * anything lex_is_id1() accepts, or an ASCII digit. */
static bool
lex_is_idn(unsigned char c)
{
    bool is_digit = c >= '0' && c <= '9';

    return is_digit || lex_is_id1(c);
}

/* Parses the identifier starting at 'p' into 'token' as a LEX_T_ID whose
 * string is a freshly allocated copy of the identifier text.  The first
 * character is accepted unconditionally; the identifier then continues for as
 * long as lex_is_idn() holds.  Returns a pointer just past the identifier. */
static const char *
lex_parse_id(const char *p, struct lex_token *token)
{
    const char *end = p + 1;

    while (lex_is_idn(*end)) {
        end++;
    }

    token->s = xmemdup0(p, end - p);
    token->type = LEX_T_ID;
    return end;
}

/* Initializes 'token' and parses the first token from the beginning of
 * null-terminated string 'p' into 'token'.  Stores a pointer to the start of
 * the token (after skipping white space and comments, if any) into '*startp'.
 * Returns the character position at which to begin parsing the next token.
 *
 * Syntax errors are reported in-band: 'token' becomes a LEX_T_ERROR whose
 * string describes the problem.  At end of input 'token' is a LEX_T_END and
 * the return value points to the terminating null. */
const char *
lex_token_parse(struct lex_token *token, const char *p, const char **startp)
{
    lex_token_init(token);

next:
    *startp = p;
    switch (*p) {
    case '\0':
        token->type = LEX_T_END;
        return p;

    case ' ': case '\t': case '\n': case '\r':
        p++;
        goto next;

    case '/':
        p++;
        if (*p == '/') {
            /* Line comment: skip up to, but not including, the new-line. */
            do {
                p++;
            } while (*p != '\0' && *p != '\n');
            goto next;
        } else if (*p == '*') {
            /* Block comment.  It has to be closed before the end of the
             * line; a new-line or end of input inside it is an error. */
            p++;
            for (;;) {
                if (*p == '*' && p[1] == '/') {
                    p += 2;
                    goto next;
                } else if (*p == '\0' || *p == '\n') {
                    lex_error(token, "`/*' without matching `*/'.");
                    return p;
                } else {
                    p++;
                }
            }
        } else {
            lex_error(token,
                      "`/' is only valid as part of `//' or `/*'.");
        }
        break;

    case '(':
        token->type = LEX_T_LPAREN;
        p++;
        break;

    case ')':
        token->type = LEX_T_RPAREN;
        p++;
        break;

    case '{':
        token->type = LEX_T_LCURLY;
        p++;
        break;

    case '}':
        token->type = LEX_T_RCURLY;
        p++;
        break;

    case '[':
        token->type = LEX_T_LSQUARE;
        p++;
        break;

    case ']':
        token->type = LEX_T_RSQUARE;
        p++;
        break;

    case '=':
        p++;
        if (*p == '=') {
            token->type = LEX_T_EQ;
            p++;
        } else {
            token->type = LEX_T_EQUALS;
        }
        break;

    case '!':
        p++;
        if (*p == '=') {
            token->type = LEX_T_NE;
            p++;
        } else {
            token->type = LEX_T_LOG_NOT;
        }
        break;

    case '&':
        p++;
        if (*p == '&') {
            token->type = LEX_T_LOG_AND;
            p++;
        } else {
            lex_error(token, "`&' is only valid as part of `&&'.");
        }
        break;

    case '|':
        p++;
        if (*p == '|') {
            token->type = LEX_T_LOG_OR;
            p++;
        } else {
            lex_error(token, "`|' is only valid as part of `||'.");
        }
        break;

    case '<':
        p++;
        if (*p == '=') {
            token->type = LEX_T_LE;
            p++;
        } else {
            token->type = LEX_T_LT;
        }
        break;

    case '>':
        p++;
        if (*p == '=') {
            token->type = LEX_T_GE;
            p++;
        } else {
            token->type = LEX_T_GT;
        }
        break;

    case '.':
        p++;
        if (*p == '.') {
            token->type = LEX_T_ELLIPSIS;
            p++;
        } else {
            lex_error(token, "`.' is only valid as part of `..' or a number.");
        }
        break;

    case ',':
        p++;
        token->type = LEX_T_COMMA;
        break;

    case ';':
        p++;
        token->type = LEX_T_SEMICOLON;
        break;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case ':':
        p = lex_parse_integer(p, token);
        break;

    case '"':
        p = lex_parse_string(p, token);
        break;

    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        /* We need to distinguish an Ethernet address or IPv6 address from an
         * identifier.  Fortunately, Ethernet addresses and IPv6 addresses that
         * are ambiguous based on the first character, always start with hex
         * digits followed by a colon, but identifiers never do. */
        p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
             ? lex_parse_integer(p, token)
             : lex_parse_id(p, token));
        break;

    default:
        if (lex_is_id1(*p)) {
            p = lex_parse_id(p, token);
        } else {
            if (isprint((unsigned char) *p)) {
                lex_error(token, "Invalid character `%c' in input.", *p);
            } else {
                /* Report the byte as two hex digits.  Convert through
                 * unsigned char first so that bytes >= 0x80 don't come out
                 * negative or sign-extended where plain char is signed. */
                lex_error(token, "Invalid byte 0x%02x in input.",
                          (unsigned int) (unsigned char) *p);
            }
            p++;
        }
        break;
    }

    return p;
}
\f
/* Prepares 'lexer' to tokenize 'input'.
 *
 * 'input' is not copied: the caller keeps ownership of it and must keep it
 * available for as long as the lexer is in use.
 *
 * No token is parsed here; the caller must call lexer_get() to obtain the
 * first token. */
void
lexer_init(struct lexer *lexer, const char *input)
{
    lex_token_init(&lexer->token);
    lexer->start = NULL;
    lexer->input = input;
}

/* Releases the storage held by 'lexer', that is, its current token.  The
 * input string is not touched. */
void
lexer_destroy(struct lexer *lexer)
{
    struct lex_token *current = &lexer->token;

    lex_token_destroy(current);
}

/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
 * token's type.  The caller may examine 'lexer->token' directly to obtain full
 * information about the token. */
enum lex_type
lexer_get(struct lexer *lexer)
{
    lex_token_destroy(&lexer->token);
    lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
    return lexer->token.type;
}

/* Returns the type of the next token that will be fetched by lexer_get(),
 * without advancing 'lexer->token' to that token. */
enum lex_type
lexer_lookahead(const struct lexer *lexer)
{
    struct lex_token next;
    enum lex_type type;
    const char *start;

    lex_token_parse(&next, lexer->input, &start);
    type = next.type;
    lex_token_destroy(&next);
    return type;
}

/* Consumes the current token of 'lexer' if it has the given 'type': in that
 * case advances to the next token and returns true.  Otherwise leaves 'lexer'
 * alone and returns false. */
bool
lexer_match(struct lexer *lexer, enum lex_type type)
{
    if (lexer->token.type != type) {
        return false;
    }

    lexer_get(lexer);
    return true;
}

/* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
 * to the next token and returns true.  Otherwise returns false.  */
bool
lexer_match_id(struct lexer *lexer, const char *id)
{
    if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
        lexer_get(lexer);
        return true;
    } else {
        return false;
    }
}
Commit	Line	Data
10b1662b BP	1	/*
	2	* Copyright (c) 2015 Nicira, Inc.
	3	*
	4	* Licensed under the Apache License, Version 2.0 (the "License");
	5	* you may not use this file except in compliance with the License.
	6	* You may obtain a copy of the License at:
	7	*
	8	* http://www.apache.org/licenses/LICENSE-2.0
	9	*
	10	* Unless required by applicable law or agreed to in writing, software
	11	* distributed under the License is distributed on an "AS IS" BASIS,
	12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	13	* See the License for the specific language governing permissions and
	14	* limitations under the License.
	15	*/
	16
	17	#include <config.h>
	18	#include "lex.h"
	19	#include <ctype.h>
	20	#include <errno.h>
	21	#include <stdarg.h>
	22	#include "dynamic-string.h"
	23	#include "json.h"
	24	#include "util.h"
363b5330 BP	25	\f
	26	/* Returns a string that represents 'format'. */
	27	const char *
	28	lex_format_to_string(enum lex_format format)
	29	{
	30	switch (format) {
	31	case LEX_F_DECIMAL:
	32	return "decimal";
	33	case LEX_F_HEXADECIMAL:
	34	return "hexadecimal";
	35	case LEX_F_IPV4:
	36	return "IPv4";
	37	case LEX_F_IPV6:
	38	return "IPv6";
	39	case LEX_F_ETHERNET:
	40	return "Ethernet";
	41	default:
	42	abort();
	43	}
	44	}
	45	\f
10b1662b BP	46	/* Initializes 'token'. */
	47	void
	48	lex_token_init(struct lex_token *token)
	49	{
	50	token->type = LEX_T_END;
	51	token->s = NULL;
	52	}
	53
	54	/* Frees memory owned by 'token'. */
	55	void
	56	lex_token_destroy(struct lex_token *token)
	57	{
	58	free(token->s);
	59	}
	60
	61	/* Exchanges 'a' and 'b'. */
	62	void
	63	lex_token_swap(struct lex_token a, struct lex_token b)
	64	{
	65	struct lex_token tmp = *a;
	66	a = b;
	67	*b = tmp;
	68	}
	69	\f
	70	/* lex_token_format(). */
	71
	72	static size_t
	73	lex_token_n_zeros(enum lex_format format)
	74	{
	75	switch (format) {
	76	case LEX_F_DECIMAL: return offsetof(union mf_subvalue, integer);
	77	case LEX_F_HEXADECIMAL: return 0;
	78	case LEX_F_IPV4: return offsetof(union mf_subvalue, ipv4);
	79	case LEX_F_IPV6: return offsetof(union mf_subvalue, ipv6);
	80	case LEX_F_ETHERNET: return offsetof(union mf_subvalue, mac);
	81	default: OVS_NOT_REACHED();
	82	}
	83	}
	84
	85	/* Returns the effective format for 'token', that is, the format in which it
	86	* should actually be printed. This is ordinarily the same as 'token->format',
	87	* but it's always possible that someone sets up a token with a format that
	88	* won't work for a value, e.g. 'token->value' is wider than 32 bits but the
	89	* format is LEX_F_IPV4. (The lexer itself won't do that; this is an attempt
	90	* to avoid confusion in the future.) */
	91	static enum lex_format
	92	lex_token_get_format(const struct lex_token *token)
	93	{
	94	size_t n_zeros = lex_token_n_zeros(token->format);
	95	return (is_all_zeros(&token->value, n_zeros)
	96	&& (token->type != LEX_T_MASKED_INTEGER
	97	\|\| is_all_zeros(&token->mask, n_zeros))
	98	? token->format
	99	: LEX_F_HEXADECIMAL);
	100	}
	101
	102	static void
	103	lex_token_format_value(const union mf_subvalue *value,
	104	enum lex_format format, struct ds *s)
	105	{
	106	switch (format) {
	107	case LEX_F_DECIMAL:
	108	ds_put_format(s, "%"PRIu64, ntohll(value->integer));
	109	break;
110
111	case LEX_F_HEXADECIMAL:
112	mf_format_subvalue(value, s);
113	break;
114
115	case LEX_F_IPV4:
116	ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
117	break;
118
119	case LEX_F_IPV6:
120	print_ipv6_addr(s, &value->ipv6);
121	break;
122
123	case LEX_F_ETHERNET:
124	ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
125	break;
126
127	default:
128	OVS_NOT_REACHED();
129	}
130
131	}
132
133	static void
134	lex_token_format_masked_integer(const struct lex_token token, struct ds s)
135	{
136	enum lex_format format = lex_token_get_format(token);
137
138	lex_token_format_value(&token->value, format, s);
139	ds_put_char(s, '/');
140
141	const union mf_subvalue *mask = &token->mask;
142	if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
143	ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
144	} else if (token->format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
145	ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
146	} else {
147	lex_token_format_value(&token->mask, format, s);
148	}
149	}
150
10b1662b BP	151	/* Appends a string representation of 'token' to 's', in a format that can be
	152	* losslessly parsed back by the lexer. (LEX_T_END and LEX_T_ERROR can't be
	153	* parsed back.) */
	154	void
3d611299	155	lex_token_format(const struct lex_token token, struct ds s)
10b1662b BP	156	{
	157	switch (token->type) {
	158	case LEX_T_END:
	159	ds_put_cstr(s, "$");
	160	break;
	161
	162	case LEX_T_ID:
	163	ds_put_cstr(s, token->s);
	164	break;
	165
	166	case LEX_T_ERROR:
	167	ds_put_cstr(s, "error(");
3b626771	168	json_string_escape(token->s, s);
10b1662b BP	169	ds_put_char(s, ')');
	170	break;
	171
	172	case LEX_T_STRING:
3b626771	173	json_string_escape(token->s, s);
10b1662b BP	174	break;
	175
	176	case LEX_T_INTEGER:
	177	lex_token_format_value(&token->value, lex_token_get_format(token), s);
	178	break;
	179
	180	case LEX_T_MASKED_INTEGER:
	181	lex_token_format_masked_integer(token, s);
	182	break;
	183
	184	case LEX_T_LPAREN:
	185	ds_put_cstr(s, "(");
	186	break;
	187	case LEX_T_RPAREN:
	188	ds_put_cstr(s, ")");
	189	break;
	190	case LEX_T_LCURLY:
	191	ds_put_cstr(s, "{");
	192	break;
	193	case LEX_T_RCURLY:
	194	ds_put_cstr(s, "}");
	195	break;
	196	case LEX_T_LSQUARE:
	197	ds_put_cstr(s, "[");
	198	break;
	199	case LEX_T_RSQUARE:
	200	ds_put_cstr(s, "]");
	201	break;
	202	case LEX_T_EQ:
	203	ds_put_cstr(s, "==");
	204	break;
	205	case LEX_T_NE:
	206	ds_put_cstr(s, "!=");
	207	break;
	208	case LEX_T_LT:
	209	ds_put_cstr(s, "<");
	210	break;
	211	case LEX_T_LE:
	212	ds_put_cstr(s, "<=");
	213	break;
	214	case LEX_T_GT:
	215	ds_put_cstr(s, ">");
	216	break;
	217	case LEX_T_GE:
	218	ds_put_cstr(s, ">=");
	219	break;
	220	case LEX_T_LOG_NOT:
	221	ds_put_cstr(s, "!");
	222	break;
	223	case LEX_T_LOG_AND:
	224	ds_put_cstr(s, "&&");
	225	break;
	226	case LEX_T_LOG_OR:
	227	ds_put_cstr(s, "\|\|");
	228	break;
	229	case LEX_T_ELLIPSIS:
	230	ds_put_cstr(s, "..");
	231	break;
	232	case LEX_T_COMMA:
	233	ds_put_cstr(s, ",");
	234	break;
	235	case LEX_T_SEMICOLON:
	236	ds_put_cstr(s, ";");
	237	break;
238	case LEX_T_EQUALS:
239	ds_put_cstr(s, "=");
240	break;
241	default:
242	OVS_NOT_REACHED();
243	}
244
245	}
246	\f
247	/* lex_token_parse(). */
248
249	static void OVS_PRINTF_FORMAT(2, 3)
250	lex_error(struct lex_token token, const char message, ...)
251	{
252	ovs_assert(!token->s);
253	token->type = LEX_T_ERROR;
254
255	va_list args;
256	va_start(args, message);
257	token->s = xvasprintf(message, args);
258	va_end(args);
259	}
260
261	static void
262	lex_parse_hex_integer(const char start, size_t len, struct lex_token token)
263	{
264	const char *in = start + (len - 1);
265	uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
266
267	for (int i = 0; i < len; i++) {
268	int hexit = hexit_value(in[-i]);
269	if (hexit < 0) {
270	lex_error(token, "Invalid syntax in hexadecimal constant.");
271	return;
272	}
273	if (hexit && i / 2 >= sizeof token->value.u8) {
274	lex_error(token, "Hexadecimal constant requires more than "
275	"%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
276	return;
277	}
278	out[-(i / 2)] \|= i % 2 ? hexit << 4 : hexit;
279	}
280	token->format = LEX_F_HEXADECIMAL;
281	}
282
283	static const char *
284	lex_parse_integer__(const char p, struct lex_token token)
285	{
286	lex_token_init(token);
287	token->type = LEX_T_INTEGER;
288	memset(&token->value, 0, sizeof token->value);
289	const char *start = p;
290	const char *end = start;
291	while (isalnum((unsigned char) end) \|\| end == ':'
292	\|\| (*end == '.' && end[1] != '.')) {
293	end++;
294	}
295	size_t len = end - start;
296
297	int n;
298	uint8_t mac[ETH_ADDR_LEN];
299
300	if (!len) {
301	lex_error(token, "Integer constant expected.");
302	} else if (len == 17
303	&& ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
304	ETH_ADDR_SCAN_ARGS(mac), &n)
305	&& n == len) {
306	memcpy(token->value.mac, mac, sizeof token->value.mac);
307	token->format = LEX_F_ETHERNET;
308	} else if (start + strspn(start, "0123456789") == end) {
309	if (p[0] == '0' && len > 1) {
310	lex_error(token, "Decimal constants must not have leading zeros.");
311	} else {
312	unsigned long long int integer;
313	char *tail;
314
315	errno = 0;
316	integer = strtoull(p, &tail, 10);
317	if (tail != end \|\| errno == ERANGE) {
318	lex_error(token, "Decimal constants must be less than 2**64.");
319	} else {
320	token->value.integer = htonll(integer);
321	token->format = LEX_F_DECIMAL;
322	}
323	}
324	} else if (p[0] == '0' && (p[1] == 'x' \|\| p[1] == 'X')) {
325	if (len > 2) {
326	lex_parse_hex_integer(start + 2, len - 2, token);
327	} else {
328	lex_error(token, "Hex digits expected following 0%c.", p[1]);
329	}
330	} else if (len < INET6_ADDRSTRLEN) {
331	char copy[INET6_ADDRSTRLEN];
332	memcpy(copy, p, len);
333	copy[len] = '\0';
334
335	struct in_addr ipv4;
336	struct in6_addr ipv6;
337	if (inet_pton(AF_INET, copy, &ipv4) == 1) {
338	token->value.ipv4 = ipv4.s_addr;
339	token->format = LEX_F_IPV4;
340	} else if (inet_pton(AF_INET6, copy, &ipv6) == 1) {
341	token->value.ipv6 = ipv6;
342	token->format = LEX_F_IPV6;
343	} else {
344	lex_error(token, "Invalid numeric constant.");
345	}
346	} else {
347	lex_error(token, "Invalid numeric constant.");
348	}
349
350	ovs_assert(token->type == LEX_T_INTEGER \|\| token->type == LEX_T_ERROR);
351	return end;
352	}
353
354	static const char *
355	lex_parse_mask(const char p, struct lex_token token)
356	{
357	struct lex_token mask;
358
359	/* Parse just past the '/' as a second integer. Handle errors. */
360	p = lex_parse_integer__(p + 1, &mask);
361	if (mask.type == LEX_T_ERROR) {
362	lex_token_swap(&mask, token);
363	lex_token_destroy(&mask);
364	return p;
365	}
366	ovs_assert(mask.type == LEX_T_INTEGER);
367
368	/* Now convert the value and mask into a masked integer token.
369	* We have a few special cases. */
370	token->type = LEX_T_MASKED_INTEGER;
371	memset(&token->mask, 0, sizeof token->mask);
372	uint32_t prefix_bits = ntohll(mask.value.integer);
373	if (token->format == mask.format) {
374	/* Same format value and mask is always OK. */
375	token->mask = mask.value;
376	} else if (token->format == LEX_F_IPV4
377	&& mask.format == LEX_F_DECIMAL
378	&& prefix_bits <= 32) {
379	/* IPv4 address with decimal mask is a CIDR prefix. */
380	token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
381	} else if (token->format == LEX_F_IPV6
382	&& mask.format == LEX_F_DECIMAL
383	&& prefix_bits <= 128) {
384	/* IPv6 address with decimal mask is a CIDR prefix. */
385	token->mask.ipv6 = ipv6_create_mask(prefix_bits);
386	} else if (token->format == LEX_F_DECIMAL
387	&& mask.format == LEX_F_HEXADECIMAL
388	&& token->value.integer == 0) {
389	/* Special case for e.g. 0/0x1234. */
390	token->format = LEX_F_HEXADECIMAL;
391	token->mask = mask.value;
392	} else {
393	lex_error(token, "Value and mask have incompatible formats.");
394	return p;
395	}
396
397	/* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
398	* mask. */
399	for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
400	ovs_be32 v = token->value.be32[i];
401	ovs_be32 m = token->mask.be32[i];
402
403	if (v & ~m) {
404	lex_error(token, "Value contains unmasked 1-bits.");
405	break;
406	}
407	}
408
409	/* Done! */
410	lex_token_destroy(&mask);
411	return p;
412	}
413
414	static const char *
415	lex_parse_integer(const char p, struct lex_token token)
416	{
417	p = lex_parse_integer__(p, token);
418	if (token->type == LEX_T_INTEGER && *p == '/') {
419	p = lex_parse_mask(p, token);
420	}
421	return p;
422	}
423
424	static const char *
425	lex_parse_string(const char p, struct lex_token token)
426	{
427	const char *start = ++p;
428	for (;;) {
429	switch (*p) {
430	case '\0':
431	lex_error(token, "Input ends inside quoted string.");
432	return p;
433
434	case '"':
435	token->type = (json_string_unescape(start, p - start, &token->s)
436	? LEX_T_STRING : LEX_T_ERROR);
437	return p + 1;
438
439	case '\\':
440	p++;
441	if (*p) {
442	p++;
443	}
444	break;
445
446	default:
447	p++;
448	break;
449	}
450	}
451	}
452
/* Returns true if 'c' is acceptable to this lexer as the first character of
 * an identifier: an ASCII letter, '_', or '.'. */
static bool
lex_is_id1(unsigned char c)
{
    return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
            || c == '_' || c == '.');
}
459
/* Returns true if 'c' is acceptable as a second or later character of an
 * identifier: anything lex_is_id1() accepts, or an ASCII decimal digit. */
static bool
lex_is_idn(unsigned char c)
{
    return lex_is_id1(c) || (c >= '0' && c <= '9');
}
465
466	static const char *
467	lex_parse_id(const char p, struct lex_token token)
468	{
469	const char *start = p;
470
471	do {
472	p++;
473	} while (lex_is_idn(*p));
474
475	token->type = LEX_T_ID;
476	token->s = xmemdup0(start, p - start);
477	return p;
478	}
479
480	/* Initializes 'token' and parses the first token from the beginning of
481	* null-terminated string 'p' into 'token'. Stores a pointer to the start of
482	* the token (after skipping white space and comments, if any) into '*startp'.
483	* Returns the character position at which to begin parsing the next token. */
484	const char *
485	lex_token_parse(struct lex_token token, const char p, const char **startp)
486	{
487	lex_token_init(token);
488
489	next:
490	*startp = p;
491	switch (*p) {
492	case '\0':
493	token->type = LEX_T_END;
494	return p;
495
496	case ' ': case '\t': case '\n': case '\r':
497	p++;
498	goto next;
499
500	case '/':
501	p++;
502	if (*p == '/') {
503	do {
504	p++;
505	} while (p != '\0' && p != '\n');
506	goto next;
507	} else if (p == '') {
508	p++;
509	for (;;) {
510	if (p == '' && p[1] == '/') {
511	p += 2;
512	goto next;
513	} else if (p == '\0' \|\| p == '\n') {
514	lex_error(token, "`/' without matching `/'.");
515	return p;
516	} else {
517	p++;
518	}
519	}
520	goto next;
521	} else {
522	lex_error(token,
523	"`/' is only valid as part of `//' or `/*'.");
524	}
525	break;
526
527	case '(':
528	token->type = LEX_T_LPAREN;
529	p++;
530	break;
531
532	case ')':
533	token->type = LEX_T_RPAREN;
534	p++;
535	break;
536
537	case '{':
538	token->type = LEX_T_LCURLY;
539	p++;
540	break;
541
542	case '}':
543	token->type = LEX_T_RCURLY;
544	p++;
545	break;
546
547	case '[':
548	token->type = LEX_T_LSQUARE;
549	p++;
550	break;
551
552	case ']':
553	token->type = LEX_T_RSQUARE;
554	p++;
555	break;
556
557	case '=':
558	p++;
559	if (*p == '=') {
560	token->type = LEX_T_EQ;
561	p++;
562	} else {
563	token->type = LEX_T_EQUALS;
564	}
565	break;
566
567	case '!':
568	p++;
569	if (*p == '=') {
570	token->type = LEX_T_NE;
571	p++;
572	} else {
573	token->type = LEX_T_LOG_NOT;
574	}
575	break;
576
577	case '&':
578	p++;
579	if (*p == '&') {
580	token->type = LEX_T_LOG_AND;
581	p++;
582	} else {
583	lex_error(token, "`&' is only valid as part of `&&'.");
584	}
585	break;
586
587	case '\|':
588	p++;
589	if (*p == '\|') {
590	token->type = LEX_T_LOG_OR;
591	p++;
592	} else {
593	lex_error(token, "`\|' is only valid as part of `\|\|'.");
594	}
595	break;
596
597	case '<':
598	p++;
599	if (*p == '=') {
600	token->type = LEX_T_LE;
601	p++;
602	} else {
603	token->type = LEX_T_LT;
604	}
605	break;
606
607	case '>':
608	p++;
609	if (*p == '=') {
610	token->type = LEX_T_GE;
611	p++;
612	} else {
613	token->type = LEX_T_GT;
614	}
615	break;
616
617	case '.':
618	p++;
619	if (*p == '.') {
620	token->type = LEX_T_ELLIPSIS;
621	p++;
622	} else {
623	lex_error(token, "`.' is only valid as part of `..' or a number.");
624	}
625	break;
626
627	case ',':
628	p++;
629	token->type = LEX_T_COMMA;
630	break;
631
632	case ';':
633	p++;
634	token->type = LEX_T_SEMICOLON;
635	break;
636
637	case '0': case '1': case '2': case '3': case '4':
638	case '5': case '6': case '7': case '8': case '9':
639	case ':':
640	p = lex_parse_integer(p, token);
641	break;
642
643	case '"':
644	p = lex_parse_string(p, token);
645	break;
646
647	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
648	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
649	/* We need to distinguish an Ethernet address or IPv6 address from an
650	* identifier. Fortunately, Ethernet addresses and IPv6 addresses that
651	* are ambiguous based on the first character, always start with hex
652	* digits followed by a colon, but identifiers never do. */
653	p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
654	? lex_parse_integer(p, token)
655	: lex_parse_id(p, token));
656	break;
657
658	default:
659	if (lex_is_id1(*p)) {
660	p = lex_parse_id(p, token);
661	} else {
662	if (isprint((unsigned char) *p)) {
663	lex_error(token, "Invalid character `%c' in input.", *p);
664	} else {
665	lex_error(token, "Invalid byte 0x%d in input.", *p);
666	}
667	p++;
668	}
669	break;
670	}
671
672	return p;
673	}
674	\f
675	/* Initializes 'lexer' for parsing 'input'.
676	*
677	* While the lexer is in use, 'input' must remain available, but the caller
678	* otherwise retains ownership of 'input'.
679	*
680	* The caller must call lexer_get() to obtain the first token. */
681	void
682	lexer_init(struct lexer lexer, const char input)
683	{
684	lexer->input = input;
685	lexer->start = NULL;
686	lex_token_init(&lexer->token);
687	}
688
689	/* Frees storage associated with 'lexer'. */
690	void
691	lexer_destroy(struct lexer *lexer)
692	{
693	lex_token_destroy(&lexer->token);
694	}
695
696	/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
697	* token's type. The caller may examine 'lexer->token' directly to obtain full
698	* information about the token. */
699	enum lex_type
700	lexer_get(struct lexer *lexer)
701	{
702	lex_token_destroy(&lexer->token);
703	lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
704	return lexer->token.type;
705	}
706
27912fdb BP	707	/* Returns the type of the next token that will be fetched by lexer_get(),
	708	* without advancing 'lexer->token' to that token. */
	709	enum lex_type
	710	lexer_lookahead(const struct lexer *lexer)
	711	{
	712	struct lex_token next;
	713	enum lex_type type;
	714	const char *start;
	715
	716	lex_token_parse(&next, lexer->input, &start);
	717	type = next.type;
	718	lex_token_destroy(&next);
	719	return type;
	720	}
	721
10b1662b BP	722	/* If 'lexer''s current token has the given 'type', advances 'lexer' to the
	723	* next token and returns true. Otherwise returns false. */
	724	bool
	725	lexer_match(struct lexer *lexer, enum lex_type type)
	726	{
	727	if (lexer->token.type == type) {
	728	lexer_get(lexer);
	729	return true;
	730	} else {
	731	return false;
	732	}
	733	}
27912fdb BP	734
	735	/* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
	736	* to the next token and returns true. Otherwise returns false. */
	737	bool
	738	lexer_match_id(struct lexer lexer, const char id)
	739	{
	740	if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
	741	lexer_get(lexer);
	742	return true;
	743	} else {
	744	return false;
	745	}
	746	}