ovn/lib/lex.c

   1 /*
   2  * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include <ctype.h>
  19 #include <errno.h>
  20 #include <stdarg.h>
  21 #include "openvswitch/dynamic-string.h"
  22 #include "openvswitch/json.h"
  23 #include "ovn/lex.h"
  24 #include "packets.h"
  25 #include "util.h"
  26 \f
  27 /* Returns a string that represents 'format'. */
  28 const char *
  29 lex_format_to_string(enum lex_format format)
  30 {
  31     switch (format) {
  32     case LEX_F_DECIMAL:
  33         return "decimal";
  34     case LEX_F_HEXADECIMAL:
  35         return "hexadecimal";
  36     case LEX_F_IPV4:
  37         return "IPv4";
  38     case LEX_F_IPV6:
  39         return "IPv6";
  40     case LEX_F_ETHERNET:
  41         return "Ethernet";
  42     default:
  43         abort();
  44     }
  45 }
  46 \f
  47 /* Initializes 'token'. */
  48 void
  49 lex_token_init(struct lex_token *token)
  50 {
  51     token->type = LEX_T_END;
  52     token->s = NULL;
  53 }
  54
  55 /* Frees memory owned by 'token'. */
  56 void
  57 lex_token_destroy(struct lex_token *token)
  58 {
  59     if (token->s != token->buffer) {
  60         free(token->s);
  61     }
  62     token->s = NULL;
  63 }
  64
  65 /* Exchanges 'a' and 'b'. */
  66 void
  67 lex_token_swap(struct lex_token *a, struct lex_token *b)
  68 {
  69     struct lex_token tmp = *a;
  70     *a = *b;
  71     *b = tmp;
  72
  73     /* Before swap, if 's' was pointed to 'buffer', its value shall be changed
  74      * to point to the 'buffer' with the copied value. */
  75     if (a->s == b->buffer) {
  76         a->s = a->buffer;
  77     }
  78     if (b->s == a->buffer) {
  79         b->s = b->buffer;
  80     }
  81 }
  82
  83 /* The string 's' need not be null-terminated at 'length'. */
  84 void
  85 lex_token_strcpy(struct lex_token *token, const char *s, size_t length)
  86 {
  87     lex_token_destroy(token);
  88     token->s = (length + 1 <= sizeof token->buffer
  89                 ? token->buffer
  90                 : xmalloc(length + 1));
  91     memcpy(token->s, s, length);
  92     token->s[length] = '\0';
  93 }
  94
  95 void
  96 lex_token_strset(struct lex_token *token, char *s)
  97 {
  98     lex_token_destroy(token);
  99     token->s = s;
 100 }
 101
 102 void
 103 lex_token_vsprintf(struct lex_token *token, const char *format, va_list args)
 104 {
 105     lex_token_destroy(token);
 106
 107     va_list args2;
 108     va_copy(args2, args);
 109     token->s = (vsnprintf(token->buffer, sizeof token->buffer, format, args)
 110                 < sizeof token->buffer
 111                 ? token->buffer
 112                 : xvasprintf(format, args2));
 113     va_end(args2);
 114 }
 115 \f
 116 /* lex_token_format(). */
 117
 118 static size_t
 119 lex_token_n_zeros(enum lex_format format)
 120 {
 121     switch (format) {
 122     case LEX_F_DECIMAL:     return offsetof(union mf_subvalue, integer);
 123     case LEX_F_HEXADECIMAL: return 0;
 124     case LEX_F_IPV4:        return offsetof(union mf_subvalue, ipv4);
 125     case LEX_F_IPV6:        return offsetof(union mf_subvalue, ipv6);
 126     case LEX_F_ETHERNET:    return offsetof(union mf_subvalue, mac);
 127     default: OVS_NOT_REACHED();
 128     }
 129 }
 130
 131 /* Returns the effective format for 'token', that is, the format in which it
 132  * should actually be printed.  This is ordinarily the same as 'token->format',
 133  * but it's always possible that someone sets up a token with a format that
 134  * won't work for a value, e.g. 'token->value' is wider than 32 bits but the
 135  * format is LEX_F_IPV4.  (The lexer itself won't do that; this is an attempt
 136  * to avoid confusion in the future.) */
 137 static enum lex_format
 138 lex_token_get_format(const struct lex_token *token)
 139 {
 140     size_t n_zeros = lex_token_n_zeros(token->format);
 141     return (is_all_zeros(&token->value, n_zeros)
 142             && (token->type != LEX_T_MASKED_INTEGER
 143                 || is_all_zeros(&token->mask, n_zeros))
 144             ? token->format
 145             : LEX_F_HEXADECIMAL);
 146 }
 147
 148 static void
 149 lex_token_format_value(const union mf_subvalue *value,
 150                        enum lex_format format, struct ds *s)
 151 {
 152     switch (format) {
 153     case LEX_F_DECIMAL:
 154         ds_put_format(s, "%"PRIu64, ntohll(value->integer));
 155         break;
 156
 157     case LEX_F_HEXADECIMAL:
 158         mf_format_subvalue(value, s);
 159         break;
 160
 161     case LEX_F_IPV4:
 162         ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
 163         break;
 164
 165     case LEX_F_IPV6:
 166         ipv6_format_addr(&value->ipv6, s);
 167         break;
 168
 169     case LEX_F_ETHERNET:
 170         ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
 171         break;
 172
 173     default:
 174         OVS_NOT_REACHED();
 175     }
 176
 177 }
 178
 179 static void
 180 lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
 181 {
 182     enum lex_format format = lex_token_get_format(token);
 183
 184     lex_token_format_value(&token->value, format, s);
 185     ds_put_char(s, '/');
 186
 187     const union mf_subvalue *mask = &token->mask;
 188     if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
 189         ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
 190     } else if (token->format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
 191         ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
 192     } else {
 193         lex_token_format_value(&token->mask, format, s);
 194     }
 195 }
 196
 197 /* Appends a string representation of 'token' to 's', in a format that can be
 198  * losslessly parsed back by the lexer.  (LEX_T_END and LEX_T_ERROR can't be
 199  * parsed back.) */
 200 void
 201 lex_token_format(const struct lex_token *token, struct ds *s)
 202 {
 203     switch (token->type) {
 204     case LEX_T_END:
 205         ds_put_cstr(s, "$");
 206         break;
 207
 208     case LEX_T_ID:
 209         ds_put_cstr(s, token->s);
 210         break;
 211
 212     case LEX_T_ERROR:
 213         ds_put_cstr(s, "error(");
 214         json_string_escape(token->s, s);
 215         ds_put_char(s, ')');
 216         break;
 217
 218     case LEX_T_STRING:
 219         json_string_escape(token->s, s);
 220         break;
 221
 222     case LEX_T_INTEGER:
 223         lex_token_format_value(&token->value, lex_token_get_format(token), s);
 224         break;
 225
 226     case LEX_T_MASKED_INTEGER:
 227         lex_token_format_masked_integer(token, s);
 228         break;
 229
 230     case LEX_T_MACRO:
 231         ds_put_format(s, "$%s", token->s);
 232         break;
 233
 234     case LEX_T_LPAREN:
 235         ds_put_cstr(s, "(");
 236         break;
 237     case LEX_T_RPAREN:
 238         ds_put_cstr(s, ")");
 239         break;
 240     case LEX_T_LCURLY:
 241         ds_put_cstr(s, "{");
 242         break;
 243     case LEX_T_RCURLY:
 244         ds_put_cstr(s, "}");
 245         break;
 246     case LEX_T_LSQUARE:
 247         ds_put_cstr(s, "[");
 248         break;
 249     case LEX_T_RSQUARE:
 250         ds_put_cstr(s, "]");
 251         break;
 252     case LEX_T_EQ:
 253         ds_put_cstr(s, "==");
 254         break;
 255     case LEX_T_NE:
 256         ds_put_cstr(s, "!=");
 257         break;
 258     case LEX_T_LT:
 259         ds_put_cstr(s, "<");
 260         break;
 261     case LEX_T_LE:
 262         ds_put_cstr(s, "<=");
 263         break;
 264     case LEX_T_GT:
 265         ds_put_cstr(s, ">");
 266         break;
 267     case LEX_T_GE:
 268         ds_put_cstr(s, ">=");
 269         break;
 270     case LEX_T_LOG_NOT:
 271         ds_put_cstr(s, "!");
 272         break;
 273     case LEX_T_LOG_AND:
 274         ds_put_cstr(s, "&&");
 275         break;
 276     case LEX_T_LOG_OR:
 277         ds_put_cstr(s, "||");
 278         break;
 279     case LEX_T_ELLIPSIS:
 280         ds_put_cstr(s, "..");
 281         break;
 282     case LEX_T_COMMA:
 283         ds_put_cstr(s, ",");
 284         break;
 285     case LEX_T_SEMICOLON:
 286         ds_put_cstr(s, ";");
 287         break;
 288     case LEX_T_EQUALS:
 289         ds_put_cstr(s, "=");
 290         break;
 291     case LEX_T_EXCHANGE:
 292         ds_put_cstr(s, "<->");
 293         break;
 294     case LEX_T_DECREMENT:
 295         ds_put_cstr(s, "--");
 296         break;
 297     case LEX_T_COLON:
 298         ds_put_char(s, ':');
 299         break;
 300     default:
 301         OVS_NOT_REACHED();
 302     }
 303
 304 }
 305 \f
 306 /* lex_token_parse(). */
 307
 308 static void OVS_PRINTF_FORMAT(2, 3)
 309 lex_error(struct lex_token *token, const char *message, ...)
 310 {
 311     ovs_assert(!token->s);
 312     token->type = LEX_T_ERROR;
 313
 314     va_list args;
 315     va_start(args, message);
 316     lex_token_vsprintf(token, message, args);
 317     va_end(args);
 318 }
 319
 320 static void
 321 lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
 322 {
 323     const char *in = start + (len - 1);
 324     uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
 325
 326     for (int i = 0; i < len; i++) {
 327         int hexit = hexit_value(in[-i]);
 328         if (hexit < 0) {
 329             lex_error(token, "Invalid syntax in hexadecimal constant.");
 330             return;
 331         }
 332         if (hexit && i / 2 >= sizeof token->value.u8) {
 333             lex_error(token, "Hexadecimal constant requires more than "
 334                       "%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
 335             return;
 336         }
 337         out[-(i / 2)] |= i % 2 ? hexit << 4 : hexit;
 338     }
 339     token->format = LEX_F_HEXADECIMAL;
 340 }
 341
 342 static const char *
 343 lex_parse_integer__(const char *p, struct lex_token *token)
 344 {
 345     lex_token_init(token);
 346     token->type = LEX_T_INTEGER;
 347     memset(&token->value, 0, sizeof token->value);
 348
 349     /* Find the extent of an "integer" token, which can be in decimal or
 350      * hexadecimal, or an Ethernet address or IPv4 or IPv6 address, as 'start'
 351      * through 'end'.
 352      *
 353      * Special cases we handle here are:
 354      *
 355      *     - The ellipsis token "..", used as e.g. 123..456.  A doubled dot
 356      *       is never valid syntax as part of an "integer", so we stop if
 357      *       we encounter two dots in a row.
 358      *
 359      *     - Syntax like 1.2.3.4:1234 to indicate an IPv4 address followed by a
 360      *       port number should be considered three tokens: 1.2.3.4 : 1234.
 361      *       The obvious approach is to allow just dots or just colons within a
 362      *       given integer, but that would disallow IPv4-mapped IPv6 addresses,
 363      *       e.g. ::ffff:192.0.2.128.  However, even in those addresses, a
 364      *       colon never follows a dot, so we stop if we encounter a colon
 365      *       after a dot.
 366      *
 367      *       (There is no corresponding way to parse an IPv6 address followed
 368      *       by a port number: ::1:2:3:4:1234 is unavoidably ambiguous.)
 369      */
 370     const char *start = p;
 371     const char *end = start;
 372     bool saw_dot = false;
 373     while (isalnum((unsigned char) *end)
 374            || (*end == ':' && !saw_dot)
 375            || (*end == '.' && end[1] != '.')) {
 376         if (*end == '.') {
 377             saw_dot = true;
 378         }
 379         end++;
 380     }
 381     size_t len = end - start;
 382
 383     int n;
 384     struct eth_addr mac;
 385
 386     if (!len) {
 387         lex_error(token, "Integer constant expected.");
 388     } else if (len == 17
 389                && ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
 390                            ETH_ADDR_SCAN_ARGS(mac), &n)
 391                && n == len) {
 392         token->value.mac = mac;
 393         token->format = LEX_F_ETHERNET;
 394     } else if (start + strspn(start, "0123456789") == end) {
 395         if (p[0] == '0' && len > 1) {
 396             lex_error(token, "Decimal constants must not have leading zeros.");
 397         } else {
 398             unsigned long long int integer;
 399             char *tail;
 400
 401             errno = 0;
 402             integer = strtoull(p, &tail, 10);
 403             if (tail != end || errno == ERANGE) {
 404                 lex_error(token, "Decimal constants must be less than 2**64.");
 405             } else {
 406                 token->value.integer = htonll(integer);
 407                 token->format = LEX_F_DECIMAL;
 408             }
 409         }
 410     } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
 411         if (len > 2) {
 412             lex_parse_hex_integer(start + 2, len - 2, token);
 413         } else {
 414             lex_error(token, "Hex digits expected following 0%c.", p[1]);
 415         }
 416     } else if (len < INET6_ADDRSTRLEN) {
 417         char copy[INET6_ADDRSTRLEN];
 418         memcpy(copy, p, len);
 419         copy[len] = '\0';
 420
 421         if (ip_parse(copy, &token->value.ipv4)) {
 422             token->format = LEX_F_IPV4;
 423         } else if (ipv6_parse(copy, &token->value.ipv6)) {
 424             token->format = LEX_F_IPV6;
 425         } else {
 426             lex_error(token, "Invalid numeric constant.");
 427         }
 428     } else {
 429         lex_error(token, "Invalid numeric constant.");
 430     }
 431
 432     ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
 433     return end;
 434 }
 435
 436 static const char *
 437 lex_parse_mask(const char *p, struct lex_token *token)
 438 {
 439     struct lex_token mask;
 440
 441     /* Parse just past the '/' as a second integer.  Handle errors. */
 442     p = lex_parse_integer__(p + 1, &mask);
 443     if (mask.type == LEX_T_ERROR) {
 444         lex_token_swap(&mask, token);
 445         lex_token_destroy(&mask);
 446         return p;
 447     }
 448     ovs_assert(mask.type == LEX_T_INTEGER);
 449
 450     /* Now convert the value and mask into a masked integer token.
 451      * We have a few special cases. */
 452     token->type = LEX_T_MASKED_INTEGER;
 453     memset(&token->mask, 0, sizeof token->mask);
 454     uint32_t prefix_bits = ntohll(mask.value.integer);
 455     if (token->format == mask.format) {
 456         /* Same format value and mask is always OK. */
 457         token->mask = mask.value;
 458     } else if (token->format == LEX_F_IPV4
 459                && mask.format == LEX_F_DECIMAL
 460                && prefix_bits <= 32) {
 461         /* IPv4 address with decimal mask is a CIDR prefix. */
 462         token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
 463     } else if (token->format == LEX_F_IPV6
 464                && mask.format == LEX_F_DECIMAL
 465                && prefix_bits <= 128) {
 466         /* IPv6 address with decimal mask is a CIDR prefix. */
 467         token->mask.ipv6 = ipv6_create_mask(prefix_bits);
 468     } else if (token->format == LEX_F_DECIMAL
 469                && mask.format == LEX_F_HEXADECIMAL
 470                && token->value.integer == 0) {
 471         /* Special case for e.g. 0/0x1234. */
 472         token->format = LEX_F_HEXADECIMAL;
 473         token->mask = mask.value;
 474     } else {
 475         lex_error(token, "Value and mask have incompatible formats.");
 476         return p;
 477     }
 478
 479     /* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
 480      * mask. */
 481     for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
 482         ovs_be32 v = token->value.be32[i];
 483         ovs_be32 m = token->mask.be32[i];
 484
 485         if (v & ~m) {
 486             lex_error(token, "Value contains unmasked 1-bits.");
 487             break;
 488         }
 489     }
 490
 491     /* Done! */
 492     lex_token_destroy(&mask);
 493     return p;
 494 }
 495
 496 static const char *
 497 lex_parse_integer(const char *p, struct lex_token *token)
 498 {
 499     p = lex_parse_integer__(p, token);
 500     if (token->type == LEX_T_INTEGER && *p == '/') {
 501         p = lex_parse_mask(p, token);
 502     }
 503     return p;
 504 }
 505
 506 static const char *
 507 lex_parse_string(const char *p, struct lex_token *token)
 508 {
 509     const char *start = ++p;
 510     char * s = NULL;
 511     for (;;) {
 512         switch (*p) {
 513         case '\0':
 514             lex_error(token, "Input ends inside quoted string.");
 515             return p;
 516
 517         case '"':
 518             token->type = (json_string_unescape(start, p - start, &s)
 519                            ? LEX_T_STRING : LEX_T_ERROR);
 520             lex_token_strset(token, s);
 521             return p + 1;
 522
 523         case '\\':
 524             p++;
 525             if (*p) {
 526                 p++;
 527             }
 528             break;
 529
 530         default:
 531             p++;
 532             break;
 533         }
 534     }
 535 }
 536
 537 static bool
 538 lex_is_id1(unsigned char c)
 539 {
 540     return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 541             || c == '_' || c == '.');
 542 }
 543
 544 static bool
 545 lex_is_idn(unsigned char c)
 546 {
 547     return lex_is_id1(c) || (c >= '0' && c <= '9');
 548 }
 549
 550 static const char *
 551 lex_parse_id(const char *p, enum lex_type type, struct lex_token *token)
 552 {
 553     const char *start = p;
 554
 555     do {
 556         p++;
 557     } while (lex_is_idn(*p));
 558
 559     token->type = type;
 560     lex_token_strcpy(token, start, p - start);
 561     return p;
 562 }
 563
 564 static const char *
 565 lex_parse_addr_set(const char *p, struct lex_token *token)
 566 {
 567     p++;
 568     if (!lex_is_id1(*p)) {
 569         lex_error(token, "`$' must be followed by a valid identifier.");
 570         return p;
 571     }
 572
 573     return lex_parse_id(p, LEX_T_MACRO, token);
 574 }
 575
 576 /* Initializes 'token' and parses the first token from the beginning of
 577  * null-terminated string 'p' into 'token'.  Stores a pointer to the start of
 578  * the token (after skipping white space and comments, if any) into '*startp'.
 579  * Returns the character position at which to begin parsing the next token. */
 580 const char *
 581 lex_token_parse(struct lex_token *token, const char *p, const char **startp)
 582 {
 583     lex_token_init(token);
 584
 585 next:
 586     *startp = p;
 587     switch (*p) {
 588     case '\0':
 589         token->type = LEX_T_END;
 590         return p;
 591
 592     case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
 593         p++;
 594         goto next;
 595
 596     case '/':
 597         p++;
 598         if (*p == '/') {
 599             do {
 600                 p++;
 601             } while (*p != '\0' && *p != '\n');
 602             goto next;
 603         } else if (*p == '*') {
 604             p++;
 605             for (;;) {
 606                 if (*p == '*' && p[1] == '/') {
 607                     p += 2;
 608                     goto next;
 609                 } else if (*p == '\0' || *p == '\n') {
 610                     lex_error(token, "`/*' without matching `*/'.");
 611                     return p;
 612                 } else {
 613                     p++;
 614                 }
 615             }
 616             goto next;
 617         } else {
 618             lex_error(token,
 619                       "`/' is only valid as part of `//' or `/*'.");
 620         }
 621         break;
 622
 623     case '(':
 624         token->type = LEX_T_LPAREN;
 625         p++;
 626         break;
 627
 628     case ')':
 629         token->type = LEX_T_RPAREN;
 630         p++;
 631         break;
 632
 633     case '{':
 634         token->type = LEX_T_LCURLY;
 635         p++;
 636         break;
 637
 638     case '}':
 639         token->type = LEX_T_RCURLY;
 640         p++;
 641         break;
 642
 643     case '[':
 644         token->type = LEX_T_LSQUARE;
 645         p++;
 646         break;
 647
 648     case ']':
 649         token->type = LEX_T_RSQUARE;
 650         p++;
 651         break;
 652
 653     case '=':
 654         p++;
 655         if (*p == '=') {
 656             token->type = LEX_T_EQ;
 657             p++;
 658         } else {
 659             token->type = LEX_T_EQUALS;
 660         }
 661         break;
 662
 663     case '!':
 664         p++;
 665         if (*p == '=') {
 666             token->type = LEX_T_NE;
 667             p++;
 668         } else {
 669             token->type = LEX_T_LOG_NOT;
 670         }
 671         break;
 672
 673     case '&':
 674         p++;
 675         if (*p == '&') {
 676             token->type = LEX_T_LOG_AND;
 677             p++;
 678         } else {
 679             lex_error(token, "`&' is only valid as part of `&&'.");
 680         }
 681         break;
 682
 683     case '|':
 684         p++;
 685         if (*p == '|') {
 686             token->type = LEX_T_LOG_OR;
 687             p++;
 688         } else {
 689             lex_error(token, "`|' is only valid as part of `||'.");
 690         }
 691         break;
 692
 693     case '<':
 694         p++;
 695         if (*p == '=') {
 696             token->type = LEX_T_LE;
 697             p++;
 698         } else if (*p == '-' && p[1] == '>') {
 699             token->type = LEX_T_EXCHANGE;
 700             p += 2;
 701         } else {
 702             token->type = LEX_T_LT;
 703         }
 704         break;
 705
 706     case '>':
 707         p++;
 708         if (*p == '=') {
 709             token->type = LEX_T_GE;
 710             p++;
 711         } else {
 712             token->type = LEX_T_GT;
 713         }
 714         break;
 715
 716     case '.':
 717         p++;
 718         if (*p == '.') {
 719             token->type = LEX_T_ELLIPSIS;
 720             p++;
 721         } else {
 722             lex_error(token, "`.' is only valid as part of `..' or a number.");
 723         }
 724         break;
 725
 726     case ',':
 727         p++;
 728         token->type = LEX_T_COMMA;
 729         break;
 730
 731     case ';':
 732         p++;
 733         token->type = LEX_T_SEMICOLON;
 734         break;
 735
 736     case '-':
 737         p++;
 738         if (*p == '-') {
 739             token->type = LEX_T_DECREMENT;
 740             p++;
 741         } else {
 742             lex_error(token, "`-' is only valid as part of `--'.");
 743         }
 744         break;
 745
 746     case '$':
 747         p = lex_parse_addr_set(p, token);
 748         break;
 749
 750     case ':':
 751         if (p[1] != ':') {
 752             token->type = LEX_T_COLON;
 753             p++;
 754             break;
 755         }
 756         /* IPv6 address beginning with "::". */
 757         /* fall through */
 758     case '0': case '1': case '2': case '3': case '4':
 759     case '5': case '6': case '7': case '8': case '9':
 760         p = lex_parse_integer(p, token);
 761         break;
 762
 763     case '"':
 764         p = lex_parse_string(p, token);
 765         break;
 766
 767     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 768     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 769         /* We need to distinguish an Ethernet address or IPv6 address from an
 770          * identifier.  Fortunately, Ethernet addresses and IPv6 addresses that
 771          * are ambiguous based on the first character, always start with hex
 772          * digits followed by a colon, but identifiers never do. */
 773         p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
 774              ? lex_parse_integer(p, token)
 775              : lex_parse_id(p, LEX_T_ID, token));
 776         break;
 777
 778     default:
 779         if (lex_is_id1(*p)) {
 780             p = lex_parse_id(p, LEX_T_ID, token);
 781         } else {
 782             if (isprint((unsigned char) *p)) {
 783                 lex_error(token, "Invalid character `%c' in input.", *p);
 784             } else {
 785                 lex_error(token, "Invalid byte 0x%d in input.", *p);
 786             }
 787             p++;
 788         }
 789         break;
 790     }
 791
 792     return p;
 793 }
 794 \f
 795 /* Initializes 'lexer' for parsing 'input'.
 796  *
 797  * While the lexer is in use, 'input' must remain available, but the caller
 798  * otherwise retains ownership of 'input'.
 799  *
 800  * The caller must call lexer_get() to obtain the first token. */
 801 void
 802 lexer_init(struct lexer *lexer, const char *input)
 803 {
 804     lexer->input = input;
 805     lexer->start = NULL;
 806     lex_token_init(&lexer->token);
 807     lexer->error = NULL;
 808 }
 809
 810 /* Frees storage associated with 'lexer'. */
 811 void
 812 lexer_destroy(struct lexer *lexer)
 813 {
 814     lex_token_destroy(&lexer->token);
 815     free(lexer->error);
 816 }
 817
 818 /* Obtains the next token from 'lexer' into 'lexer->token', and returns the
 819  * token's type.  The caller may examine 'lexer->token' directly to obtain full
 820  * information about the token. */
 821 enum lex_type
 822 lexer_get(struct lexer *lexer)
 823 {
 824     lex_token_destroy(&lexer->token);
 825     lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
 826     return lexer->token.type;
 827 }
 828
 829 /* Returns the type of the next token that will be fetched by lexer_get(),
 830  * without advancing 'lexer->token' to that token. */
 831 enum lex_type
 832 lexer_lookahead(const struct lexer *lexer)
 833 {
 834     struct lex_token next;
 835     enum lex_type type;
 836     const char *start;
 837
 838     lex_token_parse(&next, lexer->input, &start);
 839     type = next.type;
 840     lex_token_destroy(&next);
 841     return type;
 842 }
 843
 844 /* If 'lexer''s current token has the given 'type', advances 'lexer' to the
 845  * next token and returns true.  Otherwise returns false. */
 846 bool
 847 lexer_match(struct lexer *lexer, enum lex_type type)
 848 {
 849     if (lexer->token.type == type) {
 850         lexer_get(lexer);
 851         return true;
 852     } else {
 853         return false;
 854     }
 855 }
 856
 857 bool
 858 lexer_force_match(struct lexer *lexer, enum lex_type t)
 859 {
 860     if (t == LEX_T_END) {
 861         return lexer_force_end(lexer);
 862     } else if (lexer_match(lexer, t)) {
 863         return true;
 864     } else {
 865         struct lex_token token = { .type = t };
 866         struct ds s = DS_EMPTY_INITIALIZER;
 867         lex_token_format(&token, &s);
 868
 869         lexer_syntax_error(lexer, "expecting `%s'", ds_cstr(&s));
 870
 871         ds_destroy(&s);
 872
 873         return false;
 874     }
 875 }
 876
 877 /* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
 878  * to the next token and returns true.  Otherwise returns false.  */
 879 bool
 880 lexer_match_id(struct lexer *lexer, const char *id)
 881 {
 882     if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
 883         lexer_get(lexer);
 884         return true;
 885     } else {
 886         return false;
 887     }
 888 }
 889
 890 bool
 891 lexer_is_int(const struct lexer *lexer)
 892 {
 893     return (lexer->token.type == LEX_T_INTEGER
 894             && lexer->token.format == LEX_F_DECIMAL
 895             && ntohll(lexer->token.value.integer) <= INT_MAX);
 896 }
 897
 898 bool
 899 lexer_get_int(struct lexer *lexer, int *value)
 900 {
 901     if (lexer_is_int(lexer)) {
 902         *value = ntohll(lexer->token.value.integer);
 903         lexer_get(lexer);
 904         return true;
 905     } else {
 906         *value = 0;
 907         return false;
 908     }
 909 }
 910
 911 bool
 912 lexer_force_int(struct lexer *lexer, int *value)
 913 {
 914     bool ok = lexer_get_int(lexer, value);
 915     if (!ok) {
 916         lexer_syntax_error(lexer, "expecting small integer");
 917     }
 918     return ok;
 919 }
 920
 921 bool
 922 lexer_force_end(struct lexer *lexer)
 923 {
 924     if (lexer->token.type == LEX_T_END) {
 925         return true;
 926     } else {
 927         lexer_syntax_error(lexer, "expecting end of input");
 928         return false;
 929     }
 930 }
 931
 932 static bool
 933 lexer_error_handle_common(struct lexer *lexer)
 934 {
 935     if (lexer->error) {
 936         /* Already have an error, suppress this one since the cascade seems
 937          * unlikely to be useful. */
 938         return true;
 939     } else if (lexer->token.type == LEX_T_ERROR) {
 940         /* The lexer signaled an error.  Nothing at a higher level accepts an
 941          * error token, so we'll inevitably end up here with some meaningless
 942          * parse error.  Report the lexical error instead. */
 943         lexer->error = xstrdup(lexer->token.s);
 944         return true;
 945     } else {
 946         return false;
 947     }
 948 }
 949
 950 void OVS_PRINTF_FORMAT(2, 3)
 951 lexer_error(struct lexer *lexer, const char *message, ...)
 952 {
 953     if (lexer_error_handle_common(lexer)) {
 954         return;
 955     }
 956
 957     va_list args;
 958     va_start(args, message);
 959     lexer->error = xvasprintf(message, args);
 960     va_end(args);
 961 }
 962
 963 void OVS_PRINTF_FORMAT(2, 3)
 964 lexer_syntax_error(struct lexer *lexer, const char *message, ...)
 965 {
 966     if (lexer_error_handle_common(lexer)) {
 967         return;
 968     }
 969
 970     struct ds s;
 971
 972     ds_init(&s);
 973     ds_put_cstr(&s, "Syntax error");
 974     if (lexer->token.type == LEX_T_END) {
 975         ds_put_cstr(&s, " at end of input");
 976     } else if (lexer->start) {
 977         ds_put_format(&s, " at `%.*s'",
 978                       (int) (lexer->input - lexer->start),
 979                       lexer->start);
 980     }
 981
 982     if (message) {
 983         ds_put_char(&s, ' ');
 984
 985         va_list args;
 986         va_start(args, message);
 987         ds_put_format_valist(&s, message, args);
 988         va_end(args);
 989     }
 990     ds_put_char(&s, '.');
 991
 992     lexer->error = ds_steal_cstr(&s);
 993 }
 994
 995 char *
 996 lexer_steal_error(struct lexer *lexer)
 997 {
 998     char *error = lexer->error;
 999     lexer->error = NULL;
1000     return error;
1001 }