]> git.proxmox.com Git - ovs.git/blob - ovn/lib/lex.c
lex: Fix parsing of long tokens.
[ovs.git] / ovn / lib / lex.c
1 /*
2 * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include <ctype.h>
19 #include <errno.h>
20 #include <stdarg.h>
21 #include "openvswitch/dynamic-string.h"
22 #include "openvswitch/json.h"
23 #include "ovn/lex.h"
24 #include "packets.h"
25 #include "util.h"
26 \f
27 /* Returns a string that represents 'format'. */
28 const char *
29 lex_format_to_string(enum lex_format format)
30 {
31 switch (format) {
32 case LEX_F_DECIMAL:
33 return "decimal";
34 case LEX_F_HEXADECIMAL:
35 return "hexadecimal";
36 case LEX_F_IPV4:
37 return "IPv4";
38 case LEX_F_IPV6:
39 return "IPv6";
40 case LEX_F_ETHERNET:
41 return "Ethernet";
42 default:
43 abort();
44 }
45 }
46 \f
47 /* Initializes 'token'. */
48 void
49 lex_token_init(struct lex_token *token)
50 {
51 token->type = LEX_T_END;
52 token->s = NULL;
53 }
54
55 /* Frees memory owned by 'token'. */
56 void
57 lex_token_destroy(struct lex_token *token)
58 {
59 if (token->s != token->buffer) {
60 free(token->s);
61 }
62 token->s = NULL;
63 }
64
65 /* Exchanges 'a' and 'b'. */
66 void
67 lex_token_swap(struct lex_token *a, struct lex_token *b)
68 {
69 struct lex_token tmp = *a;
70 *a = *b;
71 *b = tmp;
72
73 /* Before swap, if 's' was pointed to 'buffer', its value shall be changed
74 * to point to the 'buffer' with the copied value. */
75 if (a->s == b->buffer) {
76 a->s = a->buffer;
77 }
78 if (b->s == a->buffer) {
79 b->s = b->buffer;
80 }
81 }
82
83 /* The string 's' need not be null-terminated at 'length'. */
84 void
85 lex_token_strcpy(struct lex_token *token, const char *s, size_t length)
86 {
87 lex_token_destroy(token);
88 token->s = (length + 1 <= sizeof token->buffer
89 ? token->buffer
90 : xmalloc(length + 1));
91 memcpy(token->s, s, length);
92 token->s[length] = '\0';
93 }
94
95 void
96 lex_token_strset(struct lex_token *token, char *s)
97 {
98 lex_token_destroy(token);
99 token->s = s;
100 }
101
102 void
103 lex_token_vsprintf(struct lex_token *token, const char *format, va_list args)
104 {
105 lex_token_destroy(token);
106
107 va_list args2;
108 va_copy(args2, args);
109 token->s = (vsnprintf(token->buffer, sizeof token->buffer, format, args)
110 < sizeof token->buffer
111 ? token->buffer
112 : xvasprintf(format, args2));
113 va_end(args2);
114 }
115 \f
116 /* lex_token_format(). */
117
118 static size_t
119 lex_token_n_zeros(enum lex_format format)
120 {
121 switch (format) {
122 case LEX_F_DECIMAL: return offsetof(union mf_subvalue, integer);
123 case LEX_F_HEXADECIMAL: return 0;
124 case LEX_F_IPV4: return offsetof(union mf_subvalue, ipv4);
125 case LEX_F_IPV6: return offsetof(union mf_subvalue, ipv6);
126 case LEX_F_ETHERNET: return offsetof(union mf_subvalue, mac);
127 default: OVS_NOT_REACHED();
128 }
129 }
130
131 /* Returns the effective format for 'token', that is, the format in which it
132 * should actually be printed. This is ordinarily the same as 'token->format',
133 * but it's always possible that someone sets up a token with a format that
134 * won't work for a value, e.g. 'token->value' is wider than 32 bits but the
135 * format is LEX_F_IPV4. (The lexer itself won't do that; this is an attempt
136 * to avoid confusion in the future.) */
137 static enum lex_format
138 lex_token_get_format(const struct lex_token *token)
139 {
140 size_t n_zeros = lex_token_n_zeros(token->format);
141 return (is_all_zeros(&token->value, n_zeros)
142 && (token->type != LEX_T_MASKED_INTEGER
143 || is_all_zeros(&token->mask, n_zeros))
144 ? token->format
145 : LEX_F_HEXADECIMAL);
146 }
147
148 static void
149 lex_token_format_value(const union mf_subvalue *value,
150 enum lex_format format, struct ds *s)
151 {
152 switch (format) {
153 case LEX_F_DECIMAL:
154 ds_put_format(s, "%"PRIu64, ntohll(value->integer));
155 break;
156
157 case LEX_F_HEXADECIMAL:
158 mf_format_subvalue(value, s);
159 break;
160
161 case LEX_F_IPV4:
162 ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
163 break;
164
165 case LEX_F_IPV6:
166 ipv6_format_addr(&value->ipv6, s);
167 break;
168
169 case LEX_F_ETHERNET:
170 ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
171 break;
172
173 default:
174 OVS_NOT_REACHED();
175 }
176
177 }
178
179 static void
180 lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
181 {
182 enum lex_format format = lex_token_get_format(token);
183
184 lex_token_format_value(&token->value, format, s);
185 ds_put_char(s, '/');
186
187 const union mf_subvalue *mask = &token->mask;
188 if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
189 ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
190 } else if (token->format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
191 ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
192 } else {
193 lex_token_format_value(&token->mask, format, s);
194 }
195 }
196
197 /* Appends a string representation of 'token' to 's', in a format that can be
198 * losslessly parsed back by the lexer. (LEX_T_END and LEX_T_ERROR can't be
199 * parsed back.) */
200 void
201 lex_token_format(const struct lex_token *token, struct ds *s)
202 {
203 switch (token->type) {
204 case LEX_T_END:
205 ds_put_cstr(s, "$");
206 break;
207
208 case LEX_T_ID:
209 ds_put_cstr(s, token->s);
210 break;
211
212 case LEX_T_ERROR:
213 ds_put_cstr(s, "error(");
214 json_string_escape(token->s, s);
215 ds_put_char(s, ')');
216 break;
217
218 case LEX_T_STRING:
219 json_string_escape(token->s, s);
220 break;
221
222 case LEX_T_INTEGER:
223 lex_token_format_value(&token->value, lex_token_get_format(token), s);
224 break;
225
226 case LEX_T_MASKED_INTEGER:
227 lex_token_format_masked_integer(token, s);
228 break;
229
230 case LEX_T_MACRO:
231 ds_put_format(s, "$%s", token->s);
232 break;
233
234 case LEX_T_LPAREN:
235 ds_put_cstr(s, "(");
236 break;
237 case LEX_T_RPAREN:
238 ds_put_cstr(s, ")");
239 break;
240 case LEX_T_LCURLY:
241 ds_put_cstr(s, "{");
242 break;
243 case LEX_T_RCURLY:
244 ds_put_cstr(s, "}");
245 break;
246 case LEX_T_LSQUARE:
247 ds_put_cstr(s, "[");
248 break;
249 case LEX_T_RSQUARE:
250 ds_put_cstr(s, "]");
251 break;
252 case LEX_T_EQ:
253 ds_put_cstr(s, "==");
254 break;
255 case LEX_T_NE:
256 ds_put_cstr(s, "!=");
257 break;
258 case LEX_T_LT:
259 ds_put_cstr(s, "<");
260 break;
261 case LEX_T_LE:
262 ds_put_cstr(s, "<=");
263 break;
264 case LEX_T_GT:
265 ds_put_cstr(s, ">");
266 break;
267 case LEX_T_GE:
268 ds_put_cstr(s, ">=");
269 break;
270 case LEX_T_LOG_NOT:
271 ds_put_cstr(s, "!");
272 break;
273 case LEX_T_LOG_AND:
274 ds_put_cstr(s, "&&");
275 break;
276 case LEX_T_LOG_OR:
277 ds_put_cstr(s, "||");
278 break;
279 case LEX_T_ELLIPSIS:
280 ds_put_cstr(s, "..");
281 break;
282 case LEX_T_COMMA:
283 ds_put_cstr(s, ",");
284 break;
285 case LEX_T_SEMICOLON:
286 ds_put_cstr(s, ";");
287 break;
288 case LEX_T_EQUALS:
289 ds_put_cstr(s, "=");
290 break;
291 case LEX_T_EXCHANGE:
292 ds_put_cstr(s, "<->");
293 break;
294 case LEX_T_DECREMENT:
295 ds_put_cstr(s, "--");
296 break;
297 case LEX_T_COLON:
298 ds_put_char(s, ':');
299 break;
300 default:
301 OVS_NOT_REACHED();
302 }
303
304 }
305 \f
306 /* lex_token_parse(). */
307
308 static void OVS_PRINTF_FORMAT(2, 3)
309 lex_error(struct lex_token *token, const char *message, ...)
310 {
311 ovs_assert(!token->s);
312 token->type = LEX_T_ERROR;
313
314 va_list args;
315 va_start(args, message);
316 lex_token_vsprintf(token, message, args);
317 va_end(args);
318 }
319
320 static void
321 lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
322 {
323 const char *in = start + (len - 1);
324 uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
325
326 for (int i = 0; i < len; i++) {
327 int hexit = hexit_value(in[-i]);
328 if (hexit < 0) {
329 lex_error(token, "Invalid syntax in hexadecimal constant.");
330 return;
331 }
332 if (hexit && i / 2 >= sizeof token->value.u8) {
333 lex_error(token, "Hexadecimal constant requires more than "
334 "%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
335 return;
336 }
337 out[-(i / 2)] |= i % 2 ? hexit << 4 : hexit;
338 }
339 token->format = LEX_F_HEXADECIMAL;
340 }
341
342 static const char *
343 lex_parse_integer__(const char *p, struct lex_token *token)
344 {
345 lex_token_init(token);
346 token->type = LEX_T_INTEGER;
347 memset(&token->value, 0, sizeof token->value);
348
349 /* Find the extent of an "integer" token, which can be in decimal or
350 * hexadecimal, or an Ethernet address or IPv4 or IPv6 address, as 'start'
351 * through 'end'.
352 *
353 * Special cases we handle here are:
354 *
355 * - The ellipsis token "..", used as e.g. 123..456. A doubled dot
356 * is never valid syntax as part of an "integer", so we stop if
357 * we encounter two dots in a row.
358 *
359 * - Syntax like 1.2.3.4:1234 to indicate an IPv4 address followed by a
360 * port number should be considered three tokens: 1.2.3.4 : 1234.
361 * The obvious approach is to allow just dots or just colons within a
362 * given integer, but that would disallow IPv4-mapped IPv6 addresses,
363 * e.g. ::ffff:192.0.2.128. However, even in those addresses, a
364 * colon never follows a dot, so we stop if we encounter a colon
365 * after a dot.
366 *
367 * (There is no corresponding way to parse an IPv6 address followed
368 * by a port number: ::1:2:3:4:1234 is unavoidably ambiguous.)
369 */
370 const char *start = p;
371 const char *end = start;
372 bool saw_dot = false;
373 while (isalnum((unsigned char) *end)
374 || (*end == ':' && !saw_dot)
375 || (*end == '.' && end[1] != '.')) {
376 if (*end == '.') {
377 saw_dot = true;
378 }
379 end++;
380 }
381 size_t len = end - start;
382
383 int n;
384 struct eth_addr mac;
385
386 if (!len) {
387 lex_error(token, "Integer constant expected.");
388 } else if (len == 17
389 && ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
390 ETH_ADDR_SCAN_ARGS(mac), &n)
391 && n == len) {
392 token->value.mac = mac;
393 token->format = LEX_F_ETHERNET;
394 } else if (start + strspn(start, "0123456789") == end) {
395 if (p[0] == '0' && len > 1) {
396 lex_error(token, "Decimal constants must not have leading zeros.");
397 } else {
398 unsigned long long int integer;
399 char *tail;
400
401 errno = 0;
402 integer = strtoull(p, &tail, 10);
403 if (tail != end || errno == ERANGE) {
404 lex_error(token, "Decimal constants must be less than 2**64.");
405 } else {
406 token->value.integer = htonll(integer);
407 token->format = LEX_F_DECIMAL;
408 }
409 }
410 } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
411 if (len > 2) {
412 lex_parse_hex_integer(start + 2, len - 2, token);
413 } else {
414 lex_error(token, "Hex digits expected following 0%c.", p[1]);
415 }
416 } else if (len < INET6_ADDRSTRLEN) {
417 char copy[INET6_ADDRSTRLEN];
418 memcpy(copy, p, len);
419 copy[len] = '\0';
420
421 if (ip_parse(copy, &token->value.ipv4)) {
422 token->format = LEX_F_IPV4;
423 } else if (ipv6_parse(copy, &token->value.ipv6)) {
424 token->format = LEX_F_IPV6;
425 } else {
426 lex_error(token, "Invalid numeric constant.");
427 }
428 } else {
429 lex_error(token, "Invalid numeric constant.");
430 }
431
432 ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
433 return end;
434 }
435
436 static const char *
437 lex_parse_mask(const char *p, struct lex_token *token)
438 {
439 struct lex_token mask;
440
441 /* Parse just past the '/' as a second integer. Handle errors. */
442 p = lex_parse_integer__(p + 1, &mask);
443 if (mask.type == LEX_T_ERROR) {
444 lex_token_swap(&mask, token);
445 lex_token_destroy(&mask);
446 return p;
447 }
448 ovs_assert(mask.type == LEX_T_INTEGER);
449
450 /* Now convert the value and mask into a masked integer token.
451 * We have a few special cases. */
452 token->type = LEX_T_MASKED_INTEGER;
453 memset(&token->mask, 0, sizeof token->mask);
454 uint32_t prefix_bits = ntohll(mask.value.integer);
455 if (token->format == mask.format) {
456 /* Same format value and mask is always OK. */
457 token->mask = mask.value;
458 } else if (token->format == LEX_F_IPV4
459 && mask.format == LEX_F_DECIMAL
460 && prefix_bits <= 32) {
461 /* IPv4 address with decimal mask is a CIDR prefix. */
462 token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
463 } else if (token->format == LEX_F_IPV6
464 && mask.format == LEX_F_DECIMAL
465 && prefix_bits <= 128) {
466 /* IPv6 address with decimal mask is a CIDR prefix. */
467 token->mask.ipv6 = ipv6_create_mask(prefix_bits);
468 } else if (token->format == LEX_F_DECIMAL
469 && mask.format == LEX_F_HEXADECIMAL
470 && token->value.integer == 0) {
471 /* Special case for e.g. 0/0x1234. */
472 token->format = LEX_F_HEXADECIMAL;
473 token->mask = mask.value;
474 } else {
475 lex_error(token, "Value and mask have incompatible formats.");
476 return p;
477 }
478
479 /* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
480 * mask. */
481 for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
482 ovs_be32 v = token->value.be32[i];
483 ovs_be32 m = token->mask.be32[i];
484
485 if (v & ~m) {
486 lex_error(token, "Value contains unmasked 1-bits.");
487 break;
488 }
489 }
490
491 /* Done! */
492 lex_token_destroy(&mask);
493 return p;
494 }
495
496 static const char *
497 lex_parse_integer(const char *p, struct lex_token *token)
498 {
499 p = lex_parse_integer__(p, token);
500 if (token->type == LEX_T_INTEGER && *p == '/') {
501 p = lex_parse_mask(p, token);
502 }
503 return p;
504 }
505
506 static const char *
507 lex_parse_string(const char *p, struct lex_token *token)
508 {
509 const char *start = ++p;
510 char * s = NULL;
511 for (;;) {
512 switch (*p) {
513 case '\0':
514 lex_error(token, "Input ends inside quoted string.");
515 return p;
516
517 case '"':
518 token->type = (json_string_unescape(start, p - start, &s)
519 ? LEX_T_STRING : LEX_T_ERROR);
520 lex_token_strset(token, s);
521 return p + 1;
522
523 case '\\':
524 p++;
525 if (*p) {
526 p++;
527 }
528 break;
529
530 default:
531 p++;
532 break;
533 }
534 }
535 }
536
/* Returns true if 'c' may be the first character of an identifier: an ASCII
 * letter, an underscore, or a dot. */
static bool
lex_is_id1(unsigned char c)
{
    switch (c) {
    case '_':
    case '.':
        return true;

    default:
        if (c >= 'A' && c <= 'Z') {
            return true;
        }
        return c >= 'a' && c <= 'z';
    }
}
543
/* Returns true if 'c' may appear in an identifier after its first character:
 * anything lex_is_id1() accepts, or an ASCII digit. */
static bool
lex_is_idn(unsigned char c)
{
    if (c >= '0' && c <= '9') {
        return true;
    }
    return lex_is_id1(c);
}
549
550 static const char *
551 lex_parse_id(const char *p, enum lex_type type, struct lex_token *token)
552 {
553 const char *start = p;
554
555 do {
556 p++;
557 } while (lex_is_idn(*p));
558
559 token->type = type;
560 lex_token_strcpy(token, start, p - start);
561 return p;
562 }
563
564 static const char *
565 lex_parse_addr_set(const char *p, struct lex_token *token)
566 {
567 p++;
568 if (!lex_is_id1(*p)) {
569 lex_error(token, "`$' must be followed by a valid identifier.");
570 return p;
571 }
572
573 return lex_parse_id(p, LEX_T_MACRO, token);
574 }
575
576 /* Initializes 'token' and parses the first token from the beginning of
577 * null-terminated string 'p' into 'token'. Stores a pointer to the start of
578 * the token (after skipping white space and comments, if any) into '*startp'.
579 * Returns the character position at which to begin parsing the next token. */
580 const char *
581 lex_token_parse(struct lex_token *token, const char *p, const char **startp)
582 {
583 lex_token_init(token);
584
585 next:
586 *startp = p;
587 switch (*p) {
588 case '\0':
589 token->type = LEX_T_END;
590 return p;
591
592 case ' ': case '\t': case '\n': case '\r': case '\v': case '\f':
593 p++;
594 goto next;
595
596 case '/':
597 p++;
598 if (*p == '/') {
599 do {
600 p++;
601 } while (*p != '\0' && *p != '\n');
602 goto next;
603 } else if (*p == '*') {
604 p++;
605 for (;;) {
606 if (*p == '*' && p[1] == '/') {
607 p += 2;
608 goto next;
609 } else if (*p == '\0' || *p == '\n') {
610 lex_error(token, "`/*' without matching `*/'.");
611 return p;
612 } else {
613 p++;
614 }
615 }
616 goto next;
617 } else {
618 lex_error(token,
619 "`/' is only valid as part of `//' or `/*'.");
620 }
621 break;
622
623 case '(':
624 token->type = LEX_T_LPAREN;
625 p++;
626 break;
627
628 case ')':
629 token->type = LEX_T_RPAREN;
630 p++;
631 break;
632
633 case '{':
634 token->type = LEX_T_LCURLY;
635 p++;
636 break;
637
638 case '}':
639 token->type = LEX_T_RCURLY;
640 p++;
641 break;
642
643 case '[':
644 token->type = LEX_T_LSQUARE;
645 p++;
646 break;
647
648 case ']':
649 token->type = LEX_T_RSQUARE;
650 p++;
651 break;
652
653 case '=':
654 p++;
655 if (*p == '=') {
656 token->type = LEX_T_EQ;
657 p++;
658 } else {
659 token->type = LEX_T_EQUALS;
660 }
661 break;
662
663 case '!':
664 p++;
665 if (*p == '=') {
666 token->type = LEX_T_NE;
667 p++;
668 } else {
669 token->type = LEX_T_LOG_NOT;
670 }
671 break;
672
673 case '&':
674 p++;
675 if (*p == '&') {
676 token->type = LEX_T_LOG_AND;
677 p++;
678 } else {
679 lex_error(token, "`&' is only valid as part of `&&'.");
680 }
681 break;
682
683 case '|':
684 p++;
685 if (*p == '|') {
686 token->type = LEX_T_LOG_OR;
687 p++;
688 } else {
689 lex_error(token, "`|' is only valid as part of `||'.");
690 }
691 break;
692
693 case '<':
694 p++;
695 if (*p == '=') {
696 token->type = LEX_T_LE;
697 p++;
698 } else if (*p == '-' && p[1] == '>') {
699 token->type = LEX_T_EXCHANGE;
700 p += 2;
701 } else {
702 token->type = LEX_T_LT;
703 }
704 break;
705
706 case '>':
707 p++;
708 if (*p == '=') {
709 token->type = LEX_T_GE;
710 p++;
711 } else {
712 token->type = LEX_T_GT;
713 }
714 break;
715
716 case '.':
717 p++;
718 if (*p == '.') {
719 token->type = LEX_T_ELLIPSIS;
720 p++;
721 } else {
722 lex_error(token, "`.' is only valid as part of `..' or a number.");
723 }
724 break;
725
726 case ',':
727 p++;
728 token->type = LEX_T_COMMA;
729 break;
730
731 case ';':
732 p++;
733 token->type = LEX_T_SEMICOLON;
734 break;
735
736 case '-':
737 p++;
738 if (*p == '-') {
739 token->type = LEX_T_DECREMENT;
740 p++;
741 } else {
742 lex_error(token, "`-' is only valid as part of `--'.");
743 }
744 break;
745
746 case '$':
747 p = lex_parse_addr_set(p, token);
748 break;
749
750 case ':':
751 if (p[1] != ':') {
752 token->type = LEX_T_COLON;
753 p++;
754 break;
755 }
756 /* IPv6 address beginning with "::". */
757 /* fall through */
758 case '0': case '1': case '2': case '3': case '4':
759 case '5': case '6': case '7': case '8': case '9':
760 p = lex_parse_integer(p, token);
761 break;
762
763 case '"':
764 p = lex_parse_string(p, token);
765 break;
766
767 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
768 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
769 /* We need to distinguish an Ethernet address or IPv6 address from an
770 * identifier. Fortunately, Ethernet addresses and IPv6 addresses that
771 * are ambiguous based on the first character, always start with hex
772 * digits followed by a colon, but identifiers never do. */
773 p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
774 ? lex_parse_integer(p, token)
775 : lex_parse_id(p, LEX_T_ID, token));
776 break;
777
778 default:
779 if (lex_is_id1(*p)) {
780 p = lex_parse_id(p, LEX_T_ID, token);
781 } else {
782 if (isprint((unsigned char) *p)) {
783 lex_error(token, "Invalid character `%c' in input.", *p);
784 } else {
785 lex_error(token, "Invalid byte 0x%d in input.", *p);
786 }
787 p++;
788 }
789 break;
790 }
791
792 return p;
793 }
794 \f
795 /* Initializes 'lexer' for parsing 'input'.
796 *
797 * While the lexer is in use, 'input' must remain available, but the caller
798 * otherwise retains ownership of 'input'.
799 *
800 * The caller must call lexer_get() to obtain the first token. */
801 void
802 lexer_init(struct lexer *lexer, const char *input)
803 {
804 lexer->input = input;
805 lexer->start = NULL;
806 lex_token_init(&lexer->token);
807 lexer->error = NULL;
808 }
809
810 /* Frees storage associated with 'lexer'. */
811 void
812 lexer_destroy(struct lexer *lexer)
813 {
814 lex_token_destroy(&lexer->token);
815 free(lexer->error);
816 }
817
818 /* Obtains the next token from 'lexer' into 'lexer->token', and returns the
819 * token's type. The caller may examine 'lexer->token' directly to obtain full
820 * information about the token. */
821 enum lex_type
822 lexer_get(struct lexer *lexer)
823 {
824 lex_token_destroy(&lexer->token);
825 lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
826 return lexer->token.type;
827 }
828
829 /* Returns the type of the next token that will be fetched by lexer_get(),
830 * without advancing 'lexer->token' to that token. */
831 enum lex_type
832 lexer_lookahead(const struct lexer *lexer)
833 {
834 struct lex_token next;
835 enum lex_type type;
836 const char *start;
837
838 lex_token_parse(&next, lexer->input, &start);
839 type = next.type;
840 lex_token_destroy(&next);
841 return type;
842 }
843
844 /* If 'lexer''s current token has the given 'type', advances 'lexer' to the
845 * next token and returns true. Otherwise returns false. */
846 bool
847 lexer_match(struct lexer *lexer, enum lex_type type)
848 {
849 if (lexer->token.type == type) {
850 lexer_get(lexer);
851 return true;
852 } else {
853 return false;
854 }
855 }
856
857 bool
858 lexer_force_match(struct lexer *lexer, enum lex_type t)
859 {
860 if (t == LEX_T_END) {
861 return lexer_force_end(lexer);
862 } else if (lexer_match(lexer, t)) {
863 return true;
864 } else {
865 struct lex_token token = { .type = t };
866 struct ds s = DS_EMPTY_INITIALIZER;
867 lex_token_format(&token, &s);
868
869 lexer_syntax_error(lexer, "expecting `%s'", ds_cstr(&s));
870
871 ds_destroy(&s);
872
873 return false;
874 }
875 }
876
877 /* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
878 * to the next token and returns true. Otherwise returns false. */
879 bool
880 lexer_match_id(struct lexer *lexer, const char *id)
881 {
882 if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
883 lexer_get(lexer);
884 return true;
885 } else {
886 return false;
887 }
888 }
889
890 bool
891 lexer_is_int(const struct lexer *lexer)
892 {
893 return (lexer->token.type == LEX_T_INTEGER
894 && lexer->token.format == LEX_F_DECIMAL
895 && ntohll(lexer->token.value.integer) <= INT_MAX);
896 }
897
898 bool
899 lexer_get_int(struct lexer *lexer, int *value)
900 {
901 if (lexer_is_int(lexer)) {
902 *value = ntohll(lexer->token.value.integer);
903 lexer_get(lexer);
904 return true;
905 } else {
906 *value = 0;
907 return false;
908 }
909 }
910
/* Like lexer_get_int(), but additionally records a syntax error in 'lexer'
 * when the current token is not a suitable integer.  Returns what
 * lexer_get_int() returned. */
bool
lexer_force_int(struct lexer *lexer, int *value)
{
    if (lexer_get_int(lexer, value)) {
        return true;
    }

    lexer_syntax_error(lexer, "expecting small integer");
    return false;
}
920
921 bool
922 lexer_force_end(struct lexer *lexer)
923 {
924 if (lexer->token.type == LEX_T_END) {
925 return true;
926 } else {
927 lexer_syntax_error(lexer, "expecting end of input");
928 return false;
929 }
930 }
931
932 static bool
933 lexer_error_handle_common(struct lexer *lexer)
934 {
935 if (lexer->error) {
936 /* Already have an error, suppress this one since the cascade seems
937 * unlikely to be useful. */
938 return true;
939 } else if (lexer->token.type == LEX_T_ERROR) {
940 /* The lexer signaled an error. Nothing at a higher level accepts an
941 * error token, so we'll inevitably end up here with some meaningless
942 * parse error. Report the lexical error instead. */
943 lexer->error = xstrdup(lexer->token.s);
944 return true;
945 } else {
946 return false;
947 }
948 }
949
950 void OVS_PRINTF_FORMAT(2, 3)
951 lexer_error(struct lexer *lexer, const char *message, ...)
952 {
953 if (lexer_error_handle_common(lexer)) {
954 return;
955 }
956
957 va_list args;
958 va_start(args, message);
959 lexer->error = xvasprintf(message, args);
960 va_end(args);
961 }
962
963 void OVS_PRINTF_FORMAT(2, 3)
964 lexer_syntax_error(struct lexer *lexer, const char *message, ...)
965 {
966 if (lexer_error_handle_common(lexer)) {
967 return;
968 }
969
970 struct ds s;
971
972 ds_init(&s);
973 ds_put_cstr(&s, "Syntax error");
974 if (lexer->token.type == LEX_T_END) {
975 ds_put_cstr(&s, " at end of input");
976 } else if (lexer->start) {
977 ds_put_format(&s, " at `%.*s'",
978 (int) (lexer->input - lexer->start),
979 lexer->start);
980 }
981
982 if (message) {
983 ds_put_char(&s, ' ');
984
985 va_list args;
986 va_start(args, message);
987 ds_put_format_valist(&s, message, args);
988 va_end(args);
989 }
990 ds_put_char(&s, '.');
991
992 lexer->error = ds_steal_cstr(&s);
993 }
994
995 char *
996 lexer_steal_error(struct lexer *lexer)
997 {
998 char *error = lexer->error;
999 lexer->error = NULL;
1000 return error;
1001 }