]> git.proxmox.com Git - ovs.git/blame - ovn/lib/lex.c
appveyor: Renew SSL link.
[ovs.git] / ovn / lib / lex.c
CommitLineData
10b1662b
BP
1/*
2 * Copyright (c) 2015 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18#include "lex.h"
19#include <ctype.h>
20#include <errno.h>
21#include <stdarg.h>
22#include "dynamic-string.h"
23#include "json.h"
24#include "util.h"
363b5330
BP
25\f
26/* Returns a string that represents 'format'. */
27const char *
28lex_format_to_string(enum lex_format format)
29{
30 switch (format) {
31 case LEX_F_DECIMAL:
32 return "decimal";
33 case LEX_F_HEXADECIMAL:
34 return "hexadecimal";
35 case LEX_F_IPV4:
36 return "IPv4";
37 case LEX_F_IPV6:
38 return "IPv6";
39 case LEX_F_ETHERNET:
40 return "Ethernet";
41 default:
42 abort();
43 }
44}
45\f
10b1662b
BP
46/* Initializes 'token'. */
47void
48lex_token_init(struct lex_token *token)
49{
50 token->type = LEX_T_END;
51 token->s = NULL;
52}
53
54/* Frees memory owned by 'token'. */
55void
56lex_token_destroy(struct lex_token *token)
57{
58 free(token->s);
59}
60
61/* Exchanges 'a' and 'b'. */
62void
63lex_token_swap(struct lex_token *a, struct lex_token *b)
64{
65 struct lex_token tmp = *a;
66 *a = *b;
67 *b = tmp;
68}
69\f
70/* lex_token_format(). */
71
72static size_t
73lex_token_n_zeros(enum lex_format format)
74{
75 switch (format) {
76 case LEX_F_DECIMAL: return offsetof(union mf_subvalue, integer);
77 case LEX_F_HEXADECIMAL: return 0;
78 case LEX_F_IPV4: return offsetof(union mf_subvalue, ipv4);
79 case LEX_F_IPV6: return offsetof(union mf_subvalue, ipv6);
80 case LEX_F_ETHERNET: return offsetof(union mf_subvalue, mac);
81 default: OVS_NOT_REACHED();
82 }
83}
84
85/* Returns the effective format for 'token', that is, the format in which it
86 * should actually be printed. This is ordinarily the same as 'token->format',
87 * but it's always possible that someone sets up a token with a format that
88 * won't work for a value, e.g. 'token->value' is wider than 32 bits but the
89 * format is LEX_F_IPV4. (The lexer itself won't do that; this is an attempt
90 * to avoid confusion in the future.) */
91static enum lex_format
92lex_token_get_format(const struct lex_token *token)
93{
94 size_t n_zeros = lex_token_n_zeros(token->format);
95 return (is_all_zeros(&token->value, n_zeros)
96 && (token->type != LEX_T_MASKED_INTEGER
97 || is_all_zeros(&token->mask, n_zeros))
98 ? token->format
99 : LEX_F_HEXADECIMAL);
100}
101
102static void
103lex_token_format_value(const union mf_subvalue *value,
104 enum lex_format format, struct ds *s)
105{
106 switch (format) {
107 case LEX_F_DECIMAL:
108 ds_put_format(s, "%"PRIu64, ntohll(value->integer));
109 break;
110
111 case LEX_F_HEXADECIMAL:
112 mf_format_subvalue(value, s);
113 break;
114
115 case LEX_F_IPV4:
116 ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
117 break;
118
119 case LEX_F_IPV6:
120 print_ipv6_addr(s, &value->ipv6);
121 break;
122
123 case LEX_F_ETHERNET:
124 ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
125 break;
126
127 default:
128 OVS_NOT_REACHED();
129 }
130
131}
132
133static void
134lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
135{
136 enum lex_format format = lex_token_get_format(token);
137
138 lex_token_format_value(&token->value, format, s);
139 ds_put_char(s, '/');
140
141 const union mf_subvalue *mask = &token->mask;
142 if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
143 ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
144 } else if (token->format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
145 ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
146 } else {
147 lex_token_format_value(&token->mask, format, s);
148 }
149}
150
10b1662b
BP
151/* Appends a string representation of 'token' to 's', in a format that can be
152 * losslessly parsed back by the lexer. (LEX_T_END and LEX_T_ERROR can't be
153 * parsed back.) */
154void
3d611299 155lex_token_format(const struct lex_token *token, struct ds *s)
10b1662b
BP
156{
157 switch (token->type) {
158 case LEX_T_END:
159 ds_put_cstr(s, "$");
160 break;
161
162 case LEX_T_ID:
163 ds_put_cstr(s, token->s);
164 break;
165
166 case LEX_T_ERROR:
167 ds_put_cstr(s, "error(");
3b626771 168 json_string_escape(token->s, s);
10b1662b
BP
169 ds_put_char(s, ')');
170 break;
171
172 case LEX_T_STRING:
3b626771 173 json_string_escape(token->s, s);
10b1662b
BP
174 break;
175
176 case LEX_T_INTEGER:
177 lex_token_format_value(&token->value, lex_token_get_format(token), s);
178 break;
179
180 case LEX_T_MASKED_INTEGER:
181 lex_token_format_masked_integer(token, s);
182 break;
183
184 case LEX_T_LPAREN:
185 ds_put_cstr(s, "(");
186 break;
187 case LEX_T_RPAREN:
188 ds_put_cstr(s, ")");
189 break;
190 case LEX_T_LCURLY:
191 ds_put_cstr(s, "{");
192 break;
193 case LEX_T_RCURLY:
194 ds_put_cstr(s, "}");
195 break;
196 case LEX_T_LSQUARE:
197 ds_put_cstr(s, "[");
198 break;
199 case LEX_T_RSQUARE:
200 ds_put_cstr(s, "]");
201 break;
202 case LEX_T_EQ:
203 ds_put_cstr(s, "==");
204 break;
205 case LEX_T_NE:
206 ds_put_cstr(s, "!=");
207 break;
208 case LEX_T_LT:
209 ds_put_cstr(s, "<");
210 break;
211 case LEX_T_LE:
212 ds_put_cstr(s, "<=");
213 break;
214 case LEX_T_GT:
215 ds_put_cstr(s, ">");
216 break;
217 case LEX_T_GE:
218 ds_put_cstr(s, ">=");
219 break;
220 case LEX_T_LOG_NOT:
221 ds_put_cstr(s, "!");
222 break;
223 case LEX_T_LOG_AND:
224 ds_put_cstr(s, "&&");
225 break;
226 case LEX_T_LOG_OR:
227 ds_put_cstr(s, "||");
228 break;
229 case LEX_T_ELLIPSIS:
230 ds_put_cstr(s, "..");
231 break;
232 case LEX_T_COMMA:
233 ds_put_cstr(s, ",");
234 break;
235 case LEX_T_SEMICOLON:
236 ds_put_cstr(s, ";");
237 break;
238 case LEX_T_EQUALS:
239 ds_put_cstr(s, "=");
240 break;
241 default:
242 OVS_NOT_REACHED();
243 }
244
245}
246\f
247/* lex_token_parse(). */
248
249static void OVS_PRINTF_FORMAT(2, 3)
250lex_error(struct lex_token *token, const char *message, ...)
251{
252 ovs_assert(!token->s);
253 token->type = LEX_T_ERROR;
254
255 va_list args;
256 va_start(args, message);
257 token->s = xvasprintf(message, args);
258 va_end(args);
259}
260
261static void
262lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
263{
264 const char *in = start + (len - 1);
265 uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
266
267 for (int i = 0; i < len; i++) {
268 int hexit = hexit_value(in[-i]);
269 if (hexit < 0) {
270 lex_error(token, "Invalid syntax in hexadecimal constant.");
271 return;
272 }
273 if (hexit && i / 2 >= sizeof token->value.u8) {
274 lex_error(token, "Hexadecimal constant requires more than "
275 "%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
276 return;
277 }
278 out[-(i / 2)] |= i % 2 ? hexit << 4 : hexit;
279 }
280 token->format = LEX_F_HEXADECIMAL;
281}
282
283static const char *
284lex_parse_integer__(const char *p, struct lex_token *token)
285{
286 lex_token_init(token);
287 token->type = LEX_T_INTEGER;
288 memset(&token->value, 0, sizeof token->value);
289 const char *start = p;
290 const char *end = start;
291 while (isalnum((unsigned char) *end) || *end == ':'
292 || (*end == '.' && end[1] != '.')) {
293 end++;
294 }
295 size_t len = end - start;
296
297 int n;
298 uint8_t mac[ETH_ADDR_LEN];
299
300 if (!len) {
301 lex_error(token, "Integer constant expected.");
302 } else if (len == 17
303 && ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
304 ETH_ADDR_SCAN_ARGS(mac), &n)
305 && n == len) {
306 memcpy(token->value.mac, mac, sizeof token->value.mac);
307 token->format = LEX_F_ETHERNET;
308 } else if (start + strspn(start, "0123456789") == end) {
309 if (p[0] == '0' && len > 1) {
310 lex_error(token, "Decimal constants must not have leading zeros.");
311 } else {
312 unsigned long long int integer;
313 char *tail;
314
315 errno = 0;
316 integer = strtoull(p, &tail, 10);
317 if (tail != end || errno == ERANGE) {
318 lex_error(token, "Decimal constants must be less than 2**64.");
319 } else {
320 token->value.integer = htonll(integer);
321 token->format = LEX_F_DECIMAL;
322 }
323 }
324 } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
325 if (len > 2) {
326 lex_parse_hex_integer(start + 2, len - 2, token);
327 } else {
328 lex_error(token, "Hex digits expected following 0%c.", p[1]);
329 }
330 } else if (len < INET6_ADDRSTRLEN) {
331 char copy[INET6_ADDRSTRLEN];
332 memcpy(copy, p, len);
333 copy[len] = '\0';
334
335 struct in_addr ipv4;
336 struct in6_addr ipv6;
337 if (inet_pton(AF_INET, copy, &ipv4) == 1) {
338 token->value.ipv4 = ipv4.s_addr;
339 token->format = LEX_F_IPV4;
340 } else if (inet_pton(AF_INET6, copy, &ipv6) == 1) {
341 token->value.ipv6 = ipv6;
342 token->format = LEX_F_IPV6;
343 } else {
344 lex_error(token, "Invalid numeric constant.");
345 }
346 } else {
347 lex_error(token, "Invalid numeric constant.");
348 }
349
350 ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
351 return end;
352}
353
354static const char *
355lex_parse_mask(const char *p, struct lex_token *token)
356{
357 struct lex_token mask;
358
359 /* Parse just past the '/' as a second integer. Handle errors. */
360 p = lex_parse_integer__(p + 1, &mask);
361 if (mask.type == LEX_T_ERROR) {
362 lex_token_swap(&mask, token);
363 lex_token_destroy(&mask);
364 return p;
365 }
366 ovs_assert(mask.type == LEX_T_INTEGER);
367
368 /* Now convert the value and mask into a masked integer token.
369 * We have a few special cases. */
370 token->type = LEX_T_MASKED_INTEGER;
371 memset(&token->mask, 0, sizeof token->mask);
372 uint32_t prefix_bits = ntohll(mask.value.integer);
373 if (token->format == mask.format) {
374 /* Same format value and mask is always OK. */
375 token->mask = mask.value;
376 } else if (token->format == LEX_F_IPV4
377 && mask.format == LEX_F_DECIMAL
378 && prefix_bits <= 32) {
379 /* IPv4 address with decimal mask is a CIDR prefix. */
380 token->mask.integer = htonll(ntohl(be32_prefix_mask(prefix_bits)));
381 } else if (token->format == LEX_F_IPV6
382 && mask.format == LEX_F_DECIMAL
383 && prefix_bits <= 128) {
384 /* IPv6 address with decimal mask is a CIDR prefix. */
385 token->mask.ipv6 = ipv6_create_mask(prefix_bits);
386 } else if (token->format == LEX_F_DECIMAL
387 && mask.format == LEX_F_HEXADECIMAL
388 && token->value.integer == 0) {
389 /* Special case for e.g. 0/0x1234. */
390 token->format = LEX_F_HEXADECIMAL;
391 token->mask = mask.value;
392 } else {
393 lex_error(token, "Value and mask have incompatible formats.");
394 return p;
395 }
396
397 /* Check invariant that a 1-bit in the value corresponds to a 1-bit in the
398 * mask. */
399 for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
400 ovs_be32 v = token->value.be32[i];
401 ovs_be32 m = token->mask.be32[i];
402
403 if (v & ~m) {
404 lex_error(token, "Value contains unmasked 1-bits.");
405 break;
406 }
407 }
408
409 /* Done! */
410 lex_token_destroy(&mask);
411 return p;
412}
413
414static const char *
415lex_parse_integer(const char *p, struct lex_token *token)
416{
417 p = lex_parse_integer__(p, token);
418 if (token->type == LEX_T_INTEGER && *p == '/') {
419 p = lex_parse_mask(p, token);
420 }
421 return p;
422}
423
424static const char *
425lex_parse_string(const char *p, struct lex_token *token)
426{
427 const char *start = ++p;
428 for (;;) {
429 switch (*p) {
430 case '\0':
431 lex_error(token, "Input ends inside quoted string.");
432 return p;
433
434 case '"':
435 token->type = (json_string_unescape(start, p - start, &token->s)
436 ? LEX_T_STRING : LEX_T_ERROR);
437 return p + 1;
438
439 case '\\':
440 p++;
441 if (*p) {
442 p++;
443 }
444 break;
445
446 default:
447 p++;
448 break;
449 }
450 }
451}
452
/* Returns true if 'c' may begin an identifier: an ASCII letter, an
 * underscore, or a period. */
static bool
lex_is_id1(unsigned char c)
{
    if (c == '_' || c == '.') {
        return true;
    }
    if ('a' <= c && c <= 'z') {
        return true;
    }
    return 'A' <= c && c <= 'Z';
}
459
/* Returns true if 'c' may appear in an identifier after its first character:
 * anything that may begin an identifier, or an ASCII digit. */
static bool
lex_is_idn(unsigned char c)
{
    if (lex_is_id1(c)) {
        return true;
    }
    return '0' <= c && c <= '9';
}
465
466static const char *
467lex_parse_id(const char *p, struct lex_token *token)
468{
469 const char *start = p;
470
471 do {
472 p++;
473 } while (lex_is_idn(*p));
474
475 token->type = LEX_T_ID;
476 token->s = xmemdup0(start, p - start);
477 return p;
478}
479
480/* Initializes 'token' and parses the first token from the beginning of
481 * null-terminated string 'p' into 'token'. Stores a pointer to the start of
482 * the token (after skipping white space and comments, if any) into '*startp'.
483 * Returns the character position at which to begin parsing the next token. */
484const char *
485lex_token_parse(struct lex_token *token, const char *p, const char **startp)
486{
487 lex_token_init(token);
488
489next:
490 *startp = p;
491 switch (*p) {
492 case '\0':
493 token->type = LEX_T_END;
494 return p;
495
496 case ' ': case '\t': case '\n': case '\r':
497 p++;
498 goto next;
499
500 case '/':
501 p++;
502 if (*p == '/') {
503 do {
504 p++;
505 } while (*p != '\0' && *p != '\n');
506 goto next;
507 } else if (*p == '*') {
508 p++;
509 for (;;) {
510 if (*p == '*' && p[1] == '/') {
511 p += 2;
512 goto next;
513 } else if (*p == '\0' || *p == '\n') {
514 lex_error(token, "`/*' without matching `*/'.");
515 return p;
516 } else {
517 p++;
518 }
519 }
520 goto next;
521 } else {
522 lex_error(token,
523 "`/' is only valid as part of `//' or `/*'.");
524 }
525 break;
526
527 case '(':
528 token->type = LEX_T_LPAREN;
529 p++;
530 break;
531
532 case ')':
533 token->type = LEX_T_RPAREN;
534 p++;
535 break;
536
537 case '{':
538 token->type = LEX_T_LCURLY;
539 p++;
540 break;
541
542 case '}':
543 token->type = LEX_T_RCURLY;
544 p++;
545 break;
546
547 case '[':
548 token->type = LEX_T_LSQUARE;
549 p++;
550 break;
551
552 case ']':
553 token->type = LEX_T_RSQUARE;
554 p++;
555 break;
556
557 case '=':
558 p++;
559 if (*p == '=') {
560 token->type = LEX_T_EQ;
561 p++;
562 } else {
563 token->type = LEX_T_EQUALS;
564 }
565 break;
566
567 case '!':
568 p++;
569 if (*p == '=') {
570 token->type = LEX_T_NE;
571 p++;
572 } else {
573 token->type = LEX_T_LOG_NOT;
574 }
575 break;
576
577 case '&':
578 p++;
579 if (*p == '&') {
580 token->type = LEX_T_LOG_AND;
581 p++;
582 } else {
583 lex_error(token, "`&' is only valid as part of `&&'.");
584 }
585 break;
586
587 case '|':
588 p++;
589 if (*p == '|') {
590 token->type = LEX_T_LOG_OR;
591 p++;
592 } else {
593 lex_error(token, "`|' is only valid as part of `||'.");
594 }
595 break;
596
597 case '<':
598 p++;
599 if (*p == '=') {
600 token->type = LEX_T_LE;
601 p++;
602 } else {
603 token->type = LEX_T_LT;
604 }
605 break;
606
607 case '>':
608 p++;
609 if (*p == '=') {
610 token->type = LEX_T_GE;
611 p++;
612 } else {
613 token->type = LEX_T_GT;
614 }
615 break;
616
617 case '.':
618 p++;
619 if (*p == '.') {
620 token->type = LEX_T_ELLIPSIS;
621 p++;
622 } else {
623 lex_error(token, "`.' is only valid as part of `..' or a number.");
624 }
625 break;
626
627 case ',':
628 p++;
629 token->type = LEX_T_COMMA;
630 break;
631
632 case ';':
633 p++;
634 token->type = LEX_T_SEMICOLON;
635 break;
636
637 case '0': case '1': case '2': case '3': case '4':
638 case '5': case '6': case '7': case '8': case '9':
639 case ':':
640 p = lex_parse_integer(p, token);
641 break;
642
643 case '"':
644 p = lex_parse_string(p, token);
645 break;
646
647 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
648 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
649 /* We need to distinguish an Ethernet address or IPv6 address from an
650 * identifier. Fortunately, Ethernet addresses and IPv6 addresses that
651 * are ambiguous based on the first character, always start with hex
652 * digits followed by a colon, but identifiers never do. */
653 p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
654 ? lex_parse_integer(p, token)
655 : lex_parse_id(p, token));
656 break;
657
658 default:
659 if (lex_is_id1(*p)) {
660 p = lex_parse_id(p, token);
661 } else {
662 if (isprint((unsigned char) *p)) {
663 lex_error(token, "Invalid character `%c' in input.", *p);
664 } else {
665 lex_error(token, "Invalid byte 0x%d in input.", *p);
666 }
667 p++;
668 }
669 break;
670 }
671
672 return p;
673}
674\f
675/* Initializes 'lexer' for parsing 'input'.
676 *
677 * While the lexer is in use, 'input' must remain available, but the caller
678 * otherwise retains ownership of 'input'.
679 *
680 * The caller must call lexer_get() to obtain the first token. */
681void
682lexer_init(struct lexer *lexer, const char *input)
683{
684 lexer->input = input;
685 lexer->start = NULL;
686 lex_token_init(&lexer->token);
687}
688
689/* Frees storage associated with 'lexer'. */
690void
691lexer_destroy(struct lexer *lexer)
692{
693 lex_token_destroy(&lexer->token);
694}
695
696/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
697 * token's type. The caller may examine 'lexer->token' directly to obtain full
698 * information about the token. */
699enum lex_type
700lexer_get(struct lexer *lexer)
701{
702 lex_token_destroy(&lexer->token);
703 lexer->input = lex_token_parse(&lexer->token, lexer->input, &lexer->start);
704 return lexer->token.type;
705}
706
27912fdb
BP
707/* Returns the type of the next token that will be fetched by lexer_get(),
708 * without advancing 'lexer->token' to that token. */
709enum lex_type
710lexer_lookahead(const struct lexer *lexer)
711{
712 struct lex_token next;
713 enum lex_type type;
714 const char *start;
715
716 lex_token_parse(&next, lexer->input, &start);
717 type = next.type;
718 lex_token_destroy(&next);
719 return type;
720}
721
10b1662b
BP
722/* If 'lexer''s current token has the given 'type', advances 'lexer' to the
723 * next token and returns true. Otherwise returns false. */
724bool
725lexer_match(struct lexer *lexer, enum lex_type type)
726{
727 if (lexer->token.type == type) {
728 lexer_get(lexer);
729 return true;
730 } else {
731 return false;
732 }
733}
27912fdb
BP
734
735/* If 'lexer''s current token is the identifier given in 'id', advances 'lexer'
736 * to the next token and returns true. Otherwise returns false. */
737bool
738lexer_match_id(struct lexer *lexer, const char *id)
739{
740 if (lexer->token.type == LEX_T_ID && !strcmp(lexer->token.s, id)) {
741 lexer_get(lexer);
742 return true;
743 } else {
744 return false;
745 }
746}