]> git.proxmox.com Git - ovs.git/blob - ovn/northd/ovn-northd.c
ovn util: Refactor dhcp_opts_map to make it generic
[ovs.git] / ovn / northd / ovn-northd.c
1 /*
2 * Licensed under the Apache License, Version 2.0 (the "License");
3 * you may not use this file except in compliance with the License.
4 * You may obtain a copy of the License at:
5 *
6 * http://www.apache.org/licenses/LICENSE-2.0
7 *
8 * Unless required by applicable law or agreed to in writing, software
9 * distributed under the License is distributed on an "AS IS" BASIS,
10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 * See the License for the specific language governing permissions and
12 * limitations under the License.
13 */
14
15 #include <config.h>
16
17 #include <getopt.h>
18 #include <stdlib.h>
19 #include <stdio.h>
20
21 #include "bitmap.h"
22 #include "command-line.h"
23 #include "daemon.h"
24 #include "dirs.h"
25 #include "openvswitch/dynamic-string.h"
26 #include "fatal-signal.h"
27 #include "hash.h"
28 #include "openvswitch/hmap.h"
29 #include "openvswitch/json.h"
30 #include "ovn/lex.h"
31 #include "ovn/lib/chassis-index.h"
32 #include "ovn/lib/logical-fields.h"
33 #include "ovn/lib/ovn-l7.h"
34 #include "ovn/lib/ovn-nb-idl.h"
35 #include "ovn/lib/ovn-sb-idl.h"
36 #include "ovn/lib/ovn-util.h"
37 #include "ovn/actions.h"
38 #include "packets.h"
39 #include "poll-loop.h"
40 #include "smap.h"
41 #include "sset.h"
42 #include "stream.h"
43 #include "stream-ssl.h"
44 #include "unixctl.h"
45 #include "util.h"
46 #include "uuid.h"
47 #include "openvswitch/vlog.h"
48
/* Declares "ovn_northd" as this file's logging module.
 * NOTE(review): presumably this is what the VLOG_*() calls below log under;
 * the macro itself comes from openvswitch/vlog.h, which is not shown here. */
VLOG_DEFINE_THIS_MODULE(ovn_northd);

/* Forward declaration of a unixctl callback; its definition is not in this
 * part of the file. */
static unixctl_cb_func ovn_northd_exit;
52
/* Database handles handed to the functions that synchronize the northbound
 * and southbound databases: one IDL and one transaction per database. */
struct northd_context {
    struct ovsdb_idl *ovnnb_idl;        /* Northbound database IDL. */
    struct ovsdb_idl *ovnsb_idl;        /* Southbound database IDL. */
    struct ovsdb_idl_txn *ovnnb_txn;    /* Northbound transaction. */
    struct ovsdb_idl_txn *ovnsb_txn;    /* Southbound transaction. */
};
59
/* NOTE(review): presumably the database targets selected with --ovnnb-db and
 * --ovnsb-db; they are neither set nor read in this part of the file. */
static const char *ovnnb_db;
static const char *ovnsb_db;

/* A MAC address handed out by ipam is MAC_ADDR_PREFIX ORed with a suffix in
 * the range 1...MAC_ADDR_SPACE - 1 (see ipam_get_unused_mac()). */
#define MAC_ADDR_PREFIX 0x0A0000000000ULL
#define MAC_ADDR_SPACE 0xffffff

/* MAC address management (macam) table of "struct eth_addr"s, that holds the
 * MAC addresses allocated by the OVN ipam module. */
static struct hmap macam = HMAP_INITIALIZER(&macam);

/* NOTE(review): not referenced in this part of the file. */
#define MAX_OVN_TAGS 4096
71 \f
72 /* Pipeline stages. */
73
/* The two pipelines in an OVN logical flow table.
 *
 * The numeric values matter: OVN_STAGE_BUILD() stores the pipeline in bit 8
 * of an "enum ovn_stage" and ovn_stage_get_pipeline() reads it back from
 * there. */
enum ovn_pipeline {
    P_IN,                       /* Ingress pipeline. */
    P_OUT                       /* Egress pipeline. */
};
79
/* The two purposes for which ovn-northd uses OVN logical datapaths.
 *
 * The numeric values matter: OVN_STAGE_BUILD() shifts the datapath type left
 * by 9 bits to form part of an "enum ovn_stage". */
enum ovn_datapath_type {
    DP_SWITCH,                  /* OVN logical switch. */
    DP_ROUTER                   /* OVN logical router. */
};
85
/* Returns an "enum ovn_stage" built from the arguments.
 *
 * The encoding puts TABLE in the low 8 bits, PIPELINE in bit 8, and DP_TYPE
 * in the bits from 9 upward.
 *
 * (It's better to use ovn_stage_build() for type-safety reasons, but inline
 * functions can't be used in enums or switch cases.) */
#define OVN_STAGE_BUILD(DP_TYPE, PIPELINE, TABLE) \
    (((DP_TYPE) << 9) | ((PIPELINE) << 8) | (TABLE))
92
/* A stage within an OVN logical switch or router.
 *
 * An "enum ovn_stage" indicates whether the stage is part of a logical switch
 * or router, whether the stage is part of the ingress or egress pipeline, and
 * the table within that pipeline.  The first three components are combined to
 * form the stage's full name, e.g. S_SWITCH_IN_PORT_SEC_L2,
 * S_ROUTER_OUT_DELIVERY.
 *
 * PIPELINE_STAGES is an X-macro list: each user defines PIPELINE_STAGE() to
 * pick out the columns it needs, expands PIPELINE_STAGES, and then undefines
 * PIPELINE_STAGE() again.  The last column is the name that
 * ovn_stage_to_str() returns for the stage. */
enum ovn_stage {
#define PIPELINE_STAGES \
    /* Logical switch ingress stages. */ \
    PIPELINE_STAGE(SWITCH, IN, PORT_SEC_L2, 0, "ls_in_port_sec_l2") \
    PIPELINE_STAGE(SWITCH, IN, PORT_SEC_IP, 1, "ls_in_port_sec_ip") \
    PIPELINE_STAGE(SWITCH, IN, PORT_SEC_ND, 2, "ls_in_port_sec_nd") \
    PIPELINE_STAGE(SWITCH, IN, PRE_ACL, 3, "ls_in_pre_acl") \
    PIPELINE_STAGE(SWITCH, IN, PRE_LB, 4, "ls_in_pre_lb") \
    PIPELINE_STAGE(SWITCH, IN, PRE_STATEFUL, 5, "ls_in_pre_stateful") \
    PIPELINE_STAGE(SWITCH, IN, ACL, 6, "ls_in_acl") \
    PIPELINE_STAGE(SWITCH, IN, QOS_MARK, 7, "ls_in_qos_mark") \
    PIPELINE_STAGE(SWITCH, IN, LB, 8, "ls_in_lb") \
    PIPELINE_STAGE(SWITCH, IN, STATEFUL, 9, "ls_in_stateful") \
    PIPELINE_STAGE(SWITCH, IN, ARP_ND_RSP, 10, "ls_in_arp_rsp") \
    PIPELINE_STAGE(SWITCH, IN, DHCP_OPTIONS, 11, "ls_in_dhcp_options") \
    PIPELINE_STAGE(SWITCH, IN, DHCP_RESPONSE, 12, "ls_in_dhcp_response") \
    PIPELINE_STAGE(SWITCH, IN, DNS_LOOKUP, 13, "ls_in_dns_lookup") \
    PIPELINE_STAGE(SWITCH, IN, DNS_RESPONSE, 14, "ls_in_dns_response") \
    PIPELINE_STAGE(SWITCH, IN, L2_LKUP, 15, "ls_in_l2_lkup") \
 \
    /* Logical switch egress stages. */ \
    PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 0, "ls_out_pre_lb") \
    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 1, "ls_out_pre_acl") \
    PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful") \
    PIPELINE_STAGE(SWITCH, OUT, LB, 3, "ls_out_lb") \
    PIPELINE_STAGE(SWITCH, OUT, ACL, 4, "ls_out_acl") \
    PIPELINE_STAGE(SWITCH, OUT, QOS_MARK, 5, "ls_out_qos_mark") \
    PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 6, "ls_out_stateful") \
    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP, 7, "ls_out_port_sec_ip") \
    PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2, 8, "ls_out_port_sec_l2") \
 \
    /* Logical router ingress stages. */ \
    PIPELINE_STAGE(ROUTER, IN, ADMISSION, 0, "lr_in_admission") \
    PIPELINE_STAGE(ROUTER, IN, IP_INPUT, 1, "lr_in_ip_input") \
    PIPELINE_STAGE(ROUTER, IN, DEFRAG, 2, "lr_in_defrag") \
    PIPELINE_STAGE(ROUTER, IN, UNSNAT, 3, "lr_in_unsnat") \
    PIPELINE_STAGE(ROUTER, IN, DNAT, 4, "lr_in_dnat") \
    PIPELINE_STAGE(ROUTER, IN, IP_ROUTING, 5, "lr_in_ip_routing") \
    PIPELINE_STAGE(ROUTER, IN, ARP_RESOLVE, 6, "lr_in_arp_resolve") \
    PIPELINE_STAGE(ROUTER, IN, GW_REDIRECT, 7, "lr_in_gw_redirect") \
    PIPELINE_STAGE(ROUTER, IN, ARP_REQUEST, 8, "lr_in_arp_request") \
 \
    /* Logical router egress stages. */ \
    PIPELINE_STAGE(ROUTER, OUT, UNDNAT, 0, "lr_out_undnat") \
    PIPELINE_STAGE(ROUTER, OUT, SNAT, 1, "lr_out_snat") \
    PIPELINE_STAGE(ROUTER, OUT, EGR_LOOP, 2, "lr_out_egr_loop") \
    PIPELINE_STAGE(ROUTER, OUT, DELIVERY, 3, "lr_out_delivery")

    /* Turn each list entry into an enumerator named
     * S_<DP_TYPE>_<PIPELINE>_<STAGE> whose value encodes the datapath type,
     * pipeline and table number. */
#define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME) \
    S_##DP_TYPE##_##PIPELINE##_##STAGE \
        = OVN_STAGE_BUILD(DP_##DP_TYPE, P_##PIPELINE, TABLE),
    PIPELINE_STAGES
#undef PIPELINE_STAGE
};
154
/* Due to various hard-coded priorities needed to implement ACLs, the
 * northbound database supports a smaller range of ACL priorities than
 * are available to logical flows.  This value is added to an ACL
 * priority to determine the ACL's logical flow priority. */
#define OVN_ACL_PRI_OFFSET 1000
160
/* Register definitions specific to switches.
 *
 * Each value is a string that names a single bit of a register.
 * NOTE(review): presumably these strings are pasted into logical flow match
 * and action text; no use of them is visible in this part of the file. */
#define REGBIT_CONNTRACK_DEFRAG  "reg0[0]"
#define REGBIT_CONNTRACK_COMMIT  "reg0[1]"
#define REGBIT_CONNTRACK_NAT     "reg0[2]"
#define REGBIT_DHCP_OPTS_RESULT  "reg0[3]"
#define REGBIT_DNS_LOOKUP_RESULT "reg0[4]"

/* Register definitions for switches and routers. */
#define REGBIT_NAT_REDIRECT     "reg9[0]"
/* Indicate that this packet has been recirculated using egress
 * loopback.  This allows certain checks to be bypassed, such as a
 * logical router dropping packets whose source IP address equals
 * one of the logical router's own IP addresses. */
#define REGBIT_EGRESS_LOOPBACK  "reg9[1]"
175
176 /* Returns an "enum ovn_stage" built from the arguments. */
177 static enum ovn_stage
178 ovn_stage_build(enum ovn_datapath_type dp_type, enum ovn_pipeline pipeline,
179 uint8_t table)
180 {
181 return OVN_STAGE_BUILD(dp_type, pipeline, table);
182 }
183
184 /* Returns the pipeline to which 'stage' belongs. */
185 static enum ovn_pipeline
186 ovn_stage_get_pipeline(enum ovn_stage stage)
187 {
188 return (stage >> 8) & 1;
189 }
190
191 /* Returns the table to which 'stage' belongs. */
192 static uint8_t
193 ovn_stage_get_table(enum ovn_stage stage)
194 {
195 return stage & 0xff;
196 }
197
198 /* Returns a string name for 'stage'. */
199 static const char *
200 ovn_stage_to_str(enum ovn_stage stage)
201 {
202 switch (stage) {
203 #define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME) \
204 case S_##DP_TYPE##_##PIPELINE##_##STAGE: return NAME;
205 PIPELINE_STAGES
206 #undef PIPELINE_STAGE
207 default: return "<unknown>";
208 }
209 }
210
211 /* Returns the type of the datapath to which a flow with the given 'stage' may
212 * be added. */
213 static enum ovn_datapath_type
214 ovn_stage_to_datapath_type(enum ovn_stage stage)
215 {
216 switch (stage) {
217 #define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME) \
218 case S_##DP_TYPE##_##PIPELINE##_##STAGE: return DP_##DP_TYPE;
219 PIPELINE_STAGES
220 #undef PIPELINE_STAGE
221 default: OVS_NOT_REACHED();
222 }
223 }
224 \f
/* Prints the program's help message to stdout, then calls daemon_usage(),
 * vlog_usage() and stream_usage() (presumably so that those libraries can
 * describe their own options). */
static void
usage(void)
{
    /* The four %s conversions receive, in order, the program name (twice) and
     * the default northbound and southbound database targets. */
    printf("\
%s: OVN northbound management daemon\n\
usage: %s [OPTIONS]\n\
\n\
Options:\n\
--ovnnb-db=DATABASE connect to ovn-nb database at DATABASE\n\
(default: %s)\n\
--ovnsb-db=DATABASE connect to ovn-sb database at DATABASE\n\
(default: %s)\n\
-h, --help display this help message\n\
-o, --options list available options\n\
-V, --version display version information\n\
", program_name, program_name, default_nb_db(), default_sb_db());
    daemon_usage();
    vlog_usage();
    stream_usage("database", true, true, false);
}
245 \f
/* One tunnel ID that is in use; an element of an hmap that serves as a set of
 * tunnel IDs. */
struct tnlid_node {
    struct hmap_node hmap_node; /* In the set, hashed on 'tnlid'. */
    uint32_t tnlid;             /* The tunnel ID. */
};
250
251 static void
252 destroy_tnlids(struct hmap *tnlids)
253 {
254 struct tnlid_node *node;
255 HMAP_FOR_EACH_POP (node, hmap_node, tnlids) {
256 free(node);
257 }
258 hmap_destroy(tnlids);
259 }
260
261 static void
262 add_tnlid(struct hmap *set, uint32_t tnlid)
263 {
264 struct tnlid_node *node = xmalloc(sizeof *node);
265 hmap_insert(set, &node->hmap_node, hash_int(tnlid, 0));
266 node->tnlid = tnlid;
267 }
268
269 static bool
270 tnlid_in_use(const struct hmap *set, uint32_t tnlid)
271 {
272 const struct tnlid_node *node;
273 HMAP_FOR_EACH_IN_BUCKET (node, hmap_node, hash_int(tnlid, 0), set) {
274 if (node->tnlid == tnlid) {
275 return true;
276 }
277 }
278 return false;
279 }
280
281 static uint32_t
282 allocate_tnlid(struct hmap *set, const char *name, uint32_t max,
283 uint32_t *hint)
284 {
285 for (uint32_t tnlid = *hint + 1; tnlid != *hint;
286 tnlid = tnlid + 1 <= max ? tnlid + 1 : 1) {
287 if (!tnlid_in_use(set, tnlid)) {
288 add_tnlid(set, tnlid);
289 *hint = tnlid;
290 return tnlid;
291 }
292 }
293
294 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
295 VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
296 return 0;
297 }
298 \f
/* One (chassis, queue ID) pair that is in use; an element of an hmap that
 * serves as a set of such pairs. */
struct ovn_chassis_qdisc_queues {
    struct hmap_node key_node;  /* In the set, hashed on 'chassis_uuid'. */
    uint32_t queue_id;          /* Queue ID in use on the chassis. */
    struct uuid chassis_uuid;   /* UUID of the chassis's database row. */
};
304
305 static void
306 destroy_chassis_queues(struct hmap *set)
307 {
308 struct ovn_chassis_qdisc_queues *node;
309 HMAP_FOR_EACH_POP (node, key_node, set) {
310 free(node);
311 }
312 hmap_destroy(set);
313 }
314
315 static void
316 add_chassis_queue(struct hmap *set, struct uuid *chassis_uuid,
317 uint32_t queue_id)
318 {
319 struct ovn_chassis_qdisc_queues *node = xmalloc(sizeof *node);
320 node->queue_id = queue_id;
321 memcpy(&node->chassis_uuid, chassis_uuid, sizeof node->chassis_uuid);
322 hmap_insert(set, &node->key_node, uuid_hash(chassis_uuid));
323 }
324
325 static bool
326 chassis_queueid_in_use(const struct hmap *set, struct uuid *chassis_uuid,
327 uint32_t queue_id)
328 {
329 const struct ovn_chassis_qdisc_queues *node;
330 HMAP_FOR_EACH_WITH_HASH (node, key_node, uuid_hash(chassis_uuid), set) {
331 if (uuid_equals(chassis_uuid, &node->chassis_uuid)
332 && node->queue_id == queue_id) {
333 return true;
334 }
335 }
336 return false;
337 }
338
339 static uint32_t
340 allocate_chassis_queueid(struct hmap *set, struct sbrec_chassis *chassis)
341 {
342 for (uint32_t queue_id = QDISC_MIN_QUEUE_ID + 1;
343 queue_id <= QDISC_MAX_QUEUE_ID;
344 queue_id++) {
345 if (!chassis_queueid_in_use(set, &chassis->header_.uuid, queue_id)) {
346 add_chassis_queue(set, &chassis->header_.uuid, queue_id);
347 return queue_id;
348 }
349 }
350
351 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
352 VLOG_WARN_RL(&rl, "all %s queue ids exhausted", chassis->name);
353 return 0;
354 }
355
356 static void
357 free_chassis_queueid(struct hmap *set, struct sbrec_chassis *chassis,
358 uint32_t queue_id)
359 {
360 struct ovn_chassis_qdisc_queues *node;
361 HMAP_FOR_EACH_WITH_HASH (node, key_node,
362 uuid_hash(&chassis->header_.uuid),
363 set) {
364 if (uuid_equals(&chassis->header_.uuid, &node->chassis_uuid)
365 && node->queue_id == queue_id) {
366 hmap_remove(set, &node->key_node);
367 break;
368 }
369 }
370 }
371
/* Returns true if 'opts' has a "qos_max_rate" or a "qos_burst" key. */
static inline bool
port_has_qos_params(const struct smap *opts)
{
    if (smap_get(opts, "qos_max_rate")) {
        return true;
    }
    return smap_get(opts, "qos_burst") != NULL;
}
378 \f
379
/* Per-logical-switch IP address management state, filled in by
 * init_ipam_info_for_datapath(). */
struct ipam_info {
    uint32_t start_ipv4;        /* First managed address, in host byte order:
                                 * the subnet's network address plus 1. */
    size_t total_ipv4s;         /* Number of bits in 'allocated_ipv4s'. */
    unsigned long *allocated_ipv4s; /* A bitmap of allocated IPv4s; bit 'i'
                                     * stands for 'start_ipv4' + i.  NULL if
                                     * the switch has no usable "subnet". */
    bool ipv6_prefix_set;       /* True if 'ipv6_prefix' was parsed
                                 * successfully. */
    struct in6_addr ipv6_prefix;
};
387
/* A logical datapath: a northbound logical switch or logical router joined
 * with its southbound Datapath_Binding row, when both exist.
 *
 * The 'key' comes from nbs->header_.uuid or nbr->header_.uuid or
 * sb->external_ids:logical-switch (or external_ids:logical-router). */
struct ovn_datapath {
    struct hmap_node key_node;  /* Index on 'key'. */
    struct uuid key;            /* (nbs/nbr)->header_.uuid. */

    const struct nbrec_logical_switch *nbs;  /* May be NULL. */
    const struct nbrec_logical_router *nbr;  /* May be NULL. */
    const struct sbrec_datapath_binding *sb; /* May be NULL. */

    struct ovs_list list;       /* In list of similar records. */

    /* Logical switch data. */
    struct ovn_port **router_ports; /* Freed by ovn_datapath_destroy(). */
    size_t n_router_ports;

    struct hmap port_tnlids;    /* Set of "struct tnlid_node"s: the port
                                 * tunnel keys in use on this datapath. */
    uint32_t port_key_hint;     /* Hint for ovn_port_allocate_key(): the key
                                 * it allocated last, or 0. */

    bool has_unknown;           /* NOTE(review): not used in this part of the
                                 * file. */

    /* IPAM data. */
    struct ipam_info *ipam_info; /* NULL if IPAM is not configured. */

    /* OVN northd only needs to know about the logical router gateway port for
     * NAT on a distributed router.  This "distributed gateway port" is
     * populated only when there is a "redirect-chassis" specified for one of
     * the ports on the logical router.  Otherwise this will be NULL. */
    struct ovn_port *l3dgw_port;
    /* The "derived" OVN port representing the instance of l3dgw_port on
     * the "redirect-chassis". */
    struct ovn_port *l3redirect_port;
    struct ovn_port *localnet_port; /* NOTE(review): not used in this part of
                                     * the file. */
};
422
/* One entry of the 'macam' table of MAC addresses known to ipam. */
struct macam_node {
    struct hmap_node hmap_node; /* In 'macam', hashed on the address as a
                                 * 64-bit integer. */
    struct eth_addr mac_addr;   /* Allocated MAC address. */
};
427
428 static void
429 cleanup_macam(struct hmap *macam)
430 {
431 struct macam_node *node;
432 HMAP_FOR_EACH_POP (node, hmap_node, macam) {
433 free(node);
434 }
435 }
436
437 static struct ovn_datapath *
438 ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
439 const struct nbrec_logical_switch *nbs,
440 const struct nbrec_logical_router *nbr,
441 const struct sbrec_datapath_binding *sb)
442 {
443 struct ovn_datapath *od = xzalloc(sizeof *od);
444 od->key = *key;
445 od->sb = sb;
446 od->nbs = nbs;
447 od->nbr = nbr;
448 hmap_init(&od->port_tnlids);
449 od->port_key_hint = 0;
450 hmap_insert(datapaths, &od->key_node, uuid_hash(&od->key));
451 return od;
452 }
453
454 static void
455 ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
456 {
457 if (od) {
458 /* Don't remove od->list. It is used within build_datapaths() as a
459 * private list and once we've exited that function it is not safe to
460 * use it. */
461 hmap_remove(datapaths, &od->key_node);
462 destroy_tnlids(&od->port_tnlids);
463 if (od->ipam_info) {
464 bitmap_free(od->ipam_info->allocated_ipv4s);
465 free(od->ipam_info);
466 }
467 free(od->router_ports);
468 free(od);
469 }
470 }
471
472 /* Returns 'od''s datapath type. */
473 static enum ovn_datapath_type
474 ovn_datapath_get_type(const struct ovn_datapath *od)
475 {
476 return od->nbs ? DP_SWITCH : DP_ROUTER;
477 }
478
479 static struct ovn_datapath *
480 ovn_datapath_find(struct hmap *datapaths, const struct uuid *uuid)
481 {
482 struct ovn_datapath *od;
483
484 HMAP_FOR_EACH_WITH_HASH (od, key_node, uuid_hash(uuid), datapaths) {
485 if (uuid_equals(uuid, &od->key)) {
486 return od;
487 }
488 }
489 return NULL;
490 }
491
492 static struct ovn_datapath *
493 ovn_datapath_from_sbrec(struct hmap *datapaths,
494 const struct sbrec_datapath_binding *sb)
495 {
496 struct uuid key;
497
498 if (!smap_get_uuid(&sb->external_ids, "logical-switch", &key) &&
499 !smap_get_uuid(&sb->external_ids, "logical-router", &key)) {
500 return NULL;
501 }
502 return ovn_datapath_find(datapaths, &key);
503 }
504
505 static bool
506 lrouter_is_enabled(const struct nbrec_logical_router *lrouter)
507 {
508 return !lrouter->enabled || *lrouter->enabled;
509 }
510
/* Sets up 'od->ipam_info' from the "ipv6_prefix", "subnet" and "exclude_ips"
 * keys of the logical switch's other_config.
 *
 * Does nothing if 'od' is not a logical switch.  'od->ipam_info' is allocated
 * only if "ipv6_prefix" is present or "subnet" is a valid CIDR block other
 * than a /32.  Problems with "subnet" or "exclude_ips" are reported with
 * rate-limited warnings. */
static void
init_ipam_info_for_datapath(struct ovn_datapath *od)
{
    if (!od->nbs) {
        return;
    }

    const char *subnet_str = smap_get(&od->nbs->other_config, "subnet");
    const char *ipv6_prefix = smap_get(&od->nbs->other_config, "ipv6_prefix");

    if (ipv6_prefix) {
        /* 'ipv6_prefix_set' stays false if the prefix does not parse. */
        od->ipam_info = xzalloc(sizeof *od->ipam_info);
        od->ipam_info->ipv6_prefix_set = ipv6_parse(
            ipv6_prefix, &od->ipam_info->ipv6_prefix);
    }

    if (!subnet_str) {
        return;
    }

    /* Reject anything that is not a CIDR block, and also a /32, which has no
     * room for host addresses. */
    ovs_be32 subnet, mask;
    char *error = ip_parse_masked(subnet_str, &subnet, &mask);
    if (error || mask == OVS_BE32_MAX || !ip_is_cidr(mask)) {
        static struct vlog_rate_limit rl
            = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "bad 'subnet' %s", subnet_str);
        free(error);
        return;
    }

    if (!od->ipam_info) {
        od->ipam_info = xzalloc(sizeof *od->ipam_info);
    }
    /* The bitmap starts at the address after the network address and has one
     * bit for each possible value of the host part except zero. */
    od->ipam_info->start_ipv4 = ntohl(subnet) + 1;
    od->ipam_info->total_ipv4s = ~ntohl(mask);
    od->ipam_info->allocated_ipv4s =
        bitmap_allocate(od->ipam_info->total_ipv4s);

    /* Mark first IP as taken */
    bitmap_set1(od->ipam_info->allocated_ipv4s, 0);

    /* Check if there are any reserved IPs (list) to be excluded from IPAM */
    const char *exclude_ip_list = smap_get(&od->nbs->other_config,
                                           "exclude_ips");
    if (!exclude_ip_list) {
        return;
    }

    struct lexer lexer;
    lexer_init(&lexer, exclude_ip_list);
    /* exclude_ip_list could be in the format -
    *  "10.0.0.4 10.0.0.10 10.0.0.20..10.0.0.50 10.0.0.100..10.0.0.110".
    */
    lexer_get(&lexer);
    while (lexer.token.type != LEX_T_END) {
        if (lexer.token.type != LEX_T_INTEGER) {
            lexer_syntax_error(&lexer, "expecting address");
            break;
        }
        uint32_t start = ntohl(lexer.token.value.ipv4);
        lexer_get(&lexer);

        /* 'end' is exclusive: a lone address excludes just itself, while
         * "A..B" excludes A through B inclusive. */
        uint32_t end = start + 1;
        if (lexer_match(&lexer, LEX_T_ELLIPSIS)) {
            if (lexer.token.type != LEX_T_INTEGER) {
                lexer_syntax_error(&lexer, "expecting address range");
                break;
            }
            end = ntohl(lexer.token.value.ipv4) + 1;
            lexer_get(&lexer);
        }

        /* Clamp start...end to fit the subnet. */
        start = MAX(od->ipam_info->start_ipv4, start);
        end = MIN(od->ipam_info->start_ipv4 + od->ipam_info->total_ipv4s, end);
        if (end > start) {
            bitmap_set_multiple(od->ipam_info->allocated_ipv4s,
                                start - od->ipam_info->start_ipv4,
                                end - start, 1);
        } else {
            /* Nothing of the range is left inside the subnet.  This records
             * an error but does not stop the loop. */
            lexer_error(&lexer, "excluded addresses not in subnet");
        }
    }
    if (lexer.error) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "logical switch "UUID_FMT": bad exclude_ips (%s)",
                     UUID_ARGS(&od->key), lexer.error);
    }
    lexer_destroy(&lexer);
}
601
602 static void
603 ovn_datapath_update_external_ids(struct ovn_datapath *od)
604 {
605 /* Get the logical-switch or logical-router UUID to set in
606 * external-ids. */
607 char uuid_s[UUID_LEN + 1];
608 sprintf(uuid_s, UUID_FMT, UUID_ARGS(&od->key));
609 const char *key = od->nbs ? "logical-switch" : "logical-router";
610
611 /* Get names to set in external-ids. */
612 const char *name = od->nbs ? od->nbs->name : od->nbr->name;
613 const char *name2 = (od->nbs
614 ? smap_get(&od->nbs->external_ids,
615 "neutron:network_name")
616 : smap_get(&od->nbr->external_ids,
617 "neutron:router_name"));
618
619 /* Set external-ids. */
620 struct smap ids = SMAP_INITIALIZER(&ids);
621 smap_add(&ids, key, uuid_s);
622 smap_add(&ids, "name", name);
623 if (name2 && name2[0]) {
624 smap_add(&ids, "name2", name2);
625 }
626 sbrec_datapath_binding_set_external_ids(od->sb, &ids);
627 smap_destroy(&ids);
628 }
629
/* Matches southbound Datapath_Binding rows against northbound logical
 * switches and (enabled) logical routers.
 *
 * Initializes 'datapaths' with one "struct ovn_datapath" per distinct
 * datapath and initializes the three lists so that each datapath is on
 * exactly one of them:
 *
 *   - 'sb_only': datapaths with a southbound row but no northbound row.
 *   - 'nb_only': datapaths with a northbound row but no southbound row.
 *   - 'both': datapaths with both.
 *
 * Along the way, deletes southbound rows that lack a usable external-ids key
 * or that duplicate another row's key, refreshes the external_ids of matched
 * southbound rows, and sets up IPAM state for every logical switch. */
static void
join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
               struct ovs_list *sb_only, struct ovs_list *nb_only,
               struct ovs_list *both)
{
    hmap_init(datapaths);
    ovs_list_init(sb_only);
    ovs_list_init(nb_only);
    ovs_list_init(both);

    /* Pass 1: every valid southbound row starts out as "southbound only". */
    const struct sbrec_datapath_binding *sb, *sb_next;
    SBREC_DATAPATH_BINDING_FOR_EACH_SAFE (sb, sb_next, ctx->ovnsb_idl) {
        struct uuid key;
        if (!smap_get_uuid(&sb->external_ids, "logical-switch", &key) &&
            !smap_get_uuid(&sb->external_ids, "logical-router", &key)) {
            ovsdb_idl_txn_add_comment(
                ctx->ovnsb_txn,
                "deleting Datapath_Binding "UUID_FMT" that lacks "
                "external-ids:logical-switch and "
                "external-ids:logical-router",
                UUID_ARGS(&sb->header_.uuid));
            sbrec_datapath_binding_delete(sb);
            continue;
        }

        if (ovn_datapath_find(datapaths, &key)) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
            VLOG_INFO_RL(
                &rl, "deleting Datapath_Binding "UUID_FMT" with "
                "duplicate external-ids:logical-switch/router "UUID_FMT,
                UUID_ARGS(&sb->header_.uuid), UUID_ARGS(&key));
            sbrec_datapath_binding_delete(sb);
            continue;
        }

        struct ovn_datapath *od = ovn_datapath_create(datapaths, &key,
                                                      NULL, NULL, sb);
        ovs_list_push_back(sb_only, &od->list);
    }

    /* Pass 2: logical switches.  A switch that matches a southbound row
     * moves that datapath from 'sb_only' to 'both'; any other switch gets a
     * new datapath on 'nb_only'. */
    const struct nbrec_logical_switch *nbs;
    NBREC_LOGICAL_SWITCH_FOR_EACH (nbs, ctx->ovnnb_idl) {
        struct ovn_datapath *od = ovn_datapath_find(datapaths,
                                                    &nbs->header_.uuid);
        if (od) {
            od->nbs = nbs;
            ovs_list_remove(&od->list);
            ovs_list_push_back(both, &od->list);
            ovn_datapath_update_external_ids(od);
        } else {
            od = ovn_datapath_create(datapaths, &nbs->header_.uuid,
                                     nbs, NULL, NULL);
            ovs_list_push_back(nb_only, &od->list);
        }

        init_ipam_info_for_datapath(od);
    }

    /* Pass 3: logical routers, handled like switches except that disabled
     * routers are skipped, so that a southbound row for a disabled router
     * stays on 'sb_only'. */
    const struct nbrec_logical_router *nbr;
    NBREC_LOGICAL_ROUTER_FOR_EACH (nbr, ctx->ovnnb_idl) {
        if (!lrouter_is_enabled(nbr)) {
            continue;
        }

        struct ovn_datapath *od = ovn_datapath_find(datapaths,
                                                    &nbr->header_.uuid);
        if (od) {
            if (!od->nbs) {
                od->nbr = nbr;
                ovs_list_remove(&od->list);
                ovs_list_push_back(both, &od->list);
                ovn_datapath_update_external_ids(od);
            } else {
                /* Can't happen! */
                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
                VLOG_WARN_RL(&rl,
                             "duplicate UUID "UUID_FMT" in OVN_Northbound",
                             UUID_ARGS(&nbr->header_.uuid));
                continue;
            }
        } else {
            od = ovn_datapath_create(datapaths, &nbr->header_.uuid,
                                     NULL, nbr, NULL);
            ovs_list_push_back(nb_only, &od->list);
        }
    }
}
717
/* Allocates a datapath tunnel key in the range 1...2**24 - 1 that is not in
 * 'dp_tnlids' and records it there.  Returns the key, or 0 if none is left.
 *
 * The allocation hint persists across calls. */
static uint32_t
ovn_datapath_allocate_key(struct hmap *dp_tnlids)
{
    static uint32_t hint;
    const uint32_t max_key = (1u << 24) - 1;

    return allocate_tnlid(dp_tnlids, "datapath", max_key, &hint);
}
724
725 /* Updates the southbound Datapath_Binding table so that it contains the
726 * logical switches and routers specified by the northbound database.
727 *
728 * Initializes 'datapaths' to contain a "struct ovn_datapath" for every logical
729 * switch and router. */
730 static void
731 build_datapaths(struct northd_context *ctx, struct hmap *datapaths)
732 {
733 struct ovs_list sb_only, nb_only, both;
734
735 join_datapaths(ctx, datapaths, &sb_only, &nb_only, &both);
736
737 if (!ovs_list_is_empty(&nb_only)) {
738 /* First index the in-use datapath tunnel IDs. */
739 struct hmap dp_tnlids = HMAP_INITIALIZER(&dp_tnlids);
740 struct ovn_datapath *od;
741 LIST_FOR_EACH (od, list, &both) {
742 add_tnlid(&dp_tnlids, od->sb->tunnel_key);
743 }
744
745 /* Add southbound record for each unmatched northbound record. */
746 LIST_FOR_EACH (od, list, &nb_only) {
747 uint16_t tunnel_key = ovn_datapath_allocate_key(&dp_tnlids);
748 if (!tunnel_key) {
749 break;
750 }
751
752 od->sb = sbrec_datapath_binding_insert(ctx->ovnsb_txn);
753 ovn_datapath_update_external_ids(od);
754 sbrec_datapath_binding_set_tunnel_key(od->sb, tunnel_key);
755 }
756 destroy_tnlids(&dp_tnlids);
757 }
758
759 /* Delete southbound records without northbound matches. */
760 struct ovn_datapath *od, *next;
761 LIST_FOR_EACH_SAFE (od, next, list, &sb_only) {
762 ovs_list_remove(&od->list);
763 sbrec_datapath_binding_delete(od->sb);
764 ovn_datapath_destroy(datapaths, od);
765 }
766 }
767 \f
/* A logical port: a northbound logical switch port or logical router port
 * joined with its southbound Port_Binding row, when both exist. */
struct ovn_port {
    struct hmap_node key_node;  /* Index on 'key'. */
    char *key;                  /* nbs->name, nbr->name, sb->logical_port. */
    char *json_key;             /* 'key', quoted for use in JSON. */

    const struct sbrec_port_binding *sb;         /* May be NULL. */

    /* Logical switch port data. */
    const struct nbrec_logical_switch_port *nbsp; /* May be NULL. */

    struct lport_addresses *lsp_addrs;  /* Logical switch port addresses. */
    unsigned int n_lsp_addrs;

    struct lport_addresses *ps_addrs;   /* Port security addresses. */
    unsigned int n_ps_addrs;

    /* Logical router port data. */
    const struct nbrec_logical_router_port *nbrp; /* May be NULL. */

    struct lport_addresses lrp_networks;

    bool derived;               /* Indicates whether this is an additional port
                                 * derived from nbsp or nbrp. */

    /* The port's peer:
     *
     *     - A switch port S of type "router" has a router port R as a peer,
     *       and R in turn has S as its peer.
     *
     *     - Two connected logical router ports have each other as peer. */
    struct ovn_port *peer;

    struct ovn_datapath *od;    /* Datapath that the port belongs to. */

    struct ovs_list list;       /* In list of similar records. */
};
804
805 static struct ovn_port *
806 ovn_port_create(struct hmap *ports, const char *key,
807 const struct nbrec_logical_switch_port *nbsp,
808 const struct nbrec_logical_router_port *nbrp,
809 const struct sbrec_port_binding *sb)
810 {
811 struct ovn_port *op = xzalloc(sizeof *op);
812
813 struct ds json_key = DS_EMPTY_INITIALIZER;
814 json_string_escape(key, &json_key);
815 op->json_key = ds_steal_cstr(&json_key);
816
817 op->key = xstrdup(key);
818 op->sb = sb;
819 op->nbsp = nbsp;
820 op->nbrp = nbrp;
821 op->derived = false;
822 hmap_insert(ports, &op->key_node, hash_string(op->key, 0));
823 return op;
824 }
825
826 static void
827 ovn_port_destroy(struct hmap *ports, struct ovn_port *port)
828 {
829 if (port) {
830 /* Don't remove port->list. It is used within build_ports() as a
831 * private list and once we've exited that function it is not safe to
832 * use it. */
833 hmap_remove(ports, &port->key_node);
834
835 for (int i = 0; i < port->n_lsp_addrs; i++) {
836 destroy_lport_addresses(&port->lsp_addrs[i]);
837 }
838 free(port->lsp_addrs);
839
840 for (int i = 0; i < port->n_ps_addrs; i++) {
841 destroy_lport_addresses(&port->ps_addrs[i]);
842 }
843 free(port->ps_addrs);
844
845 destroy_lport_addresses(&port->lrp_networks);
846 free(port->json_key);
847 free(port->key);
848 free(port);
849 }
850 }
851
852 static struct ovn_port *
853 ovn_port_find(struct hmap *ports, const char *name)
854 {
855 struct ovn_port *op;
856
857 HMAP_FOR_EACH_WITH_HASH (op, key_node, hash_string(name, 0), ports) {
858 if (!strcmp(op->key, name)) {
859 return op;
860 }
861 }
862 return NULL;
863 }
864
865 static uint32_t
866 ovn_port_allocate_key(struct ovn_datapath *od)
867 {
868 return allocate_tnlid(&od->port_tnlids, "port",
869 (1u << 15) - 1, &od->port_key_hint);
870 }
871
/* Returns 'port_name' prefixed with "cr-", as a newly allocated string that
 * the caller must free. */
static char *
chassis_redirect_name(const char *port_name)
{
    char *redirect_name = xasprintf("cr-%s", port_name);

    return redirect_name;
}
877
878 static bool
879 ipam_is_duplicate_mac(struct eth_addr *ea, uint64_t mac64, bool warn)
880 {
881 struct macam_node *macam_node;
882 HMAP_FOR_EACH_WITH_HASH (macam_node, hmap_node, hash_uint64(mac64),
883 &macam) {
884 if (eth_addr_equals(*ea, macam_node->mac_addr)) {
885 if (warn) {
886 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
887 VLOG_WARN_RL(&rl, "Duplicate MAC set: "ETH_ADDR_FMT,
888 ETH_ADDR_ARGS(macam_node->mac_addr));
889 }
890 return true;
891 }
892 }
893 return false;
894 }
895
896 static void
897 ipam_insert_mac(struct eth_addr *ea, bool check)
898 {
899 if (!ea) {
900 return;
901 }
902
903 uint64_t mac64 = eth_addr_to_uint64(*ea);
904 /* If the new MAC was not assigned by this address management system or
905 * check is true and the new MAC is a duplicate, do not insert it into the
906 * macam hmap. */
907 if (((mac64 ^ MAC_ADDR_PREFIX) >> 24)
908 || (check && ipam_is_duplicate_mac(ea, mac64, true))) {
909 return;
910 }
911
912 struct macam_node *new_macam_node = xmalloc(sizeof *new_macam_node);
913 new_macam_node->mac_addr = *ea;
914 hmap_insert(&macam, &new_macam_node->hmap_node, hash_uint64(mac64));
915 }
916
917 static void
918 ipam_insert_ip(struct ovn_datapath *od, uint32_t ip)
919 {
920 if (!od || !od->ipam_info || !od->ipam_info->allocated_ipv4s) {
921 return;
922 }
923
924 if (ip >= od->ipam_info->start_ipv4 &&
925 ip < (od->ipam_info->start_ipv4 + od->ipam_info->total_ipv4s)) {
926 bitmap_set1(od->ipam_info->allocated_ipv4s,
927 ip - od->ipam_info->start_ipv4);
928 }
929 }
930
931 static void
932 ipam_insert_lsp_addresses(struct ovn_datapath *od, struct ovn_port *op,
933 char *address)
934 {
935 if (!od || !op || !address || !strcmp(address, "unknown")
936 || !strcmp(address, "router") || is_dynamic_lsp_address(address)) {
937 return;
938 }
939
940 struct lport_addresses laddrs;
941 if (!extract_lsp_addresses(address, &laddrs)) {
942 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
943 VLOG_WARN_RL(&rl, "Extract addresses failed.");
944 return;
945 }
946 ipam_insert_mac(&laddrs.ea, true);
947
948 /* IP is only added to IPAM if the switch's subnet option
949 * is set, whereas MAC is always added to MACAM. */
950 if (!od->ipam_info || !od->ipam_info->allocated_ipv4s) {
951 destroy_lport_addresses(&laddrs);
952 return;
953 }
954
955 for (size_t j = 0; j < laddrs.n_ipv4_addrs; j++) {
956 uint32_t ip = ntohl(laddrs.ipv4_addrs[j].addr);
957 ipam_insert_ip(od, ip);
958 }
959
960 destroy_lport_addresses(&laddrs);
961 }
962
963 static void
964 ipam_add_port_addresses(struct ovn_datapath *od, struct ovn_port *op)
965 {
966 if (!od || !op) {
967 return;
968 }
969
970 if (op->nbsp) {
971 /* Add all the port's addresses to address data structures. */
972 for (size_t i = 0; i < op->nbsp->n_addresses; i++) {
973 ipam_insert_lsp_addresses(od, op, op->nbsp->addresses[i]);
974 }
975 if (op->nbsp->dynamic_addresses) {
976 ipam_insert_lsp_addresses(od, op, op->nbsp->dynamic_addresses);
977 }
978 } else if (op->nbrp) {
979 struct lport_addresses lrp_networks;
980 if (!extract_lrp_networks(op->nbrp, &lrp_networks)) {
981 static struct vlog_rate_limit rl
982 = VLOG_RATE_LIMIT_INIT(1, 1);
983 VLOG_WARN_RL(&rl, "Extract addresses failed.");
984 return;
985 }
986 ipam_insert_mac(&lrp_networks.ea, true);
987
988 if (!op->peer || !op->peer->nbsp || !op->peer->od || !op->peer->od->nbs
989 || !smap_get(&op->peer->od->nbs->other_config, "subnet")) {
990 destroy_lport_addresses(&lrp_networks);
991 return;
992 }
993
994 for (size_t i = 0; i < lrp_networks.n_ipv4_addrs; i++) {
995 uint32_t ip = ntohl(lrp_networks.ipv4_addrs[i].addr);
996 ipam_insert_ip(op->peer->od, ip);
997 }
998
999 destroy_lport_addresses(&lrp_networks);
1000 }
1001 }
1002
1003 static uint64_t
1004 ipam_get_unused_mac(void)
1005 {
1006 /* Stores the suffix of the most recently ipam-allocated MAC address. */
1007 static uint32_t last_mac;
1008
1009 uint64_t mac64;
1010 struct eth_addr mac;
1011 uint32_t mac_addr_suffix, i;
1012 for (i = 0; i < MAC_ADDR_SPACE - 1; i++) {
1013 /* The tentative MAC's suffix will be in the interval (1, 0xfffffe). */
1014 mac_addr_suffix = ((last_mac + i) % (MAC_ADDR_SPACE - 1)) + 1;
1015 mac64 = MAC_ADDR_PREFIX | mac_addr_suffix;
1016 eth_addr_from_uint64(mac64, &mac);
1017 if (!ipam_is_duplicate_mac(&mac, mac64, false)) {
1018 last_mac = mac_addr_suffix;
1019 break;
1020 }
1021 }
1022
1023 if (i == MAC_ADDR_SPACE) {
1024 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
1025 VLOG_WARN_RL(&rl, "MAC address space exhausted.");
1026 mac64 = 0;
1027 }
1028
1029 return mac64;
1030 }
1031
1032 static uint32_t
1033 ipam_get_unused_ip(struct ovn_datapath *od)
1034 {
1035 if (!od || !od->ipam_info || !od->ipam_info->allocated_ipv4s) {
1036 return 0;
1037 }
1038
1039 size_t new_ip_index = bitmap_scan(od->ipam_info->allocated_ipv4s, 0, 0,
1040 od->ipam_info->total_ipv4s - 1);
1041 if (new_ip_index == od->ipam_info->total_ipv4s - 1) {
1042 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
1043 VLOG_WARN_RL( &rl, "Subnet address space has been exhausted.");
1044 return 0;
1045 }
1046
1047 return od->ipam_info->start_ipv4 + new_ip_index;
1048 }
1049
1050 static bool
1051 ipam_allocate_addresses(struct ovn_datapath *od, struct ovn_port *op,
1052 const char *addrspec)
1053 {
1054 if (!op->nbsp || !od->ipam_info) {
1055 return false;
1056 }
1057
1058 /* Get or generate MAC address. */
1059 struct eth_addr mac;
1060 bool dynamic_mac;
1061 int n = 0;
1062 if (ovs_scan(addrspec, ETH_ADDR_SCAN_FMT" dynamic%n",
1063 ETH_ADDR_SCAN_ARGS(mac), &n)
1064 && addrspec[n] == '\0') {
1065 dynamic_mac = false;
1066 } else {
1067 uint64_t mac64 = ipam_get_unused_mac();
1068 if (!mac64) {
1069 return false;
1070 }
1071 eth_addr_from_uint64(mac64, &mac);
1072 dynamic_mac = true;
1073 }
1074
1075 /* Generate IPv4 address, if desirable. */
1076 bool dynamic_ip4 = od->ipam_info->allocated_ipv4s != NULL;
1077 uint32_t ip4 = dynamic_ip4 ? ipam_get_unused_ip(od) : 0;
1078
1079 /* Generate IPv6 address, if desirable. */
1080 bool dynamic_ip6 = od->ipam_info->ipv6_prefix_set;
1081 struct in6_addr ip6;
1082 if (dynamic_ip6) {
1083 in6_generate_eui64(mac, &od->ipam_info->ipv6_prefix, &ip6);
1084 }
1085
1086 /* If we didn't generate anything, bail out. */
1087 if (!dynamic_ip4 && !dynamic_ip6) {
1088 return false;
1089 }
1090
1091 /* Save the dynamic addresses. */
1092 struct ds new_addr = DS_EMPTY_INITIALIZER;
1093 ds_put_format(&new_addr, ETH_ADDR_FMT, ETH_ADDR_ARGS(mac));
1094 if (dynamic_ip4 && ip4) {
1095 ipam_insert_ip(od, ip4);
1096 ds_put_format(&new_addr, " "IP_FMT, IP_ARGS(htonl(ip4)));
1097 }
1098 if (dynamic_ip6) {
1099 char ip6_s[INET6_ADDRSTRLEN + 1];
1100 ipv6_string_mapped(ip6_s, &ip6);
1101 ds_put_format(&new_addr, " %s", ip6_s);
1102 }
1103 ipam_insert_mac(&mac, !dynamic_mac);
1104 nbrec_logical_switch_port_set_dynamic_addresses(op->nbsp,
1105 ds_cstr(&new_addr));
1106 ds_destroy(&new_addr);
1107 return true;
1108 }
1109
1110 static void
1111 build_ipam(struct hmap *datapaths, struct hmap *ports)
1112 {
1113 /* IPAM generally stands for IP address management. In non-virtualized
1114 * world, MAC addresses come with the hardware. But, with virtualized
1115 * workloads, they need to be assigned and managed. This function
1116 * does both IP address management (ipam) and MAC address management
1117 * (macam). */
1118
1119 /* If the switch's other_config:subnet is set, allocate new addresses for
1120 * ports that have the "dynamic" keyword in their addresses column. */
1121 struct ovn_datapath *od;
1122 HMAP_FOR_EACH (od, key_node, datapaths) {
1123 if (!od->nbs || !od->ipam_info) {
1124 continue;
1125 }
1126
1127 struct ovn_port *op;
1128 for (size_t i = 0; i < od->nbs->n_ports; i++) {
1129 const struct nbrec_logical_switch_port *nbsp =
1130 od->nbs->ports[i];
1131
1132 if (!nbsp) {
1133 continue;
1134 }
1135
1136 op = ovn_port_find(ports, nbsp->name);
1137 if (!op || (op->nbsp && op->peer)) {
1138 /* Do not allocate addresses for logical switch ports that
1139 * have a peer. */
1140 continue;
1141 }
1142
1143 for (size_t j = 0; j < nbsp->n_addresses; j++) {
1144 if (is_dynamic_lsp_address(nbsp->addresses[j])
1145 && !nbsp->dynamic_addresses) {
1146 if (!ipam_allocate_addresses(od, op, nbsp->addresses[j])
1147 || !extract_lsp_addresses(nbsp->dynamic_addresses,
1148 &op->lsp_addrs[op->n_lsp_addrs])) {
1149 static struct vlog_rate_limit rl
1150 = VLOG_RATE_LIMIT_INIT(1, 1);
1151 VLOG_INFO_RL(&rl, "Failed to allocate address.");
1152 } else {
1153 op->n_lsp_addrs++;
1154 }
1155 break;
1156 }
1157 }
1158
1159 if (!nbsp->n_addresses && nbsp->dynamic_addresses) {
1160 nbrec_logical_switch_port_set_dynamic_addresses(op->nbsp,
1161 NULL);
1162 }
1163 }
1164 }
1165 }
1166 \f
1167 /* Tag allocation for nested containers.
1168 *
1169 * For a logical switch port with 'parent_name' and a request to allocate tags,
1170 * keeps a track of all allocated tags. */
1171 struct tag_alloc_node {
1172 struct hmap_node hmap_node;
1173 char *parent_name;
1174 unsigned long *allocated_tags; /* A bitmap to track allocated tags. */
1175 };
1176
1177 static void
1178 tag_alloc_destroy(struct hmap *tag_alloc_table)
1179 {
1180 struct tag_alloc_node *node;
1181 HMAP_FOR_EACH_POP (node, hmap_node, tag_alloc_table) {
1182 bitmap_free(node->allocated_tags);
1183 free(node->parent_name);
1184 free(node);
1185 }
1186 hmap_destroy(tag_alloc_table);
1187 }
1188
1189 static struct tag_alloc_node *
1190 tag_alloc_get_node(struct hmap *tag_alloc_table, const char *parent_name)
1191 {
1192 /* If a node for the 'parent_name' exists, return it. */
1193 struct tag_alloc_node *tag_alloc_node;
1194 HMAP_FOR_EACH_WITH_HASH (tag_alloc_node, hmap_node,
1195 hash_string(parent_name, 0),
1196 tag_alloc_table) {
1197 if (!strcmp(tag_alloc_node->parent_name, parent_name)) {
1198 return tag_alloc_node;
1199 }
1200 }
1201
1202 /* Create a new node. */
1203 tag_alloc_node = xmalloc(sizeof *tag_alloc_node);
1204 tag_alloc_node->parent_name = xstrdup(parent_name);
1205 tag_alloc_node->allocated_tags = bitmap_allocate(MAX_OVN_TAGS);
1206 /* Tag 0 is invalid for nested containers. */
1207 bitmap_set1(tag_alloc_node->allocated_tags, 0);
1208 hmap_insert(tag_alloc_table, &tag_alloc_node->hmap_node,
1209 hash_string(parent_name, 0));
1210
1211 return tag_alloc_node;
1212 }
1213
1214 static void
1215 tag_alloc_add_existing_tags(struct hmap *tag_alloc_table,
1216 const struct nbrec_logical_switch_port *nbsp)
1217 {
1218 /* Add the tags of already existing nested containers. If there is no
1219 * 'nbsp->parent_name' or no 'nbsp->tag' set, there is nothing to do. */
1220 if (!nbsp->parent_name || !nbsp->parent_name[0] || !nbsp->tag) {
1221 return;
1222 }
1223
1224 struct tag_alloc_node *tag_alloc_node;
1225 tag_alloc_node = tag_alloc_get_node(tag_alloc_table, nbsp->parent_name);
1226 bitmap_set1(tag_alloc_node->allocated_tags, *nbsp->tag);
1227 }
1228
1229 static void
1230 tag_alloc_create_new_tag(struct hmap *tag_alloc_table,
1231 const struct nbrec_logical_switch_port *nbsp)
1232 {
1233 if (!nbsp->tag_request) {
1234 return;
1235 }
1236
1237 if (nbsp->parent_name && nbsp->parent_name[0]
1238 && *nbsp->tag_request == 0) {
1239 /* For nested containers that need allocation, do the allocation. */
1240
1241 if (nbsp->tag) {
1242 /* This has already been allocated. */
1243 return;
1244 }
1245
1246 struct tag_alloc_node *tag_alloc_node;
1247 int64_t tag;
1248 tag_alloc_node = tag_alloc_get_node(tag_alloc_table,
1249 nbsp->parent_name);
1250 tag = bitmap_scan(tag_alloc_node->allocated_tags, 0, 1, MAX_OVN_TAGS);
1251 if (tag == MAX_OVN_TAGS) {
1252 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
1253 VLOG_ERR_RL(&rl, "out of vlans for logical switch ports with "
1254 "parent %s", nbsp->parent_name);
1255 return;
1256 }
1257 bitmap_set1(tag_alloc_node->allocated_tags, tag);
1258 nbrec_logical_switch_port_set_tag(nbsp, &tag, 1);
1259 } else if (*nbsp->tag_request != 0) {
1260 /* For everything else, copy the contents of 'tag_request' to 'tag'. */
1261 nbrec_logical_switch_port_set_tag(nbsp, nbsp->tag_request, 1);
1262 }
1263 }
1264 \f
1265
1266 /*
1267 * This function checks if the MAC in "address" parameter (if present) is
1268 * different from the one stored in Logical_Switch_Port.dynamic_addresses
1269 * and updates it.
1270 */
1271 static void
1272 check_and_update_mac_in_dynamic_addresses(
1273 const char *address,
1274 const struct nbrec_logical_switch_port *nbsp)
1275 {
1276 if (!nbsp->dynamic_addresses) {
1277 return;
1278 }
1279 int buf_index = 0;
1280 struct eth_addr ea;
1281 if (!ovs_scan_len(address, &buf_index,
1282 ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(ea))) {
1283 return;
1284 }
1285
1286 struct eth_addr present_ea;
1287 buf_index = 0;
1288 if (ovs_scan_len(nbsp->dynamic_addresses, &buf_index,
1289 ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(present_ea))
1290 && !eth_addr_equals(ea, present_ea)) {
1291 /* MAC address has changed. Update it */
1292 char *new_addr = xasprintf(
1293 ETH_ADDR_FMT"%s", ETH_ADDR_ARGS(ea),
1294 &nbsp->dynamic_addresses[buf_index]);
1295 nbrec_logical_switch_port_set_dynamic_addresses(
1296 nbsp, new_addr);
1297 free(new_addr);
1298 }
1299 }
1300
1301 static void
1302 join_logical_ports(struct northd_context *ctx,
1303 struct hmap *datapaths, struct hmap *ports,
1304 struct hmap *chassis_qdisc_queues,
1305 struct hmap *tag_alloc_table, struct ovs_list *sb_only,
1306 struct ovs_list *nb_only, struct ovs_list *both)
1307 {
1308 hmap_init(ports);
1309 ovs_list_init(sb_only);
1310 ovs_list_init(nb_only);
1311 ovs_list_init(both);
1312
1313 const struct sbrec_port_binding *sb;
1314 SBREC_PORT_BINDING_FOR_EACH (sb, ctx->ovnsb_idl) {
1315 struct ovn_port *op = ovn_port_create(ports, sb->logical_port,
1316 NULL, NULL, sb);
1317 ovs_list_push_back(sb_only, &op->list);
1318 }
1319
1320 struct ovn_datapath *od;
1321 HMAP_FOR_EACH (od, key_node, datapaths) {
1322 if (od->nbs) {
1323 for (size_t i = 0; i < od->nbs->n_ports; i++) {
1324 const struct nbrec_logical_switch_port *nbsp
1325 = od->nbs->ports[i];
1326 struct ovn_port *op = ovn_port_find(ports, nbsp->name);
1327 if (op) {
1328 if (op->nbsp || op->nbrp) {
1329 static struct vlog_rate_limit rl
1330 = VLOG_RATE_LIMIT_INIT(5, 1);
1331 VLOG_WARN_RL(&rl, "duplicate logical port %s",
1332 nbsp->name);
1333 continue;
1334 }
1335 op->nbsp = nbsp;
1336 ovs_list_remove(&op->list);
1337
1338 uint32_t queue_id = smap_get_int(&op->sb->options,
1339 "qdisc_queue_id", 0);
1340 if (queue_id && op->sb->chassis) {
1341 add_chassis_queue(
1342 chassis_qdisc_queues, &op->sb->chassis->header_.uuid,
1343 queue_id);
1344 }
1345
1346 ovs_list_push_back(both, &op->list);
1347
1348 /* This port exists due to a SB binding, but should
1349 * not have been initialized fully. */
1350 ovs_assert(!op->n_lsp_addrs && !op->n_ps_addrs);
1351 } else {
1352 op = ovn_port_create(ports, nbsp->name, nbsp, NULL, NULL);
1353 ovs_list_push_back(nb_only, &op->list);
1354 }
1355
1356 if (!strcmp(nbsp->type, "localnet")) {
1357 od->localnet_port = op;
1358 }
1359
1360 op->lsp_addrs
1361 = xmalloc(sizeof *op->lsp_addrs * nbsp->n_addresses);
1362 for (size_t j = 0; j < nbsp->n_addresses; j++) {
1363 if (!strcmp(nbsp->addresses[j], "unknown")
1364 || !strcmp(nbsp->addresses[j], "router")) {
1365 continue;
1366 }
1367 if (is_dynamic_lsp_address(nbsp->addresses[j])) {
1368 if (nbsp->dynamic_addresses) {
1369 check_and_update_mac_in_dynamic_addresses(
1370 nbsp->addresses[j], nbsp);
1371 if (!extract_lsp_addresses(nbsp->dynamic_addresses,
1372 &op->lsp_addrs[op->n_lsp_addrs])) {
1373 static struct vlog_rate_limit rl
1374 = VLOG_RATE_LIMIT_INIT(1, 1);
1375 VLOG_INFO_RL(&rl, "invalid syntax '%s' in "
1376 "logical switch port "
1377 "dynamic_addresses. No "
1378 "MAC address found",
1379 op->nbsp->dynamic_addresses);
1380 continue;
1381 }
1382 } else {
1383 continue;
1384 }
1385 } else if (!extract_lsp_addresses(nbsp->addresses[j],
1386 &op->lsp_addrs[op->n_lsp_addrs])) {
1387 static struct vlog_rate_limit rl
1388 = VLOG_RATE_LIMIT_INIT(1, 1);
1389 VLOG_INFO_RL(&rl, "invalid syntax '%s' in logical "
1390 "switch port addresses. No MAC "
1391 "address found",
1392 op->nbsp->addresses[j]);
1393 continue;
1394 }
1395 op->n_lsp_addrs++;
1396 }
1397
1398 op->ps_addrs
1399 = xmalloc(sizeof *op->ps_addrs * nbsp->n_port_security);
1400 for (size_t j = 0; j < nbsp->n_port_security; j++) {
1401 if (!extract_lsp_addresses(nbsp->port_security[j],
1402 &op->ps_addrs[op->n_ps_addrs])) {
1403 static struct vlog_rate_limit rl
1404 = VLOG_RATE_LIMIT_INIT(1, 1);
1405 VLOG_INFO_RL(&rl, "invalid syntax '%s' in port "
1406 "security. No MAC address found",
1407 op->nbsp->port_security[j]);
1408 continue;
1409 }
1410 op->n_ps_addrs++;
1411 }
1412
1413 op->od = od;
1414 ipam_add_port_addresses(od, op);
1415 tag_alloc_add_existing_tags(tag_alloc_table, nbsp);
1416 }
1417 } else {
1418 for (size_t i = 0; i < od->nbr->n_ports; i++) {
1419 const struct nbrec_logical_router_port *nbrp
1420 = od->nbr->ports[i];
1421
1422 struct lport_addresses lrp_networks;
1423 if (!extract_lrp_networks(nbrp, &lrp_networks)) {
1424 static struct vlog_rate_limit rl
1425 = VLOG_RATE_LIMIT_INIT(5, 1);
1426 VLOG_WARN_RL(&rl, "bad 'mac' %s", nbrp->mac);
1427 continue;
1428 }
1429
1430 if (!lrp_networks.n_ipv4_addrs && !lrp_networks.n_ipv6_addrs) {
1431 continue;
1432 }
1433
1434 struct ovn_port *op = ovn_port_find(ports, nbrp->name);
1435 if (op) {
1436 if (op->nbsp || op->nbrp) {
1437 static struct vlog_rate_limit rl
1438 = VLOG_RATE_LIMIT_INIT(5, 1);
1439 VLOG_WARN_RL(&rl, "duplicate logical router port %s",
1440 nbrp->name);
1441 continue;
1442 }
1443 op->nbrp = nbrp;
1444 ovs_list_remove(&op->list);
1445 ovs_list_push_back(both, &op->list);
1446
1447 /* This port exists but should not have been
1448 * initialized fully. */
1449 ovs_assert(!op->lrp_networks.n_ipv4_addrs
1450 && !op->lrp_networks.n_ipv6_addrs);
1451 } else {
1452 op = ovn_port_create(ports, nbrp->name, NULL, nbrp, NULL);
1453 ovs_list_push_back(nb_only, &op->list);
1454 }
1455
1456 op->lrp_networks = lrp_networks;
1457 op->od = od;
1458 ipam_add_port_addresses(op->od, op);
1459
1460 const char *redirect_chassis = smap_get(&op->nbrp->options,
1461 "redirect-chassis");
1462 if (redirect_chassis || op->nbrp->n_gateway_chassis) {
1463 /* Additional "derived" ovn_port crp represents the
1464 * instance of op on the "redirect-chassis". */
1465 const char *gw_chassis = smap_get(&op->od->nbr->options,
1466 "chassis");
1467 if (gw_chassis) {
1468 static struct vlog_rate_limit rl
1469 = VLOG_RATE_LIMIT_INIT(1, 1);
1470 VLOG_WARN_RL(&rl, "Bad configuration: "
1471 "redirect-chassis configured on port %s "
1472 "on L3 gateway router", nbrp->name);
1473 continue;
1474 }
1475 if (od->l3dgw_port || od->l3redirect_port) {
1476 static struct vlog_rate_limit rl
1477 = VLOG_RATE_LIMIT_INIT(1, 1);
1478 VLOG_WARN_RL(&rl, "Bad configuration: multiple ports "
1479 "with redirect-chassis on same logical "
1480 "router %s", od->nbr->name);
1481 continue;
1482 }
1483
1484 char *redirect_name = chassis_redirect_name(nbrp->name);
1485 struct ovn_port *crp = ovn_port_find(ports, redirect_name);
1486 if (crp) {
1487 crp->derived = true;
1488 crp->nbrp = nbrp;
1489 ovs_list_remove(&crp->list);
1490 ovs_list_push_back(both, &crp->list);
1491 } else {
1492 crp = ovn_port_create(ports, redirect_name,
1493 NULL, nbrp, NULL);
1494 crp->derived = true;
1495 ovs_list_push_back(nb_only, &crp->list);
1496 }
1497 crp->od = od;
1498 free(redirect_name);
1499
1500 /* Set l3dgw_port and l3redirect_port in od, for later
1501 * use during flow creation. */
1502 od->l3dgw_port = op;
1503 od->l3redirect_port = crp;
1504 }
1505 }
1506 }
1507 }
1508
1509 /* Connect logical router ports, and logical switch ports of type "router",
1510 * to their peers. */
1511 struct ovn_port *op;
1512 HMAP_FOR_EACH (op, key_node, ports) {
1513 if (op->nbsp && !strcmp(op->nbsp->type, "router") && !op->derived) {
1514 const char *peer_name = smap_get(&op->nbsp->options, "router-port");
1515 if (!peer_name) {
1516 continue;
1517 }
1518
1519 struct ovn_port *peer = ovn_port_find(ports, peer_name);
1520 if (!peer || !peer->nbrp) {
1521 continue;
1522 }
1523
1524 peer->peer = op;
1525 op->peer = peer;
1526 op->od->router_ports = xrealloc(
1527 op->od->router_ports,
1528 sizeof *op->od->router_ports * (op->od->n_router_ports + 1));
1529 op->od->router_ports[op->od->n_router_ports++] = op;
1530
1531 /* Fill op->lsp_addrs for op->nbsp->addresses[] with
1532 * contents "router", which was skipped in the loop above. */
1533 for (size_t j = 0; j < op->nbsp->n_addresses; j++) {
1534 if (!strcmp(op->nbsp->addresses[j], "router")) {
1535 if (extract_lrp_networks(peer->nbrp,
1536 &op->lsp_addrs[op->n_lsp_addrs])) {
1537 op->n_lsp_addrs++;
1538 }
1539 break;
1540 }
1541 }
1542 } else if (op->nbrp && op->nbrp->peer && !op->derived) {
1543 struct ovn_port *peer = ovn_port_find(ports, op->nbrp->peer);
1544 if (peer) {
1545 if (peer->nbrp) {
1546 op->peer = peer;
1547 } else if (peer->nbsp) {
1548 /* An ovn_port for a switch port of type "router" does have
1549 * a router port as its peer (see the case above for
1550 * "router" ports), but this is set via options:router-port
1551 * in Logical_Switch_Port and does not involve the
1552 * Logical_Router_Port's 'peer' column. */
1553 static struct vlog_rate_limit rl =
1554 VLOG_RATE_LIMIT_INIT(5, 1);
1555 VLOG_WARN_RL(&rl, "Bad configuration: The peer of router "
1556 "port %s is a switch port", op->key);
1557 }
1558 }
1559 }
1560 }
1561 }
1562
1563 static void
1564 ip_address_and_port_from_lb_key(const char *key, char **ip_address,
1565 uint16_t *port, int *addr_family);
1566
1567 static void
1568 get_router_load_balancer_ips(const struct ovn_datapath *od,
1569 struct sset *all_ips, int *addr_family)
1570 {
1571 if (!od->nbr) {
1572 return;
1573 }
1574
1575 for (int i = 0; i < od->nbr->n_load_balancer; i++) {
1576 struct nbrec_load_balancer *lb = od->nbr->load_balancer[i];
1577 struct smap *vips = &lb->vips;
1578 struct smap_node *node;
1579
1580 SMAP_FOR_EACH (node, vips) {
1581 /* node->key contains IP:port or just IP. */
1582 char *ip_address = NULL;
1583 uint16_t port;
1584
1585 ip_address_and_port_from_lb_key(node->key, &ip_address, &port,
1586 addr_family);
1587 if (!ip_address) {
1588 continue;
1589 }
1590
1591 if (!sset_contains(all_ips, ip_address)) {
1592 sset_add(all_ips, ip_address);
1593 }
1594
1595 free(ip_address);
1596 }
1597 }
1598 }
1599
1600 /* Returns an array of strings, each consisting of a MAC address followed
1601 * by one or more IP addresses, and if the port is a distributed gateway
1602 * port, followed by 'is_chassis_resident("LPORT_NAME")', where the
1603 * LPORT_NAME is the name of the L3 redirect port or the name of the
1604 * logical_port specified in a NAT rule. These strings include the
1605 * external IP addresses of all NAT rules defined on that router, and all
1606 * of the IP addresses used in load balancer VIPs defined on that router.
1607 *
1608 * The caller must free each of the n returned strings with free(),
1609 * and must free the returned array when it is no longer needed. */
1610 static char **
1611 get_nat_addresses(const struct ovn_port *op, size_t *n)
1612 {
1613 size_t n_nats = 0;
1614 struct eth_addr mac;
1615 if (!op->nbrp || !op->od || !op->od->nbr
1616 || (!op->od->nbr->n_nat && !op->od->nbr->n_load_balancer)
1617 || !eth_addr_from_string(op->nbrp->mac, &mac)) {
1618 *n = n_nats;
1619 return NULL;
1620 }
1621
1622 struct ds c_addresses = DS_EMPTY_INITIALIZER;
1623 ds_put_format(&c_addresses, ETH_ADDR_FMT, ETH_ADDR_ARGS(mac));
1624 bool central_ip_address = false;
1625
1626 char **addresses;
1627 addresses = xmalloc(sizeof *addresses * (op->od->nbr->n_nat + 1));
1628
1629 /* Get NAT IP addresses. */
1630 for (size_t i = 0; i < op->od->nbr->n_nat; i++) {
1631 const struct nbrec_nat *nat = op->od->nbr->nat[i];
1632 ovs_be32 ip, mask;
1633
1634 char *error = ip_parse_masked(nat->external_ip, &ip, &mask);
1635 if (error || mask != OVS_BE32_MAX) {
1636 free(error);
1637 continue;
1638 }
1639
1640 /* Determine whether this NAT rule satisfies the conditions for
1641 * distributed NAT processing. */
1642 if (op->od->l3redirect_port && !strcmp(nat->type, "dnat_and_snat")
1643 && nat->logical_port && nat->external_mac) {
1644 /* Distributed NAT rule. */
1645 if (eth_addr_from_string(nat->external_mac, &mac)) {
1646 struct ds address = DS_EMPTY_INITIALIZER;
1647 ds_put_format(&address, ETH_ADDR_FMT, ETH_ADDR_ARGS(mac));
1648 ds_put_format(&address, " %s", nat->external_ip);
1649 ds_put_format(&address, " is_chassis_resident(\"%s\")",
1650 nat->logical_port);
1651 addresses[n_nats++] = ds_steal_cstr(&address);
1652 }
1653 } else {
1654 /* Centralized NAT rule, either on gateway router or distributed
1655 * router. */
1656 ds_put_format(&c_addresses, " %s", nat->external_ip);
1657 central_ip_address = true;
1658 }
1659 }
1660
1661 /* A set to hold all load-balancer vips. */
1662 struct sset all_ips = SSET_INITIALIZER(&all_ips);
1663 int addr_family;
1664 get_router_load_balancer_ips(op->od, &all_ips, &addr_family);
1665
1666 const char *ip_address;
1667 SSET_FOR_EACH (ip_address, &all_ips) {
1668 ds_put_format(&c_addresses, " %s", ip_address);
1669 central_ip_address = true;
1670 }
1671 sset_destroy(&all_ips);
1672
1673 if (central_ip_address) {
1674 /* Gratuitous ARP for centralized NAT rules on distributed gateway
1675 * ports should be restricted to the "redirect-chassis". */
1676 if (op->od->l3redirect_port) {
1677 ds_put_format(&c_addresses, " is_chassis_resident(%s)",
1678 op->od->l3redirect_port->json_key);
1679 }
1680
1681 addresses[n_nats++] = ds_steal_cstr(&c_addresses);
1682 }
1683
1684 *n = n_nats;
1685
1686 return addresses;
1687 }
1688
1689 static bool
1690 gateway_chassis_equal(const struct nbrec_gateway_chassis *nb_gwc,
1691 const struct sbrec_chassis *nb_gwc_c,
1692 const struct sbrec_gateway_chassis *sb_gwc)
1693 {
1694 bool equal = !strcmp(nb_gwc->name, sb_gwc->name)
1695 && nb_gwc->priority == sb_gwc->priority
1696 && smap_equal(&nb_gwc->options, &sb_gwc->options)
1697 && smap_equal(&nb_gwc->external_ids, &sb_gwc->external_ids);
1698
1699 if (!equal) {
1700 return false;
1701 }
1702
1703 /* If everything else matched and we were unable to find the SBDB
1704 * Chassis entry at this time, assume a match and return true.
1705 * This happens when an ovn-controller is restarting and the Chassis
1706 * entry is gone away momentarily */
1707 return !nb_gwc_c
1708 || (sb_gwc->chassis && !strcmp(nb_gwc_c->name,
1709 sb_gwc->chassis->name));
1710 }
1711
1712 static bool
1713 sbpb_gw_chassis_needs_update(
1714 const struct sbrec_port_binding *port_binding,
1715 const struct nbrec_logical_router_port *lrp,
1716 const struct chassis_index *chassis_index)
1717 {
1718 if (!lrp || !port_binding) {
1719 return false;
1720 }
1721
1722 /* These arrays are used to collect valid Gateway_Chassis and valid
1723 * Chassis records from the Logical_Router_Port Gateway_Chassis list,
1724 * we ignore the ones we can't match on the SBDB */
1725 struct nbrec_gateway_chassis **lrp_gwc = xzalloc(lrp->n_gateway_chassis *
1726 sizeof *lrp_gwc);
1727 const struct sbrec_chassis **lrp_gwc_c = xzalloc(lrp->n_gateway_chassis *
1728 sizeof *lrp_gwc_c);
1729
1730 /* Count the number of gateway chassis chassis names from the logical
1731 * router port that we are able to match on the southbound database */
1732 int lrp_n_gateway_chassis = 0;
1733 int n;
1734 for (n = 0; n < lrp->n_gateway_chassis; n++) {
1735
1736 if (!lrp->gateway_chassis[n]->chassis_name) {
1737 continue;
1738 }
1739
1740 const struct sbrec_chassis *chassis =
1741 chassis_lookup_by_name(chassis_index,
1742 lrp->gateway_chassis[n]->chassis_name);
1743
1744 lrp_gwc_c[lrp_n_gateway_chassis] = chassis;
1745 lrp_gwc[lrp_n_gateway_chassis] = lrp->gateway_chassis[n];
1746 lrp_n_gateway_chassis++;
1747 if (!chassis) {
1748 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
1749 VLOG_WARN_RL(
1750 &rl, "Chassis name %s referenced in NBDB via Gateway_Chassis "
1751 "on logical router port %s does not exist in SBDB",
1752 lrp->gateway_chassis[n]->chassis_name, lrp->name);
1753 }
1754 }
1755
1756 /* Basic check, different amount of Gateway_Chassis means that we
1757 * need to update southbound database Port_Binding */
1758 if (lrp_n_gateway_chassis != port_binding->n_gateway_chassis) {
1759 free(lrp_gwc_c);
1760 free(lrp_gwc);
1761 return true;
1762 }
1763
1764 for (n = 0; n < lrp_n_gateway_chassis; n++) {
1765 int i;
1766 /* For each of the valid gw chassis on the lrp, check if there's
1767 * a match on the Port_Binding list, we assume order is not
1768 * persisted */
1769 for (i = 0; i < port_binding->n_gateway_chassis; i++) {
1770 if (gateway_chassis_equal(lrp_gwc[n],
1771 lrp_gwc_c[n],
1772 port_binding->gateway_chassis[i])) {
1773 break; /* we found a match */
1774 }
1775 }
1776
1777 /* if no Port_Binding gateway chassis matched for the entry... */
1778 if (i == port_binding->n_gateway_chassis) {
1779 free(lrp_gwc_c);
1780 free(lrp_gwc);
1781 return true; /* found no match for this gateway chassis on lrp */
1782 }
1783 }
1784
1785 /* no need for update, all ports matched */
1786 free(lrp_gwc_c);
1787 free(lrp_gwc);
1788 return false;
1789 }
1790
1791 /* This functions translates the gw chassis on the nb database
1792 * to sb database entries, the only difference is that SB database
1793 * Gateway_Chassis table references the chassis directly instead
1794 * of using the name */
1795 static void
1796 copy_gw_chassis_from_nbrp_to_sbpb(
1797 struct northd_context *ctx,
1798 const struct nbrec_logical_router_port *lrp,
1799 const struct chassis_index *chassis_index,
1800 const struct sbrec_port_binding *port_binding) {
1801
1802 if (!lrp || !port_binding || !lrp->n_gateway_chassis) {
1803 return;
1804 }
1805
1806 struct sbrec_gateway_chassis **gw_chassis = NULL;
1807 int n_gwc = 0;
1808 int n;
1809
1810 /* XXX: This can be improved. This code will generate a set of new
1811 * Gateway_Chassis and push them all in a single transaction, instead
1812 * this would be more optimal if we just add/update/remove the rows in
1813 * the southbound db that need to change. We don't expect lots of
1814 * changes to the Gateway_Chassis table, but if that proves to be wrong
1815 * we should optimize this. */
1816 for (n = 0; n < lrp->n_gateway_chassis; n++) {
1817 struct nbrec_gateway_chassis *lrp_gwc = lrp->gateway_chassis[n];
1818 if (!lrp_gwc->chassis_name) {
1819 continue;
1820 }
1821
1822 const struct sbrec_chassis *chassis =
1823 chassis_lookup_by_name(chassis_index, lrp_gwc->chassis_name);
1824
1825 gw_chassis = xrealloc(gw_chassis, (n_gwc + 1) * sizeof *gw_chassis);
1826
1827 struct sbrec_gateway_chassis *pb_gwc =
1828 sbrec_gateway_chassis_insert(ctx->ovnsb_txn);
1829
1830 sbrec_gateway_chassis_set_name(pb_gwc, lrp_gwc->name);
1831 sbrec_gateway_chassis_set_priority(pb_gwc, lrp_gwc->priority);
1832 sbrec_gateway_chassis_set_chassis(pb_gwc, chassis);
1833 sbrec_gateway_chassis_set_options(pb_gwc, &lrp_gwc->options);
1834 sbrec_gateway_chassis_set_external_ids(pb_gwc, &lrp_gwc->external_ids);
1835
1836 gw_chassis[n_gwc++] = pb_gwc;
1837 }
1838 sbrec_port_binding_set_gateway_chassis(port_binding, gw_chassis, n_gwc);
1839 free(gw_chassis);
1840 }
1841
1842 static void
1843 ovn_port_update_sbrec(struct northd_context *ctx,
1844 const struct ovn_port *op,
1845 const struct chassis_index *chassis_index,
1846 struct hmap *chassis_qdisc_queues)
1847 {
1848 sbrec_port_binding_set_datapath(op->sb, op->od->sb);
1849 if (op->nbrp) {
1850 /* If the router is for l3 gateway, it resides on a chassis
1851 * and its port type is "l3gateway". */
1852 const char *chassis_name = smap_get(&op->od->nbr->options, "chassis");
1853 if (op->derived) {
1854 sbrec_port_binding_set_type(op->sb, "chassisredirect");
1855 } else if (chassis_name) {
1856 sbrec_port_binding_set_type(op->sb, "l3gateway");
1857 } else {
1858 sbrec_port_binding_set_type(op->sb, "patch");
1859 }
1860
1861 struct smap new;
1862 smap_init(&new);
1863 if (op->derived) {
1864 const char *redirect_chassis = smap_get(&op->nbrp->options,
1865 "redirect-chassis");
1866 if (op->nbrp->n_gateway_chassis && redirect_chassis) {
1867 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
1868 VLOG_WARN_RL(
1869 &rl, "logical router port %s has both options:"
1870 "redirect-chassis and gateway_chassis populated "
1871 "redirect-chassis will be ignored in favour of "
1872 "gateway chassis", op->nbrp->name);
1873 }
1874
1875 if (op->nbrp->n_gateway_chassis) {
1876 if (sbpb_gw_chassis_needs_update(op->sb, op->nbrp,
1877 chassis_index)) {
1878 copy_gw_chassis_from_nbrp_to_sbpb(ctx, op->nbrp,
1879 chassis_index, op->sb);
1880 }
1881
1882 } else if (redirect_chassis) {
1883 /* Handle ports that had redirect-chassis option attached
1884 * to them, and for backwards compatibility convert them
1885 * to a single Gateway_Chassis entry */
1886 const struct sbrec_chassis *chassis =
1887 chassis_lookup_by_name(chassis_index, redirect_chassis);
1888 if (chassis) {
1889 /* If we found the chassis, and the gw chassis on record
1890 * differs from what we expect go ahead and update */
1891 if (op->sb->n_gateway_chassis != 1
1892 || !op->sb->gateway_chassis[0]->chassis
1893 || strcmp(op->sb->gateway_chassis[0]->chassis->name,
1894 chassis->name)
1895 || op->sb->gateway_chassis[0]->priority != 0) {
1896 /* Construct a single Gateway_Chassis entry on the
1897 * Port_Binding attached to the redirect_chassis
1898 * name */
1899 struct sbrec_gateway_chassis *gw_chassis =
1900 sbrec_gateway_chassis_insert(ctx->ovnsb_txn);
1901
1902 char *gwc_name = xasprintf("%s_%s", op->nbrp->name,
1903 chassis->name);
1904
1905 /* XXX: Again, here, we could just update an existing
1906 * Gateway_Chassis, instead of creating a new one
1907 * and replacing it */
1908 sbrec_gateway_chassis_set_name(gw_chassis, gwc_name);
1909 sbrec_gateway_chassis_set_priority(gw_chassis, 0);
1910 sbrec_gateway_chassis_set_chassis(gw_chassis, chassis);
1911 sbrec_gateway_chassis_set_external_ids(gw_chassis,
1912 &op->nbrp->external_ids);
1913 sbrec_port_binding_set_gateway_chassis(op->sb,
1914 &gw_chassis, 1);
1915 free(gwc_name);
1916 }
1917 } else {
1918 VLOG_WARN("chassis name '%s' from redirect from logical "
1919 " router port '%s' redirect-chassis not found",
1920 redirect_chassis, op->nbrp->name);
1921 if (op->sb->n_gateway_chassis) {
1922 sbrec_port_binding_set_gateway_chassis(op->sb, NULL,
1923 0);
1924 }
1925 }
1926 }
1927 smap_add(&new, "distributed-port", op->nbrp->name);
1928 } else {
1929 if (op->peer) {
1930 smap_add(&new, "peer", op->peer->key);
1931 }
1932 if (chassis_name) {
1933 smap_add(&new, "l3gateway-chassis", chassis_name);
1934 }
1935 }
1936 sbrec_port_binding_set_options(op->sb, &new);
1937 smap_destroy(&new);
1938
1939 sbrec_port_binding_set_parent_port(op->sb, NULL);
1940 sbrec_port_binding_set_tag(op->sb, NULL, 0);
1941 sbrec_port_binding_set_mac(op->sb, NULL, 0);
1942
1943 struct smap ids = SMAP_INITIALIZER(&ids);
1944 sbrec_port_binding_set_external_ids(op->sb, &ids);
1945 } else {
1946 if (strcmp(op->nbsp->type, "router")) {
1947 uint32_t queue_id = smap_get_int(
1948 &op->sb->options, "qdisc_queue_id", 0);
1949 bool has_qos = port_has_qos_params(&op->nbsp->options);
1950 struct smap options;
1951
1952 if (op->sb->chassis && has_qos && !queue_id) {
1953 queue_id = allocate_chassis_queueid(chassis_qdisc_queues,
1954 op->sb->chassis);
1955 } else if (!has_qos && queue_id) {
1956 free_chassis_queueid(chassis_qdisc_queues,
1957 op->sb->chassis,
1958 queue_id);
1959 queue_id = 0;
1960 }
1961
1962 smap_clone(&options, &op->nbsp->options);
1963 if (queue_id) {
1964 smap_add_format(&options,
1965 "qdisc_queue_id", "%d", queue_id);
1966 }
1967 sbrec_port_binding_set_options(op->sb, &options);
1968 smap_destroy(&options);
1969 if (ovn_is_known_nb_lsp_type(op->nbsp->type)) {
1970 sbrec_port_binding_set_type(op->sb, op->nbsp->type);
1971 } else {
1972 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
1973 VLOG_WARN_RL(
1974 &rl, "Unknown port type '%s' set on logical switch '%s'.",
1975 op->nbsp->type, op->nbsp->name);
1976 }
1977 } else {
1978 const char *chassis = NULL;
1979 if (op->peer && op->peer->od && op->peer->od->nbr) {
1980 chassis = smap_get(&op->peer->od->nbr->options, "chassis");
1981 }
1982
1983 /* A switch port connected to a gateway router is also of
1984 * type "l3gateway". */
1985 if (chassis) {
1986 sbrec_port_binding_set_type(op->sb, "l3gateway");
1987 } else {
1988 sbrec_port_binding_set_type(op->sb, "patch");
1989 }
1990
1991 const char *router_port = smap_get(&op->nbsp->options,
1992 "router-port");
1993 if (router_port || chassis) {
1994 struct smap new;
1995 smap_init(&new);
1996 if (router_port) {
1997 smap_add(&new, "peer", router_port);
1998 }
1999 if (chassis) {
2000 smap_add(&new, "l3gateway-chassis", chassis);
2001 }
2002 sbrec_port_binding_set_options(op->sb, &new);
2003 smap_destroy(&new);
2004 }
2005
2006 const char *nat_addresses = smap_get(&op->nbsp->options,
2007 "nat-addresses");
2008 if (nat_addresses && !strcmp(nat_addresses, "router")) {
2009 if (op->peer && op->peer->od
2010 && (chassis || op->peer->od->l3redirect_port)) {
2011 size_t n_nats;
2012 char **nats = get_nat_addresses(op->peer, &n_nats);
2013 if (n_nats) {
2014 sbrec_port_binding_set_nat_addresses(op->sb,
2015 (const char **) nats, n_nats);
2016 for (size_t i = 0; i < n_nats; i++) {
2017 free(nats[i]);
2018 }
2019 free(nats);
2020 } else {
2021 sbrec_port_binding_set_nat_addresses(op->sb, NULL, 0);
2022 }
2023 } else {
2024 sbrec_port_binding_set_nat_addresses(op->sb, NULL, 0);
2025 }
2026 /* Only accept manual specification of ethernet address
2027 * followed by IPv4 addresses on type "l3gateway" ports. */
2028 } else if (nat_addresses && chassis) {
2029 struct lport_addresses laddrs;
2030 if (!extract_lsp_addresses(nat_addresses, &laddrs)) {
2031 static struct vlog_rate_limit rl =
2032 VLOG_RATE_LIMIT_INIT(1, 1);
2033 VLOG_WARN_RL(&rl, "Error extracting nat-addresses.");
2034 sbrec_port_binding_set_nat_addresses(op->sb, NULL, 0);
2035 } else {
2036 sbrec_port_binding_set_nat_addresses(op->sb,
2037 &nat_addresses, 1);
2038 destroy_lport_addresses(&laddrs);
2039 }
2040 } else {
2041 sbrec_port_binding_set_nat_addresses(op->sb, NULL, 0);
2042 }
2043 }
2044 sbrec_port_binding_set_parent_port(op->sb, op->nbsp->parent_name);
2045 sbrec_port_binding_set_tag(op->sb, op->nbsp->tag, op->nbsp->n_tag);
2046 sbrec_port_binding_set_mac(op->sb, (const char **) op->nbsp->addresses,
2047 op->nbsp->n_addresses);
2048
2049 struct smap ids = SMAP_INITIALIZER(&ids);
2050 smap_clone(&ids, &op->nbsp->external_ids);
2051 const char *name = smap_get(&ids, "neutron:port_name");
2052 if (name && name[0]) {
2053 smap_add(&ids, "name", name);
2054 }
2055 sbrec_port_binding_set_external_ids(op->sb, &ids);
2056 smap_destroy(&ids);
2057 }
2058 }
2059
2060 /* Remove mac_binding entries that refer to logical_ports which are
2061 * deleted. */
2062 static void
2063 cleanup_mac_bindings(struct northd_context *ctx, struct hmap *ports)
2064 {
2065 const struct sbrec_mac_binding *b, *n;
2066 SBREC_MAC_BINDING_FOR_EACH_SAFE (b, n, ctx->ovnsb_idl) {
2067 if (!ovn_port_find(ports, b->logical_port)) {
2068 sbrec_mac_binding_delete(b);
2069 }
2070 }
2071 }
2072
2073 /* Updates the southbound Port_Binding table so that it contains the logical
2074 * switch ports specified by the northbound database.
2075 *
2076 * Initializes 'ports' to contain a "struct ovn_port" for every logical port,
2077 * using the "struct ovn_datapath"s in 'datapaths' to look up logical
2078 * datapaths. */
2079 static void
2080 build_ports(struct northd_context *ctx, struct hmap *datapaths,
2081 const struct chassis_index *chassis_index, struct hmap *ports)
2082 {
2083 struct ovs_list sb_only, nb_only, both;
2084 struct hmap tag_alloc_table = HMAP_INITIALIZER(&tag_alloc_table);
2085 struct hmap chassis_qdisc_queues = HMAP_INITIALIZER(&chassis_qdisc_queues);
2086
2087 join_logical_ports(ctx, datapaths, ports, &chassis_qdisc_queues,
2088 &tag_alloc_table, &sb_only, &nb_only, &both);
2089
2090 struct ovn_port *op, *next;
2091 /* For logical ports that are in both databases, update the southbound
2092 * record based on northbound data. Also index the in-use tunnel_keys.
2093 * For logical ports that are in NB database, do any tag allocation
2094 * needed. */
2095 LIST_FOR_EACH_SAFE (op, next, list, &both) {
2096 if (op->nbsp) {
2097 tag_alloc_create_new_tag(&tag_alloc_table, op->nbsp);
2098 }
2099 ovn_port_update_sbrec(ctx, op, chassis_index, &chassis_qdisc_queues);
2100
2101 add_tnlid(&op->od->port_tnlids, op->sb->tunnel_key);
2102 if (op->sb->tunnel_key > op->od->port_key_hint) {
2103 op->od->port_key_hint = op->sb->tunnel_key;
2104 }
2105 }
2106
2107 /* Add southbound record for each unmatched northbound record. */
2108 LIST_FOR_EACH_SAFE (op, next, list, &nb_only) {
2109 uint16_t tunnel_key = ovn_port_allocate_key(op->od);
2110 if (!tunnel_key) {
2111 continue;
2112 }
2113
2114 op->sb = sbrec_port_binding_insert(ctx->ovnsb_txn);
2115 ovn_port_update_sbrec(ctx, op, chassis_index, &chassis_qdisc_queues);
2116
2117 sbrec_port_binding_set_logical_port(op->sb, op->key);
2118 sbrec_port_binding_set_tunnel_key(op->sb, tunnel_key);
2119 }
2120
2121 bool remove_mac_bindings = false;
2122 if (!ovs_list_is_empty(&sb_only)) {
2123 remove_mac_bindings = true;
2124 }
2125
2126 /* Delete southbound records without northbound matches. */
2127 LIST_FOR_EACH_SAFE(op, next, list, &sb_only) {
2128 ovs_list_remove(&op->list);
2129 sbrec_port_binding_delete(op->sb);
2130 ovn_port_destroy(ports, op);
2131 }
2132 if (remove_mac_bindings) {
2133 cleanup_mac_bindings(ctx, ports);
2134 }
2135
2136 tag_alloc_destroy(&tag_alloc_table);
2137 destroy_chassis_queues(&chassis_qdisc_queues);
2138 }
2139 \f
/* Range of keys used for multicast groups. */
#define OVN_MIN_MULTICAST 32768
#define OVN_MAX_MULTICAST 65535

/* A named multicast group and its key. */
struct multicast_group {
    const char *name;
    uint16_t key;               /* OVN_MIN_MULTICAST...OVN_MAX_MULTICAST. */
};

/* Predefined groups, assigned downward from the top of the key range. */
#define MC_FLOOD "_MC_flood"
static const struct multicast_group mc_flood = {
    MC_FLOOD, OVN_MAX_MULTICAST
};

#define MC_UNKNOWN "_MC_unknown"
static const struct multicast_group mc_unknown = {
    MC_UNKNOWN, OVN_MAX_MULTICAST - 1
};

/* Returns true if 'a' and 'b' have the same key and the same name. */
static bool
multicast_group_equal(const struct multicast_group *a,
                      const struct multicast_group *b)
{
    if (a->key != b->key) {
        return false;
    }
    return strcmp(a->name, b->name) == 0;
}
2160
2161 /* Multicast group entry. */
2162 struct ovn_multicast {
2163 struct hmap_node hmap_node; /* Index on 'datapath' and 'key'. */
2164 struct ovn_datapath *datapath;
2165 const struct multicast_group *group;
2166
2167 struct ovn_port **ports;
2168 size_t n_ports, allocated_ports;
2169 };
2170
2171 static uint32_t
2172 ovn_multicast_hash(const struct ovn_datapath *datapath,
2173 const struct multicast_group *group)
2174 {
2175 return hash_pointer(datapath, group->key);
2176 }
2177
2178 static struct ovn_multicast *
2179 ovn_multicast_find(struct hmap *mcgroups, struct ovn_datapath *datapath,
2180 const struct multicast_group *group)
2181 {
2182 struct ovn_multicast *mc;
2183
2184 HMAP_FOR_EACH_WITH_HASH (mc, hmap_node,
2185 ovn_multicast_hash(datapath, group), mcgroups) {
2186 if (mc->datapath == datapath
2187 && multicast_group_equal(mc->group, group)) {
2188 return mc;
2189 }
2190 }
2191 return NULL;
2192 }
2193
2194 static void
2195 ovn_multicast_add(struct hmap *mcgroups, const struct multicast_group *group,
2196 struct ovn_port *port)
2197 {
2198 struct ovn_datapath *od = port->od;
2199 struct ovn_multicast *mc = ovn_multicast_find(mcgroups, od, group);
2200 if (!mc) {
2201 mc = xmalloc(sizeof *mc);
2202 hmap_insert(mcgroups, &mc->hmap_node, ovn_multicast_hash(od, group));
2203 mc->datapath = od;
2204 mc->group = group;
2205 mc->n_ports = 0;
2206 mc->allocated_ports = 4;
2207 mc->ports = xmalloc(mc->allocated_ports * sizeof *mc->ports);
2208 }
2209 if (mc->n_ports >= mc->allocated_ports) {
2210 mc->ports = x2nrealloc(mc->ports, &mc->allocated_ports,
2211 sizeof *mc->ports);
2212 }
2213 mc->ports[mc->n_ports++] = port;
2214 }
2215
2216 static void
2217 ovn_multicast_destroy(struct hmap *mcgroups, struct ovn_multicast *mc)
2218 {
2219 if (mc) {
2220 hmap_remove(mcgroups, &mc->hmap_node);
2221 free(mc->ports);
2222 free(mc);
2223 }
2224 }
2225
2226 static void
2227 ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
2228 const struct sbrec_multicast_group *sb)
2229 {
2230 struct sbrec_port_binding **ports = xmalloc(mc->n_ports * sizeof *ports);
2231 for (size_t i = 0; i < mc->n_ports; i++) {
2232 ports[i] = CONST_CAST(struct sbrec_port_binding *, mc->ports[i]->sb);
2233 }
2234 sbrec_multicast_group_set_ports(sb, ports, mc->n_ports);
2235 free(ports);
2236 }
2237 \f
2238 /* Logical flow generation.
2239 *
2240 * This code generates the Logical_Flow table in the southbound database, as a
2241 * function of most of the northbound database.
2242 */
2243
2244 struct ovn_lflow {
2245 struct hmap_node hmap_node;
2246
2247 struct ovn_datapath *od;
2248 enum ovn_stage stage;
2249 uint16_t priority;
2250 char *match;
2251 char *actions;
2252 char *stage_hint;
2253 const char *where;
2254 };
2255
2256 static size_t
2257 ovn_lflow_hash(const struct ovn_lflow *lflow)
2258 {
2259 size_t hash = uuid_hash(&lflow->od->key);
2260 hash = hash_2words((lflow->stage << 16) | lflow->priority, hash);
2261 hash = hash_string(lflow->match, hash);
2262 return hash_string(lflow->actions, hash);
2263 }
2264
2265 static bool
2266 ovn_lflow_equal(const struct ovn_lflow *a, const struct ovn_lflow *b)
2267 {
2268 return (a->od == b->od
2269 && a->stage == b->stage
2270 && a->priority == b->priority
2271 && !strcmp(a->match, b->match)
2272 && !strcmp(a->actions, b->actions));
2273 }
2274
2275 static void
2276 ovn_lflow_init(struct ovn_lflow *lflow, struct ovn_datapath *od,
2277 enum ovn_stage stage, uint16_t priority,
2278 char *match, char *actions, char *stage_hint,
2279 const char *where)
2280 {
2281 lflow->od = od;
2282 lflow->stage = stage;
2283 lflow->priority = priority;
2284 lflow->match = match;
2285 lflow->actions = actions;
2286 lflow->stage_hint = stage_hint;
2287 lflow->where = where;
2288 }
2289
2290 /* Adds a row with the specified contents to the Logical_Flow table. */
2291 static void
2292 ovn_lflow_add_at(struct hmap *lflow_map, struct ovn_datapath *od,
2293 enum ovn_stage stage, uint16_t priority,
2294 const char *match, const char *actions,
2295 const char *stage_hint, const char *where)
2296 {
2297 ovs_assert(ovn_stage_to_datapath_type(stage) == ovn_datapath_get_type(od));
2298
2299 struct ovn_lflow *lflow = xmalloc(sizeof *lflow);
2300 ovn_lflow_init(lflow, od, stage, priority,
2301 xstrdup(match), xstrdup(actions),
2302 nullable_xstrdup(stage_hint), where);
2303 hmap_insert(lflow_map, &lflow->hmap_node, ovn_lflow_hash(lflow));
2304 }
2305
2306 /* Adds a row with the specified contents to the Logical_Flow table. */
2307 #define ovn_lflow_add_with_hint(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, \
2308 ACTIONS, STAGE_HINT) \
2309 ovn_lflow_add_at(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS, \
2310 STAGE_HINT, OVS_SOURCE_LOCATOR)
2311
2312 #define ovn_lflow_add(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS) \
2313 ovn_lflow_add_with_hint(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, \
2314 ACTIONS, NULL)
2315
2316 static struct ovn_lflow *
2317 ovn_lflow_find(struct hmap *lflows, struct ovn_datapath *od,
2318 enum ovn_stage stage, uint16_t priority,
2319 const char *match, const char *actions)
2320 {
2321 struct ovn_lflow target;
2322 ovn_lflow_init(&target, od, stage, priority,
2323 CONST_CAST(char *, match), CONST_CAST(char *, actions),
2324 NULL, NULL);
2325
2326 struct ovn_lflow *lflow;
2327 HMAP_FOR_EACH_WITH_HASH (lflow, hmap_node, ovn_lflow_hash(&target),
2328 lflows) {
2329 if (ovn_lflow_equal(lflow, &target)) {
2330 return lflow;
2331 }
2332 }
2333 return NULL;
2334 }
2335
2336 static void
2337 ovn_lflow_destroy(struct hmap *lflows, struct ovn_lflow *lflow)
2338 {
2339 if (lflow) {
2340 hmap_remove(lflows, &lflow->hmap_node);
2341 free(lflow->match);
2342 free(lflow->actions);
2343 free(lflow->stage_hint);
2344 free(lflow);
2345 }
2346 }
2347
2348 /* Appends port security constraints on L2 address field 'eth_addr_field'
2349 * (e.g. "eth.src" or "eth.dst") to 'match'. 'ps_addrs', with 'n_ps_addrs'
2350 * elements, is the collection of port_security constraints from an
2351 * OVN_NB Logical_Switch_Port row generated by extract_lsp_addresses(). */
2352 static void
2353 build_port_security_l2(const char *eth_addr_field,
2354 struct lport_addresses *ps_addrs,
2355 unsigned int n_ps_addrs,
2356 struct ds *match)
2357 {
2358 if (!n_ps_addrs) {
2359 return;
2360 }
2361
2362 ds_put_format(match, " && %s == {", eth_addr_field);
2363
2364 for (size_t i = 0; i < n_ps_addrs; i++) {
2365 ds_put_format(match, "%s ", ps_addrs[i].ea_s);
2366 }
2367 ds_chomp(match, ' ');
2368 ds_put_cstr(match, "}");
2369 }
2370
2371 static void
2372 build_port_security_ipv6_nd_flow(
2373 struct ds *match, struct eth_addr ea, struct ipv6_netaddr *ipv6_addrs,
2374 int n_ipv6_addrs)
2375 {
2376 ds_put_format(match, " && ip6 && nd && ((nd.sll == "ETH_ADDR_FMT" || "
2377 "nd.sll == "ETH_ADDR_FMT") || ((nd.tll == "ETH_ADDR_FMT" || "
2378 "nd.tll == "ETH_ADDR_FMT")", ETH_ADDR_ARGS(eth_addr_zero),
2379 ETH_ADDR_ARGS(ea), ETH_ADDR_ARGS(eth_addr_zero),
2380 ETH_ADDR_ARGS(ea));
2381 if (!n_ipv6_addrs) {
2382 ds_put_cstr(match, "))");
2383 return;
2384 }
2385
2386 char ip6_str[INET6_ADDRSTRLEN + 1];
2387 struct in6_addr lla;
2388 in6_generate_lla(ea, &lla);
2389 memset(ip6_str, 0, sizeof(ip6_str));
2390 ipv6_string_mapped(ip6_str, &lla);
2391 ds_put_format(match, " && (nd.target == %s", ip6_str);
2392
2393 for(int i = 0; i < n_ipv6_addrs; i++) {
2394 memset(ip6_str, 0, sizeof(ip6_str));
2395 ipv6_string_mapped(ip6_str, &ipv6_addrs[i].addr);
2396 ds_put_format(match, " || nd.target == %s", ip6_str);
2397 }
2398
2399 ds_put_format(match, ")))");
2400 }
2401
2402 static void
2403 build_port_security_ipv6_flow(
2404 enum ovn_pipeline pipeline, struct ds *match, struct eth_addr ea,
2405 struct ipv6_netaddr *ipv6_addrs, int n_ipv6_addrs)
2406 {
2407 char ip6_str[INET6_ADDRSTRLEN + 1];
2408
2409 ds_put_format(match, " && %s == {",
2410 pipeline == P_IN ? "ip6.src" : "ip6.dst");
2411
2412 /* Allow link-local address. */
2413 struct in6_addr lla;
2414 in6_generate_lla(ea, &lla);
2415 ipv6_string_mapped(ip6_str, &lla);
2416 ds_put_format(match, "%s, ", ip6_str);
2417
2418 /* Allow ip6.dst=ff00::/8 for multicast packets */
2419 if (pipeline == P_OUT) {
2420 ds_put_cstr(match, "ff00::/8, ");
2421 }
2422 for(int i = 0; i < n_ipv6_addrs; i++) {
2423 ipv6_string_mapped(ip6_str, &ipv6_addrs[i].addr);
2424 ds_put_format(match, "%s, ", ip6_str);
2425 }
2426 /* Replace ", " by "}". */
2427 ds_chomp(match, ' ');
2428 ds_chomp(match, ',');
2429 ds_put_cstr(match, "}");
2430 }
2431
2432 /**
2433 * Build port security constraints on ARP and IPv6 ND fields
2434 * and add logical flows to S_SWITCH_IN_PORT_SEC_ND stage.
2435 *
2436 * For each port security of the logical port, following
2437 * logical flows are added
2438 * - If the port security has no IP (both IPv4 and IPv6) or
2439 * if it has IPv4 address(es)
2440 * - Priority 90 flow to allow ARP packets for known MAC addresses
2441 * in the eth.src and arp.spa fields. If the port security
2442 * has IPv4 addresses, allow known IPv4 addresses in the arp.tpa field.
2443 *
2444 * - If the port security has no IP (both IPv4 and IPv6) or
2445 * if it has IPv6 address(es)
2446 * - Priority 90 flow to allow IPv6 ND packets for known MAC addresses
2447 * in the eth.src and nd.sll/nd.tll fields. If the port security
2448 * has IPv6 addresses, allow known IPv6 addresses in the nd.target field
2449 * for IPv6 Neighbor Advertisement packet.
2450 *
2451 * - Priority 80 flow to drop ARP and IPv6 ND packets.
2452 */
2453 static void
2454 build_port_security_nd(struct ovn_port *op, struct hmap *lflows)
2455 {
2456 struct ds match = DS_EMPTY_INITIALIZER;
2457
2458 for (size_t i = 0; i < op->n_ps_addrs; i++) {
2459 struct lport_addresses *ps = &op->ps_addrs[i];
2460
2461 bool no_ip = !(ps->n_ipv4_addrs || ps->n_ipv6_addrs);
2462
2463 ds_clear(&match);
2464 if (ps->n_ipv4_addrs || no_ip) {
2465 ds_put_format(&match,
2466 "inport == %s && eth.src == %s && arp.sha == %s",
2467 op->json_key, ps->ea_s, ps->ea_s);
2468
2469 if (ps->n_ipv4_addrs) {
2470 ds_put_cstr(&match, " && arp.spa == {");
2471 for (size_t j = 0; j < ps->n_ipv4_addrs; j++) {
2472 /* When the netmask is applied, if the host portion is
2473 * non-zero, the host can only use the specified
2474 * address in the arp.spa. If zero, the host is allowed
2475 * to use any address in the subnet. */
2476 if (ps->ipv4_addrs[j].plen == 32
2477 || ps->ipv4_addrs[j].addr & ~ps->ipv4_addrs[j].mask) {
2478 ds_put_cstr(&match, ps->ipv4_addrs[j].addr_s);
2479 } else {
2480 ds_put_format(&match, "%s/%d",
2481 ps->ipv4_addrs[j].network_s,
2482 ps->ipv4_addrs[j].plen);
2483 }
2484 ds_put_cstr(&match, ", ");
2485 }
2486 ds_chomp(&match, ' ');
2487 ds_chomp(&match, ',');
2488 ds_put_cstr(&match, "}");
2489 }
2490 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_ND, 90,
2491 ds_cstr(&match), "next;");
2492 }
2493
2494 if (ps->n_ipv6_addrs || no_ip) {
2495 ds_clear(&match);
2496 ds_put_format(&match, "inport == %s && eth.src == %s",
2497 op->json_key, ps->ea_s);
2498 build_port_security_ipv6_nd_flow(&match, ps->ea, ps->ipv6_addrs,
2499 ps->n_ipv6_addrs);
2500 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_ND, 90,
2501 ds_cstr(&match), "next;");
2502 }
2503 }
2504
2505 ds_clear(&match);
2506 ds_put_format(&match, "inport == %s && (arp || nd)", op->json_key);
2507 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_ND, 80,
2508 ds_cstr(&match), "drop;");
2509 ds_destroy(&match);
2510 }
2511
2512 /**
2513 * Build port security constraints on IPv4 and IPv6 src and dst fields
2514 * and add logical flows to S_SWITCH_(IN/OUT)_PORT_SEC_IP stage.
2515 *
2516 * For each port security of the logical port, following
2517 * logical flows are added
2518 * - If the port security has IPv4 addresses,
2519 * - Priority 90 flow to allow IPv4 packets for known IPv4 addresses
2520 *
2521 * - If the port security has IPv6 addresses,
2522 * - Priority 90 flow to allow IPv6 packets for known IPv6 addresses
2523 *
2524 * - If the port security has IPv4 addresses or IPv6 addresses or both
2525 * - Priority 80 flow to drop all IPv4 and IPv6 traffic
2526 */
2527 static void
2528 build_port_security_ip(enum ovn_pipeline pipeline, struct ovn_port *op,
2529 struct hmap *lflows)
2530 {
2531 char *port_direction;
2532 enum ovn_stage stage;
2533 if (pipeline == P_IN) {
2534 port_direction = "inport";
2535 stage = S_SWITCH_IN_PORT_SEC_IP;
2536 } else {
2537 port_direction = "outport";
2538 stage = S_SWITCH_OUT_PORT_SEC_IP;
2539 }
2540
2541 for (size_t i = 0; i < op->n_ps_addrs; i++) {
2542 struct lport_addresses *ps = &op->ps_addrs[i];
2543
2544 if (!(ps->n_ipv4_addrs || ps->n_ipv6_addrs)) {
2545 continue;
2546 }
2547
2548 if (ps->n_ipv4_addrs) {
2549 struct ds match = DS_EMPTY_INITIALIZER;
2550 if (pipeline == P_IN) {
2551 /* Permit use of the unspecified address for DHCP discovery */
2552 struct ds dhcp_match = DS_EMPTY_INITIALIZER;
2553 ds_put_format(&dhcp_match, "inport == %s"
2554 " && eth.src == %s"
2555 " && ip4.src == 0.0.0.0"
2556 " && ip4.dst == 255.255.255.255"
2557 " && udp.src == 68 && udp.dst == 67",
2558 op->json_key, ps->ea_s);
2559 ovn_lflow_add(lflows, op->od, stage, 90,
2560 ds_cstr(&dhcp_match), "next;");
2561 ds_destroy(&dhcp_match);
2562 ds_put_format(&match, "inport == %s && eth.src == %s"
2563 " && ip4.src == {", op->json_key,
2564 ps->ea_s);
2565 } else {
2566 ds_put_format(&match, "outport == %s && eth.dst == %s"
2567 " && ip4.dst == {255.255.255.255, 224.0.0.0/4, ",
2568 op->json_key, ps->ea_s);
2569 }
2570
2571 for (int j = 0; j < ps->n_ipv4_addrs; j++) {
2572 ovs_be32 mask = ps->ipv4_addrs[j].mask;
2573 /* When the netmask is applied, if the host portion is
2574 * non-zero, the host can only use the specified
2575 * address. If zero, the host is allowed to use any
2576 * address in the subnet.
2577 */
2578 if (ps->ipv4_addrs[j].plen == 32
2579 || ps->ipv4_addrs[j].addr & ~mask) {
2580 ds_put_format(&match, "%s", ps->ipv4_addrs[j].addr_s);
2581 if (pipeline == P_OUT && ps->ipv4_addrs[j].plen != 32) {
2582 /* Host is also allowed to receive packets to the
2583 * broadcast address in the specified subnet. */
2584 ds_put_format(&match, ", %s",
2585 ps->ipv4_addrs[j].bcast_s);
2586 }
2587 } else {
2588 /* host portion is zero */
2589 ds_put_format(&match, "%s/%d", ps->ipv4_addrs[j].network_s,
2590 ps->ipv4_addrs[j].plen);
2591 }
2592 ds_put_cstr(&match, ", ");
2593 }
2594
2595 /* Replace ", " by "}". */
2596 ds_chomp(&match, ' ');
2597 ds_chomp(&match, ',');
2598 ds_put_cstr(&match, "}");
2599 ovn_lflow_add(lflows, op->od, stage, 90, ds_cstr(&match), "next;");
2600 ds_destroy(&match);
2601 }
2602
2603 if (ps->n_ipv6_addrs) {
2604 struct ds match = DS_EMPTY_INITIALIZER;
2605 if (pipeline == P_IN) {
2606 /* Permit use of unspecified address for duplicate address
2607 * detection */
2608 struct ds dad_match = DS_EMPTY_INITIALIZER;
2609 ds_put_format(&dad_match, "inport == %s"
2610 " && eth.src == %s"
2611 " && ip6.src == ::"
2612 " && ip6.dst == ff02::/16"
2613 " && icmp6.type == {131, 135, 143}", op->json_key,
2614 ps->ea_s);
2615 ovn_lflow_add(lflows, op->od, stage, 90,
2616 ds_cstr(&dad_match), "next;");
2617 ds_destroy(&dad_match);
2618 }
2619 ds_put_format(&match, "%s == %s && %s == %s",
2620 port_direction, op->json_key,
2621 pipeline == P_IN ? "eth.src" : "eth.dst", ps->ea_s);
2622 build_port_security_ipv6_flow(pipeline, &match, ps->ea,
2623 ps->ipv6_addrs, ps->n_ipv6_addrs);
2624 ovn_lflow_add(lflows, op->od, stage, 90,
2625 ds_cstr(&match), "next;");
2626 ds_destroy(&match);
2627 }
2628
2629 char *match = xasprintf("%s == %s && %s == %s && ip",
2630 port_direction, op->json_key,
2631 pipeline == P_IN ? "eth.src" : "eth.dst",
2632 ps->ea_s);
2633 ovn_lflow_add(lflows, op->od, stage, 80, match, "drop;");
2634 free(match);
2635 }
2636
2637 }
2638
2639 static bool
2640 lsp_is_enabled(const struct nbrec_logical_switch_port *lsp)
2641 {
2642 return !lsp->enabled || *lsp->enabled;
2643 }
2644
2645 static bool
2646 lsp_is_up(const struct nbrec_logical_switch_port *lsp)
2647 {
2648 return !lsp->up || *lsp->up;
2649 }
2650
2651 static bool
2652 build_dhcpv4_action(struct ovn_port *op, ovs_be32 offer_ip,
2653 struct ds *options_action, struct ds *response_action,
2654 struct ds *ipv4_addr_match)
2655 {
2656 if (!op->nbsp->dhcpv4_options) {
2657 /* CMS has disabled native DHCPv4 for this lport. */
2658 return false;
2659 }
2660
2661 ovs_be32 host_ip, mask;
2662 char *error = ip_parse_masked(op->nbsp->dhcpv4_options->cidr, &host_ip,
2663 &mask);
2664 if (error || ((offer_ip ^ host_ip) & mask)) {
2665 /* Either
2666 * - cidr defined is invalid or
2667 * - the offer ip of the logical port doesn't belong to the cidr
2668 * defined in the DHCPv4 options.
2669 * */
2670 free(error);
2671 return false;
2672 }
2673
2674 const char *server_ip = smap_get(
2675 &op->nbsp->dhcpv4_options->options, "server_id");
2676 const char *server_mac = smap_get(
2677 &op->nbsp->dhcpv4_options->options, "server_mac");
2678 const char *lease_time = smap_get(
2679 &op->nbsp->dhcpv4_options->options, "lease_time");
2680
2681 if (!(server_ip && server_mac && lease_time)) {
2682 /* "server_id", "server_mac" and "lease_time" should be
2683 * present in the dhcp_options. */
2684 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2685 VLOG_WARN_RL(&rl, "Required DHCPv4 options not defined for lport - %s",
2686 op->json_key);
2687 return false;
2688 }
2689
2690 struct smap dhcpv4_options = SMAP_INITIALIZER(&dhcpv4_options);
2691 smap_clone(&dhcpv4_options, &op->nbsp->dhcpv4_options->options);
2692
2693 /* server_mac is not DHCPv4 option, delete it from the smap. */
2694 smap_remove(&dhcpv4_options, "server_mac");
2695 char *netmask = xasprintf(IP_FMT, IP_ARGS(mask));
2696 smap_add(&dhcpv4_options, "netmask", netmask);
2697 free(netmask);
2698
2699 ds_put_format(options_action,
2700 REGBIT_DHCP_OPTS_RESULT" = put_dhcp_opts(offerip = "
2701 IP_FMT", ", IP_ARGS(offer_ip));
2702
2703 /* We're not using SMAP_FOR_EACH because we want a consistent order of the
2704 * options on different architectures (big or little endian, SSE4.2) */
2705 const struct smap_node **sorted_opts = smap_sort(&dhcpv4_options);
2706 for (size_t i = 0; i < smap_count(&dhcpv4_options); i++) {
2707 const struct smap_node *node = sorted_opts[i];
2708 ds_put_format(options_action, "%s = %s, ", node->key, node->value);
2709 }
2710 free(sorted_opts);
2711
2712 ds_chomp(options_action, ' ');
2713 ds_chomp(options_action, ',');
2714 ds_put_cstr(options_action, "); next;");
2715
2716 ds_put_format(response_action, "eth.dst = eth.src; eth.src = %s; "
2717 "ip4.dst = "IP_FMT"; ip4.src = %s; udp.src = 67; "
2718 "udp.dst = 68; outport = inport; flags.loopback = 1; "
2719 "output;",
2720 server_mac, IP_ARGS(offer_ip), server_ip);
2721
2722 ds_put_format(ipv4_addr_match,
2723 "ip4.src == "IP_FMT" && ip4.dst == {%s, 255.255.255.255}",
2724 IP_ARGS(offer_ip), server_ip);
2725 smap_destroy(&dhcpv4_options);
2726 return true;
2727 }
2728
2729 static bool
2730 build_dhcpv6_action(struct ovn_port *op, struct in6_addr *offer_ip,
2731 struct ds *options_action, struct ds *response_action)
2732 {
2733 if (!op->nbsp->dhcpv6_options) {
2734 /* CMS has disabled native DHCPv6 for this lport. */
2735 return false;
2736 }
2737
2738 struct in6_addr host_ip, mask;
2739
2740 char *error = ipv6_parse_masked(op->nbsp->dhcpv6_options->cidr, &host_ip,
2741 &mask);
2742 if (error) {
2743 free(error);
2744 return false;
2745 }
2746 struct in6_addr ip6_mask = ipv6_addr_bitxor(offer_ip, &host_ip);
2747 ip6_mask = ipv6_addr_bitand(&ip6_mask, &mask);
2748 if (!ipv6_mask_is_any(&ip6_mask)) {
2749 /* offer_ip doesn't belongs to the cidr defined in lport's DHCPv6
2750 * options.*/
2751 return false;
2752 }
2753
2754 const struct smap *options_map = &op->nbsp->dhcpv6_options->options;
2755 /* "server_id" should be the MAC address. */
2756 const char *server_mac = smap_get(options_map, "server_id");
2757 struct eth_addr ea;
2758 if (!server_mac || !eth_addr_from_string(server_mac, &ea)) {
2759 /* "server_id" should be present in the dhcpv6_options. */
2760 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
2761 VLOG_WARN_RL(&rl, "server_id not present in the DHCPv6 options"
2762 " for lport %s", op->json_key);
2763 return false;
2764 }
2765
2766 /* Get the link local IP of the DHCPv6 server from the server MAC. */
2767 struct in6_addr lla;
2768 in6_generate_lla(ea, &lla);
2769
2770 char server_ip[INET6_ADDRSTRLEN + 1];
2771 ipv6_string_mapped(server_ip, &lla);
2772
2773 char ia_addr[INET6_ADDRSTRLEN + 1];
2774 ipv6_string_mapped(ia_addr, offer_ip);
2775
2776 ds_put_format(options_action,
2777 REGBIT_DHCP_OPTS_RESULT" = put_dhcpv6_opts(");
2778
2779 /* Check whether the dhcpv6 options should be configured as stateful.
2780 * Only reply with ia_addr option for dhcpv6 stateful address mode. */
2781 if (!smap_get_bool(options_map, "dhcpv6_stateless", false)) {
2782 ipv6_string_mapped(ia_addr, offer_ip);
2783 ds_put_format(options_action, "ia_addr = %s, ", ia_addr);
2784 }
2785
2786 /* We're not using SMAP_FOR_EACH because we want a consistent order of the
2787 * options on different architectures (big or little endian, SSE4.2) */
2788 const struct smap_node **sorted_opts = smap_sort(options_map);
2789 for (size_t i = 0; i < smap_count(options_map); i++) {
2790 const struct smap_node *node = sorted_opts[i];
2791 if (strcmp(node->key, "dhcpv6_stateless")) {
2792 ds_put_format(options_action, "%s = %s, ", node->key, node->value);
2793 }
2794 }
2795 free(sorted_opts);
2796
2797 ds_chomp(options_action, ' ');
2798 ds_chomp(options_action, ',');
2799 ds_put_cstr(options_action, "); next;");
2800
2801 ds_put_format(response_action, "eth.dst = eth.src; eth.src = %s; "
2802 "ip6.dst = ip6.src; ip6.src = %s; udp.src = 547; "
2803 "udp.dst = 546; outport = inport; flags.loopback = 1; "
2804 "output;",
2805 server_mac, server_ip);
2806
2807 return true;
2808 }
2809
2810 static bool
2811 has_stateful_acl(struct ovn_datapath *od)
2812 {
2813 for (size_t i = 0; i < od->nbs->n_acls; i++) {
2814 struct nbrec_acl *acl = od->nbs->acls[i];
2815 if (!strcmp(acl->action, "allow-related")) {
2816 return true;
2817 }
2818 }
2819
2820 return false;
2821 }
2822
2823 static void
2824 build_pre_acls(struct ovn_datapath *od, struct hmap *lflows)
2825 {
2826 bool has_stateful = has_stateful_acl(od);
2827
2828 /* Ingress and Egress Pre-ACL Table (Priority 0): Packets are
2829 * allowed by default. */
2830 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 0, "1", "next;");
2831 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 0, "1", "next;");
2832
2833 /* If there are any stateful ACL rules in this datapath, we must
2834 * send all IP packets through the conntrack action, which handles
2835 * defragmentation, in order to match L4 headers. */
2836 if (has_stateful) {
2837 for (size_t i = 0; i < od->n_router_ports; i++) {
2838 struct ovn_port *op = od->router_ports[i];
2839 /* Can't use ct() for router ports. Consider the
2840 * following configuration: lp1(10.0.0.2) on
2841 * hostA--ls1--lr0--ls2--lp2(10.0.1.2) on hostB, For a
2842 * ping from lp1 to lp2, First, the response will go
2843 * through ct() with a zone for lp2 in the ls2 ingress
2844 * pipeline on hostB. That ct zone knows about this
2845 * connection. Next, it goes through ct() with the zone
2846 * for the router port in the egress pipeline of ls2 on
2847 * hostB. This zone does not know about the connection,
2848 * as the icmp request went through the logical router
2849 * on hostA, not hostB. This would only work with
2850 * distributed conntrack state across all chassis. */
2851 struct ds match_in = DS_EMPTY_INITIALIZER;
2852 struct ds match_out = DS_EMPTY_INITIALIZER;
2853
2854 ds_put_format(&match_in, "ip && inport == %s", op->json_key);
2855 ds_put_format(&match_out, "ip && outport == %s", op->json_key);
2856 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110,
2857 ds_cstr(&match_in), "next;");
2858 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 110,
2859 ds_cstr(&match_out), "next;");
2860
2861 ds_destroy(&match_in);
2862 ds_destroy(&match_out);
2863 }
2864 if (od->localnet_port) {
2865 struct ds match_in = DS_EMPTY_INITIALIZER;
2866 struct ds match_out = DS_EMPTY_INITIALIZER;
2867
2868 ds_put_format(&match_in, "ip && inport == %s",
2869 od->localnet_port->json_key);
2870 ds_put_format(&match_out, "ip && outport == %s",
2871 od->localnet_port->json_key);
2872 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110,
2873 ds_cstr(&match_in), "next;");
2874 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 110,
2875 ds_cstr(&match_out), "next;");
2876
2877 ds_destroy(&match_in);
2878 ds_destroy(&match_out);
2879 }
2880
2881 /* Ingress and Egress Pre-ACL Table (Priority 110).
2882 *
2883 * Not to do conntrack on ND packets. */
2884 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110, "nd", "next;");
2885 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 110, "nd", "next;");
2886
2887 /* Ingress and Egress Pre-ACL Table (Priority 100).
2888 *
2889 * Regardless of whether the ACL is "from-lport" or "to-lport",
2890 * we need rules in both the ingress and egress table, because
2891 * the return traffic needs to be followed.
2892 *
2893 * 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
2894 * it to conntrack for tracking and defragmentation. */
2895 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 100, "ip",
2896 REGBIT_CONNTRACK_DEFRAG" = 1; next;");
2897 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 100, "ip",
2898 REGBIT_CONNTRACK_DEFRAG" = 1; next;");
2899 }
2900 }
2901
2902 /* For a 'key' of the form "IP:port" or just "IP", sets 'port' and
2903 * 'ip_address'. The caller must free() the memory allocated for
2904 * 'ip_address'. */
2905 static void
2906 ip_address_and_port_from_lb_key(const char *key, char **ip_address,
2907 uint16_t *port, int *addr_family)
2908 {
2909 struct sockaddr_storage ss;
2910 char ip_addr_buf[INET6_ADDRSTRLEN];
2911 char *error;
2912
2913 error = ipv46_parse(key, PORT_OPTIONAL, &ss);
2914 if (error) {
2915 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
2916 VLOG_WARN_RL(&rl, "bad ip address or port for load balancer key %s",
2917 key);
2918 free(error);
2919 return;
2920 }
2921
2922 if (ss.ss_family == AF_INET) {
2923 struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *, &ss);
2924 *port = sin->sin_port == 0 ? 0 : ntohs(sin->sin_port);
2925 inet_ntop(AF_INET, &sin->sin_addr, ip_addr_buf, sizeof ip_addr_buf);
2926 } else {
2927 struct sockaddr_in6 *sin6 = ALIGNED_CAST(struct sockaddr_in6 *, &ss);
2928 *port = sin6->sin6_port == 0 ? 0 : ntohs(sin6->sin6_port);
2929 inet_ntop(AF_INET6, &sin6->sin6_addr, ip_addr_buf, sizeof ip_addr_buf);
2930 }
2931
2932 *ip_address = xstrdup(ip_addr_buf);
2933 *addr_family = ss.ss_family;
2934 }
2935
2936 /*
2937 * Returns true if logical switch is configured with DNS records, false
2938 * otherwise.
2939 */
2940 static bool
2941 ls_has_dns_records(const struct nbrec_logical_switch *nbs)
2942 {
2943 for (size_t i = 0; i < nbs->n_dns_records; i++) {
2944 if (!smap_is_empty(&nbs->dns_records[i]->records)) {
2945 return true;
2946 }
2947 }
2948
2949 return false;
2950 }
2951
2952 static void
2953 build_pre_lb(struct ovn_datapath *od, struct hmap *lflows)
2954 {
2955 /* Allow all packets to go to next tables by default. */
2956 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 0, "1", "next;");
2957 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 0, "1", "next;");
2958
2959 struct sset all_ips = SSET_INITIALIZER(&all_ips);
2960 bool vip_configured = false;
2961 int addr_family = AF_INET;
2962 for (int i = 0; i < od->nbs->n_load_balancer; i++) {
2963 struct nbrec_load_balancer *lb = od->nbs->load_balancer[i];
2964 struct smap *vips = &lb->vips;
2965 struct smap_node *node;
2966
2967 SMAP_FOR_EACH (node, vips) {
2968 vip_configured = true;
2969
2970 /* node->key contains IP:port or just IP. */
2971 char *ip_address = NULL;
2972 uint16_t port;
2973 ip_address_and_port_from_lb_key(node->key, &ip_address, &port,
2974 &addr_family);
2975 if (!ip_address) {
2976 continue;
2977 }
2978
2979 if (!sset_contains(&all_ips, ip_address)) {
2980 sset_add(&all_ips, ip_address);
2981 }
2982
2983 free(ip_address);
2984
2985 /* Ignore L4 port information in the key because fragmented packets
2986 * may not have L4 information. The pre-stateful table will send
2987 * the packet through ct() action to de-fragment. In stateful
2988 * table, we will eventually look at L4 information. */
2989 }
2990 }
2991
2992 /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
2993 * packet to conntrack for defragmentation. */
2994 const char *ip_address;
2995 SSET_FOR_EACH(ip_address, &all_ips) {
2996 char *match;
2997
2998 if (addr_family == AF_INET) {
2999 match = xasprintf("ip && ip4.dst == %s", ip_address);
3000 } else {
3001 match = xasprintf("ip && ip6.dst == %s", ip_address);
3002 }
3003 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
3004 100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
3005 free(match);
3006 }
3007
3008 sset_destroy(&all_ips);
3009
3010 if (vip_configured) {
3011 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
3012 100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
3013 }
3014 }
3015
3016 static void
3017 build_pre_stateful(struct ovn_datapath *od, struct hmap *lflows)
3018 {
3019 /* Ingress and Egress pre-stateful Table (Priority 0): Packets are
3020 * allowed by default. */
3021 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 0, "1", "next;");
3022 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_STATEFUL, 0, "1", "next;");
3023
3024 /* If REGBIT_CONNTRACK_DEFRAG is set as 1, then the packets should be
3025 * sent to conntrack for tracking and defragmentation. */
3026 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 100,
3027 REGBIT_CONNTRACK_DEFRAG" == 1", "ct_next;");
3028 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_STATEFUL, 100,
3029 REGBIT_CONNTRACK_DEFRAG" == 1", "ct_next;");
3030 }
3031
3032 static void
3033 build_acl_log(struct ds *actions, const struct nbrec_acl *acl)
3034 {
3035 if (!acl->log) {
3036 return;
3037 }
3038
3039 ds_put_cstr(actions, "log(");
3040
3041 if (acl->name) {
3042 ds_put_format(actions, "name=\"%s\", ", acl->name);
3043 }
3044
3045 /* If a severity level isn't specified, default to "info". */
3046 if (acl->severity) {
3047 ds_put_format(actions, "severity=%s, ", acl->severity);
3048 } else {
3049 ds_put_format(actions, "severity=info, ");
3050 }
3051
3052 if (!strcmp(acl->action, "drop")) {
3053 ds_put_cstr(actions, "verdict=drop, ");
3054 } else if (!strcmp(acl->action, "reject")) {
3055 ds_put_cstr(actions, "verdict=reject, ");
3056 } else if (!strcmp(acl->action, "allow")
3057 || !strcmp(acl->action, "allow-related")) {
3058 ds_put_cstr(actions, "verdict=allow, ");
3059 }
3060
3061 ds_chomp(actions, ' ');
3062 ds_chomp(actions, ',');
3063 ds_put_cstr(actions, "); ");
3064 }
3065
3066 static void
3067 build_acls(struct ovn_datapath *od, struct hmap *lflows)
3068 {
3069 bool has_stateful = has_stateful_acl(od);
3070
3071 /* Ingress and Egress ACL Table (Priority 0): Packets are allowed by
3072 * default. A related rule at priority 1 is added below if there
3073 * are any stateful ACLs in this datapath. */
3074 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 0, "1", "next;");
3075 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 0, "1", "next;");
3076
3077 if (has_stateful) {
3078 /* Ingress and Egress ACL Table (Priority 1).
3079 *
3080 * By default, traffic is allowed. This is partially handled by
3081 * the Priority 0 ACL flows added earlier, but we also need to
3082 * commit IP flows. This is because, while the initiater's
3083 * direction may not have any stateful rules, the server's may
3084 * and then its return traffic would not have an associated
3085 * conntrack entry and would return "+invalid".
3086 *
3087 * We use "ct_commit" for a connection that is not already known
3088 * by the connection tracker. Once a connection is committed,
3089 * subsequent packets will hit the flow at priority 0 that just
3090 * uses "next;"
3091 *
3092 * We also check for established connections that have ct_label.blocked
3093 * set on them. That's a connection that was disallowed, but is
3094 * now allowed by policy again since it hit this default-allow flow.
3095 * We need to set ct_label.blocked=0 to let the connection continue,
3096 * which will be done by ct_commit() in the "stateful" stage.
3097 * Subsequent packets will hit the flow at priority 0 that just
3098 * uses "next;". */
3099 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 1,
3100 "ip && (!ct.est || (ct.est && ct_label.blocked == 1))",
3101 REGBIT_CONNTRACK_COMMIT" = 1; next;");
3102 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 1,
3103 "ip && (!ct.est || (ct.est && ct_label.blocked == 1))",
3104 REGBIT_CONNTRACK_COMMIT" = 1; next;");
3105
3106 /* Ingress and Egress ACL Table (Priority 65535).
3107 *
3108 * Always drop traffic that's in an invalid state. Also drop
3109 * reply direction packets for connections that have been marked
3110 * for deletion (bit 0 of ct_label is set).
3111 *
3112 * This is enforced at a higher priority than ACLs can be defined. */
3113 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
3114 "ct.inv || (ct.est && ct.rpl && ct_label.blocked == 1)",
3115 "drop;");
3116 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
3117 "ct.inv || (ct.est && ct.rpl && ct_label.blocked == 1)",
3118 "drop;");
3119
3120 /* Ingress and Egress ACL Table (Priority 65535).
3121 *
3122 * Allow reply traffic that is part of an established
3123 * conntrack entry that has not been marked for deletion
3124 * (bit 0 of ct_label). We only match traffic in the
3125 * reply direction because we want traffic in the request
3126 * direction to hit the currently defined policy from ACLs.
3127 *
3128 * This is enforced at a higher priority than ACLs can be defined. */
3129 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
3130 "ct.est && !ct.rel && !ct.new && !ct.inv "
3131 "&& ct.rpl && ct_label.blocked == 0",
3132 "next;");
3133 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
3134 "ct.est && !ct.rel && !ct.new && !ct.inv "
3135 "&& ct.rpl && ct_label.blocked == 0",
3136 "next;");
3137
3138 /* Ingress and Egress ACL Table (Priority 65535).
3139 *
3140 * Allow traffic that is related to an existing conntrack entry that
3141 * has not been marked for deletion (bit 0 of ct_label).
3142 *
3143 * This is enforced at a higher priority than ACLs can be defined.
3144 *
3145 * NOTE: This does not support related data sessions (eg,
3146 * a dynamically negotiated FTP data channel), but will allow
3147 * related traffic such as an ICMP Port Unreachable through
3148 * that's generated from a non-listening UDP port. */
3149 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
3150 "!ct.est && ct.rel && !ct.new && !ct.inv "
3151 "&& ct_label.blocked == 0",
3152 "next;");
3153 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
3154 "!ct.est && ct.rel && !ct.new && !ct.inv "
3155 "&& ct_label.blocked == 0",
3156 "next;");
3157
3158 /* Ingress and Egress ACL Table (Priority 65535).
3159 *
3160 * Not to do conntrack on ND packets. */
3161 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX, "nd", "next;");
3162 ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX, "nd", "next;");
3163 }
3164
3165 /* Ingress or Egress ACL Table (Various priorities). */
3166 for (size_t i = 0; i < od->nbs->n_acls; i++) {
3167 struct nbrec_acl *acl = od->nbs->acls[i];
3168 bool ingress = !strcmp(acl->direction, "from-lport") ? true :false;
3169 enum ovn_stage stage = ingress ? S_SWITCH_IN_ACL : S_SWITCH_OUT_ACL;
3170
3171 char *stage_hint = xasprintf("%08x", acl->header_.uuid.parts[0]);
3172 if (!strcmp(acl->action, "allow")
3173 || !strcmp(acl->action, "allow-related")) {
3174 /* If there are any stateful flows, we must even commit "allow"
3175 * actions. This is because, while the initiater's
3176 * direction may not have any stateful rules, the server's
3177 * may and then its return traffic would not have an
3178 * associated conntrack entry and would return "+invalid". */
3179 if (!has_stateful) {
3180 struct ds actions = DS_EMPTY_INITIALIZER;
3181 build_acl_log(&actions, acl);
3182 ds_put_cstr(&actions, "next;");
3183 ovn_lflow_add_with_hint(lflows, od, stage,
3184 acl->priority + OVN_ACL_PRI_OFFSET,
3185 acl->match, ds_cstr(&actions),
3186 stage_hint);
3187 ds_destroy(&actions);
3188 } else {
3189 struct ds match = DS_EMPTY_INITIALIZER;
3190 struct ds actions = DS_EMPTY_INITIALIZER;
3191
3192 /* Commit the connection tracking entry if it's a new
3193 * connection that matches this ACL. After this commit,
3194 * the reply traffic is allowed by a flow we create at
3195 * priority 65535, defined earlier.
3196 *
3197 * It's also possible that a known connection was marked for
3198 * deletion after a policy was deleted, but the policy was
3199 * re-added while that connection is still known. We catch
3200 * that case here and un-set ct_label.blocked (which will be done
3201 * by ct_commit in the "stateful" stage) to indicate that the
3202 * connection should be allowed to resume.
3203 */
3204 ds_put_format(&match, "((ct.new && !ct.est)"
3205 " || (!ct.new && ct.est && !ct.rpl "
3206 "&& ct_label.blocked == 1)) "
3207 "&& (%s)", acl->match);
3208 ds_put_cstr(&actions, REGBIT_CONNTRACK_COMMIT" = 1; ");
3209 build_acl_log(&actions, acl);
3210 ds_put_cstr(&actions, "next;");
3211 ovn_lflow_add_with_hint(lflows, od, stage,
3212 acl->priority + OVN_ACL_PRI_OFFSET,
3213 ds_cstr(&match),
3214 ds_cstr(&actions),
3215 stage_hint);
3216
3217 /* Match on traffic in the request direction for an established
3218 * connection tracking entry that has not been marked for
3219 * deletion. There is no need to commit here, so we can just
3220 * proceed to the next table. We use this to ensure that this
3221 * connection is still allowed by the currently defined
3222 * policy. */
3223 ds_clear(&match);
3224 ds_clear(&actions);
3225 ds_put_format(&match,
3226 "!ct.new && ct.est && !ct.rpl"
3227 " && ct_label.blocked == 0 && (%s)",
3228 acl->match);
3229
3230 build_acl_log(&actions, acl);
3231 ds_put_cstr(&actions, "next;");
3232 ovn_lflow_add_with_hint(lflows, od, stage,
3233 acl->priority + OVN_ACL_PRI_OFFSET,
3234 ds_cstr(&match), ds_cstr(&actions),
3235 stage_hint);
3236
3237 ds_destroy(&match);
3238 ds_destroy(&actions);
3239 }
3240 } else if (!strcmp(acl->action, "drop")
3241 || !strcmp(acl->action, "reject")) {
3242 struct ds match = DS_EMPTY_INITIALIZER;
3243 struct ds actions = DS_EMPTY_INITIALIZER;
3244
3245 /* XXX Need to support "reject", treat it as "drop;" for now. */
3246 if (!strcmp(acl->action, "reject")) {
3247 VLOG_INFO("reject is not a supported action");
3248 }
3249
3250 /* The implementation of "drop" differs if stateful ACLs are in
3251 * use for this datapath. In that case, the actions differ
3252 * depending on whether the connection was previously committed
3253 * to the connection tracker with ct_commit. */
3254 if (has_stateful) {
3255 /* If the packet is not part of an established connection, then
3256 * we can simply drop it. */
3257 ds_put_format(&match,
3258 "(!ct.est || (ct.est && ct_label.blocked == 1)) "
3259 "&& (%s)",
3260 acl->match);
3261 ds_clear(&actions);
3262 build_acl_log(&actions, acl);
3263 ds_put_cstr(&actions, "/* drop */");
3264 ovn_lflow_add_with_hint(lflows, od, stage,
3265 acl->priority + OVN_ACL_PRI_OFFSET,
3266 ds_cstr(&match), ds_cstr(&actions),
3267 stage_hint);
3268
3269 /* For an existing connection without ct_label set, we've
3270 * encountered a policy change. ACLs previously allowed
3271 * this connection and we committed the connection tracking
3272 * entry. Current policy says that we should drop this
3273 * connection. First, we set bit 0 of ct_label to indicate
3274 * that this connection is set for deletion. By not
3275 * specifying "next;", we implicitly drop the packet after
3276 * updating conntrack state. We would normally defer
3277 * ct_commit() to the "stateful" stage, but since we're
3278 * dropping the packet, we go ahead and do it here. */
3279 ds_clear(&match);
3280 ds_clear(&actions);
3281 ds_put_format(&match,
3282 "ct.est && ct_label.blocked == 0 && (%s)",
3283 acl->match);
3284 ds_put_cstr(&actions, "ct_commit(ct_label=1/1); ");
3285 build_acl_log(&actions, acl);
3286 ds_put_cstr(&actions, "/* drop */");
3287 ovn_lflow_add_with_hint(lflows, od, stage,
3288 acl->priority + OVN_ACL_PRI_OFFSET,
3289 ds_cstr(&match), ds_cstr(&actions),
3290 stage_hint);
3291
3292 } else {
3293 /* There are no stateful ACLs in use on this datapath,
3294 * so a "drop" ACL is simply the "drop" logical flow action
3295 * in all cases. */
3296 ds_clear(&actions);
3297 build_acl_log(&actions, acl);
3298 ds_put_cstr(&actions, "/* drop */");
3299 ovn_lflow_add_with_hint(lflows, od, stage,
3300 acl->priority + OVN_ACL_PRI_OFFSET,
3301 acl->match, ds_cstr(&actions),
3302 stage_hint);
3303 }
3304 ds_destroy(&match);
3305 ds_destroy(&actions);
3306 }
3307 free(stage_hint);
3308 }
3309
3310 /* Add 34000 priority flow to allow DHCP reply from ovn-controller to all
3311 * logical ports of the datapath if the CMS has configured DHCPv4 options.
3312 * */
3313 for (size_t i = 0; i < od->nbs->n_ports; i++) {
3314 if (od->nbs->ports[i]->dhcpv4_options) {
3315 const char *server_id = smap_get(
3316 &od->nbs->ports[i]->dhcpv4_options->options, "server_id");
3317 const char *server_mac = smap_get(
3318 &od->nbs->ports[i]->dhcpv4_options->options, "server_mac");
3319 const char *lease_time = smap_get(
3320 &od->nbs->ports[i]->dhcpv4_options->options, "lease_time");
3321 if (server_id && server_mac && lease_time) {
3322 struct ds match = DS_EMPTY_INITIALIZER;
3323 const char *actions =
3324 has_stateful ? "ct_commit; next;" : "next;";
3325 ds_put_format(&match, "outport == \"%s\" && eth.src == %s "
3326 "&& ip4.src == %s && udp && udp.src == 67 "
3327 "&& udp.dst == 68", od->nbs->ports[i]->name,
3328 server_mac, server_id);
3329 ovn_lflow_add(
3330 lflows, od, S_SWITCH_OUT_ACL, 34000, ds_cstr(&match),
3331 actions);
3332 ds_destroy(&match);
3333 }
3334 }
3335
3336 if (od->nbs->ports[i]->dhcpv6_options) {
3337 const char *server_mac = smap_get(
3338 &od->nbs->ports[i]->dhcpv6_options->options, "server_id");
3339 struct eth_addr ea;
3340 if (server_mac && eth_addr_from_string(server_mac, &ea)) {
3341 /* Get the link local IP of the DHCPv6 server from the
3342 * server MAC. */
3343 struct in6_addr lla;
3344 in6_generate_lla(ea, &lla);
3345
3346 char server_ip[INET6_ADDRSTRLEN + 1];
3347 ipv6_string_mapped(server_ip, &lla);
3348
3349 struct ds match = DS_EMPTY_INITIALIZER;
3350 const char *actions = has_stateful ? "ct_commit; next;" :
3351 "next;";
3352 ds_put_format(&match, "outport == \"%s\" && eth.src == %s "
3353 "&& ip6.src == %s && udp && udp.src == 547 "
3354 "&& udp.dst == 546", od->nbs->ports[i]->name,
3355 server_mac, server_ip);
3356 ovn_lflow_add(
3357 lflows, od, S_SWITCH_OUT_ACL, 34000, ds_cstr(&match),
3358 actions);
3359 ds_destroy(&match);
3360 }
3361 }
3362 }
3363
3364 /* Add a 34000 priority flow to advance the DNS reply from ovn-controller,
3365 * if the CMS has configured DNS records for the datapath.
3366 */
3367 if (ls_has_dns_records(od->nbs)) {
3368 const char *actions = has_stateful ? "ct_commit; next;" : "next;";
3369 ovn_lflow_add(
3370 lflows, od, S_SWITCH_OUT_ACL, 34000, "udp.src == 53",
3371 actions);
3372 }
3373 }
3374
3375 static void
3376 build_qos(struct ovn_datapath *od, struct hmap *lflows) {
3377 ovn_lflow_add(lflows, od, S_SWITCH_IN_QOS_MARK, 0, "1", "next;");
3378 ovn_lflow_add(lflows, od, S_SWITCH_OUT_QOS_MARK, 0, "1", "next;");
3379
3380 for (size_t i = 0; i < od->nbs->n_qos_rules; i++) {
3381 struct nbrec_qos *qos = od->nbs->qos_rules[i];
3382 bool ingress = !strcmp(qos->direction, "from-lport") ? true :false;
3383 enum ovn_stage stage = ingress ? S_SWITCH_IN_QOS_MARK : S_SWITCH_OUT_QOS_MARK;
3384
3385 if (!strcmp(qos->key_action, "dscp")) {
3386 struct ds dscp_action = DS_EMPTY_INITIALIZER;
3387
3388 ds_put_format(&dscp_action, "ip.dscp = %d; next;",
3389 (uint8_t)qos->value_action);
3390 ovn_lflow_add(lflows, od, stage,
3391 qos->priority,
3392 qos->match, ds_cstr(&dscp_action));
3393 ds_destroy(&dscp_action);
3394 }
3395 }
3396 }
3397
3398 static void
3399 build_lb(struct ovn_datapath *od, struct hmap *lflows)
3400 {
3401 /* Ingress and Egress LB Table (Priority 0): Packets are allowed by
3402 * default. */
3403 ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 0, "1", "next;");
3404 ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, 0, "1", "next;");
3405
3406 if (od->nbs->load_balancer) {
3407 /* Ingress and Egress LB Table (Priority 65535).
3408 *
3409 * Send established traffic through conntrack for just NAT. */
3410 ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, UINT16_MAX,
3411 "ct.est && !ct.rel && !ct.new && !ct.inv",
3412 REGBIT_CONNTRACK_NAT" = 1; next;");
3413 ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, UINT16_MAX,
3414 "ct.est && !ct.rel && !ct.new && !ct.inv",
3415 REGBIT_CONNTRACK_NAT" = 1; next;");
3416 }
3417 }
3418
3419 static void
3420 build_stateful(struct ovn_datapath *od, struct hmap *lflows)
3421 {
3422 /* Ingress and Egress stateful Table (Priority 0): Packets are
3423 * allowed by default. */
3424 ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 0, "1", "next;");
3425 ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 0, "1", "next;");
3426
3427 /* If REGBIT_CONNTRACK_COMMIT is set as 1, then the packets should be
3428 * committed to conntrack. We always set ct_label.blocked to 0 here as
3429 * any packet that makes it this far is part of a connection we
3430 * want to allow to continue. */
3431 ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 100,
3432 REGBIT_CONNTRACK_COMMIT" == 1", "ct_commit(ct_label=0/1); next;");
3433 ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100,
3434 REGBIT_CONNTRACK_COMMIT" == 1", "ct_commit(ct_label=0/1); next;");
3435
3436 /* If REGBIT_CONNTRACK_NAT is set as 1, then packets should just be sent
3437 * through nat (without committing).
3438 *
3439 * REGBIT_CONNTRACK_COMMIT is set for new connections and
3440 * REGBIT_CONNTRACK_NAT is set for established connections. So they
3441 * don't overlap.
3442 */
3443 ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 100,
3444 REGBIT_CONNTRACK_NAT" == 1", "ct_lb;");
3445 ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100,
3446 REGBIT_CONNTRACK_NAT" == 1", "ct_lb;");
3447
3448 /* Load balancing rules for new connections get committed to conntrack
3449 * table. So even if REGBIT_CONNTRACK_COMMIT is set in a previous table
3450 * a higher priority rule for load balancing below also commits the
3451 * connection, so it is okay if we do not hit the above match on
3452 * REGBIT_CONNTRACK_COMMIT. */
3453 for (int i = 0; i < od->nbs->n_load_balancer; i++) {
3454 struct nbrec_load_balancer *lb = od->nbs->load_balancer[i];
3455 struct smap *vips = &lb->vips;
3456 struct smap_node *node;
3457
3458 SMAP_FOR_EACH (node, vips) {
3459 uint16_t port = 0;
3460 int addr_family;
3461
3462 /* node->key contains IP:port or just IP. */
3463 char *ip_address = NULL;
3464 ip_address_and_port_from_lb_key(node->key, &ip_address, &port,
3465 &addr_family);
3466 if (!ip_address) {
3467 continue;
3468 }
3469
3470 /* New connections in Ingress table. */
3471 char *action = xasprintf("ct_lb(%s);", node->value);
3472 struct ds match = DS_EMPTY_INITIALIZER;
3473 if (addr_family == AF_INET) {
3474 ds_put_format(&match, "ct.new && ip4.dst == %s", ip_address);
3475 } else {
3476 ds_put_format(&match, "ct.new && ip6.dst == %s", ip_address);
3477 }
3478 if (port) {
3479 if (lb->protocol && !strcmp(lb->protocol, "udp")) {
3480 ds_put_format(&match, " && udp.dst == %d", port);
3481 } else {
3482 ds_put_format(&match, " && tcp.dst == %d", port);
3483 }
3484 ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL,
3485 120, ds_cstr(&match), action);
3486 } else {
3487 ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL,
3488 110, ds_cstr(&match), action);
3489 }
3490
3491 free(ip_address);
3492 ds_destroy(&match);
3493 free(action);
3494 }
3495 }
3496 }
3497
3498 static void
3499 build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
3500 struct hmap *lflows, struct hmap *mcgroups)
3501 {
3502 /* This flow table structure is documented in ovn-northd(8), so please
3503 * update ovn-northd.8.xml if you change anything. */
3504
3505 struct ds match = DS_EMPTY_INITIALIZER;
3506 struct ds actions = DS_EMPTY_INITIALIZER;
3507
3508 /* Build pre-ACL and ACL tables for both ingress and egress.
3509 * Ingress tables 3 through 9. Egress tables 0 through 6. */
3510 struct ovn_datapath *od;
3511 HMAP_FOR_EACH (od, key_node, datapaths) {
3512 if (!od->nbs) {
3513 continue;
3514 }
3515
3516 build_pre_acls(od, lflows);
3517 build_pre_lb(od, lflows);
3518 build_pre_stateful(od, lflows);
3519 build_acls(od, lflows);
3520 build_qos(od, lflows);
3521 build_lb(od, lflows);
3522 build_stateful(od, lflows);
3523 }
3524
3525 /* Logical switch ingress table 0: Admission control framework (priority
3526 * 100). */
3527 HMAP_FOR_EACH (od, key_node, datapaths) {
3528 if (!od->nbs) {
3529 continue;
3530 }
3531
3532 /* Logical VLANs not supported. */
3533 ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present",
3534 "drop;");
3535
3536 /* Broadcast/multicast source address is invalid. */
3537 ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "eth.src[40]",
3538 "drop;");
3539
3540 /* Port security flows have priority 50 (see below) and will continue
3541 * to the next table if packet source is acceptable. */
3542 }
3543
3544 /* Logical switch ingress table 0: Ingress port security - L2
3545 * (priority 50).
3546 * Ingress table 1: Ingress port security - IP (priority 90 and 80)
3547 * Ingress table 2: Ingress port security - ND (priority 90 and 80)
3548 */
3549 struct ovn_port *op;
3550 HMAP_FOR_EACH (op, key_node, ports) {
3551 if (!op->nbsp) {
3552 continue;
3553 }
3554
3555 if (!lsp_is_enabled(op->nbsp)) {
3556 /* Drop packets from disabled logical ports (since logical flow
3557 * tables are default-drop). */
3558 continue;
3559 }
3560
3561 ds_clear(&match);
3562 ds_clear(&actions);
3563 ds_put_format(&match, "inport == %s", op->json_key);
3564 build_port_security_l2("eth.src", op->ps_addrs, op->n_ps_addrs,
3565 &match);
3566
3567 const char *queue_id = smap_get(&op->sb->options, "qdisc_queue_id");
3568 if (queue_id) {
3569 ds_put_format(&actions, "set_queue(%s); ", queue_id);
3570 }
3571 ds_put_cstr(&actions, "next;");
3572 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_L2, 50,
3573 ds_cstr(&match), ds_cstr(&actions));
3574
3575 if (op->nbsp->n_port_security) {
3576 build_port_security_ip(P_IN, op, lflows);
3577 build_port_security_nd(op, lflows);
3578 }
3579 }
3580
3581 /* Ingress table 1 and 2: Port security - IP and ND, by default goto next.
3582 * (priority 0)*/
3583 HMAP_FOR_EACH (od, key_node, datapaths) {
3584 if (!od->nbs) {
3585 continue;
3586 }
3587
3588 ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_ND, 0, "1", "next;");
3589 ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_IP, 0, "1", "next;");
3590 }
3591
3592 /* Ingress table 10: ARP/ND responder, skip requests coming from localnet
3593 * and vtep ports. (priority 100); see ovn-northd.8.xml for the
3594 * rationale. */
3595 HMAP_FOR_EACH (op, key_node, ports) {
3596 if (!op->nbsp) {
3597 continue;
3598 }
3599
3600 if ((!strcmp(op->nbsp->type, "localnet")) ||
3601 (!strcmp(op->nbsp->type, "vtep"))) {
3602 ds_clear(&match);
3603 ds_put_format(&match, "inport == %s", op->json_key);
3604 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
3605 ds_cstr(&match), "next;");
3606 }
3607 }
3608
3609 /* Ingress table 10: ARP/ND responder, reply for known IPs.
3610 * (priority 50). */
3611 HMAP_FOR_EACH (op, key_node, ports) {
3612 if (!op->nbsp) {
3613 continue;
3614 }
3615
3616 /*
3617 * Add ARP/ND reply flows if either the
3618 * - port is up or
3619 * - port type is router or
3620 * - port type is localport
3621 */
3622 if (!lsp_is_up(op->nbsp) && strcmp(op->nbsp->type, "router") &&
3623 strcmp(op->nbsp->type, "localport")) {
3624 continue;
3625 }
3626
3627 for (size_t i = 0; i < op->n_lsp_addrs; i++) {
3628 for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
3629 ds_clear(&match);
3630 ds_put_format(&match, "arp.tpa == %s && arp.op == 1",
3631 op->lsp_addrs[i].ipv4_addrs[j].addr_s);
3632 ds_clear(&actions);
3633 ds_put_format(&actions,
3634 "eth.dst = eth.src; "
3635 "eth.src = %s; "
3636 "arp.op = 2; /* ARP reply */ "
3637 "arp.tha = arp.sha; "
3638 "arp.sha = %s; "
3639 "arp.tpa = arp.spa; "
3640 "arp.spa = %s; "
3641 "outport = inport; "
3642 "flags.loopback = 1; "
3643 "output;",
3644 op->lsp_addrs[i].ea_s, op->lsp_addrs[i].ea_s,
3645 op->lsp_addrs[i].ipv4_addrs[j].addr_s);
3646 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 50,
3647 ds_cstr(&match), ds_cstr(&actions));
3648
3649 /* Do not reply to an ARP request from the port that owns the
3650 * address (otherwise a DHCP client that ARPs to check for a
3651 * duplicate address will fail). Instead, forward it the usual
3652 * way.
3653 *
3654 * (Another alternative would be to simply drop the packet. If
3655 * everything is working as it is configured, then this would
3656 * produce equivalent results, since no one should reply to the
3657 * request. But ARPing for one's own IP address is intended to
3658 * detect situations where the network is not working as
3659 * configured, so dropping the request would frustrate that
3660 * intent.) */
3661 ds_put_format(&match, " && inport == %s", op->json_key);
3662 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
3663 ds_cstr(&match), "next;");
3664 }
3665
3666 /* For ND solicitations, we need to listen for both the
3667 * unicast IPv6 address and its all-nodes multicast address,
3668 * but always respond with the unicast IPv6 address. */
3669 for (size_t j = 0; j < op->lsp_addrs[i].n_ipv6_addrs; j++) {
3670 ds_clear(&match);
3671 ds_put_format(&match,
3672 "nd_ns && ip6.dst == {%s, %s} && nd.target == %s",
3673 op->lsp_addrs[i].ipv6_addrs[j].addr_s,
3674 op->lsp_addrs[i].ipv6_addrs[j].sn_addr_s,
3675 op->lsp_addrs[i].ipv6_addrs[j].addr_s);
3676
3677 ds_clear(&actions);
3678 ds_put_format(&actions,
3679 "nd_na { "
3680 "eth.src = %s; "
3681 "ip6.src = %s; "
3682 "nd.target = %s; "
3683 "nd.tll = %s; "
3684 "outport = inport; "
3685 "flags.loopback = 1; "
3686 "output; "
3687 "};",
3688 op->lsp_addrs[i].ea_s,
3689 op->lsp_addrs[i].ipv6_addrs[j].addr_s,
3690 op->lsp_addrs[i].ipv6_addrs[j].addr_s,
3691 op->lsp_addrs[i].ea_s);
3692 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 50,
3693 ds_cstr(&match), ds_cstr(&actions));
3694
3695 /* Do not reply to a solicitation from the port that owns the
3696 * address (otherwise DAD detection will fail). */
3697 ds_put_format(&match, " && inport == %s", op->json_key);
3698 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
3699 ds_cstr(&match), "next;");
3700 }
3701 }
3702 }
3703
3704 /* Ingress table 10: ARP/ND responder, by default goto next.
3705 * (priority 0)*/
3706 HMAP_FOR_EACH (od, key_node, datapaths) {
3707 if (!od->nbs) {
3708 continue;
3709 }
3710
3711 ovn_lflow_add(lflows, od, S_SWITCH_IN_ARP_ND_RSP, 0, "1", "next;");
3712 }
3713
3714 /* Logical switch ingress table 11 and 12: DHCP options and response
3715 * priority 100 flows. */
3716 HMAP_FOR_EACH (op, key_node, ports) {
3717 if (!op->nbsp) {
3718 continue;
3719 }
3720
3721 if (!lsp_is_enabled(op->nbsp) || !strcmp(op->nbsp->type, "router")) {
3722 /* Don't add the DHCP flows if the port is not enabled or if the
3723 * port is a router port. */
3724 continue;
3725 }
3726
3727 if (!op->nbsp->dhcpv4_options && !op->nbsp->dhcpv6_options) {
3728 /* CMS has disabled both native DHCPv4 and DHCPv6 for this lport.
3729 */
3730 continue;
3731 }
3732
3733 for (size_t i = 0; i < op->n_lsp_addrs; i++) {
3734 for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
3735 struct ds options_action = DS_EMPTY_INITIALIZER;
3736 struct ds response_action = DS_EMPTY_INITIALIZER;
3737 struct ds ipv4_addr_match = DS_EMPTY_INITIALIZER;
3738 if (build_dhcpv4_action(
3739 op, op->lsp_addrs[i].ipv4_addrs[j].addr,
3740 &options_action, &response_action, &ipv4_addr_match)) {
3741 ds_clear(&match);
3742 ds_put_format(
3743 &match, "inport == %s && eth.src == %s && "
3744 "ip4.src == 0.0.0.0 && ip4.dst == 255.255.255.255 && "
3745 "udp.src == 68 && udp.dst == 67", op->json_key,
3746 op->lsp_addrs[i].ea_s);
3747
3748 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_OPTIONS,
3749 100, ds_cstr(&match),
3750 ds_cstr(&options_action));
3751 ds_clear(&match);
3752 /* Allow ip4.src = OFFER_IP and
3753 * ip4.dst = {SERVER_IP, 255.255.255.255} for the below
3754 * cases
3755 * - When the client wants to renew the IP by sending
3756 * the DHCPREQUEST to the server ip.
3757 * - When the client wants to renew the IP by
3758 * broadcasting the DHCPREQUEST.
3759 */
3760 ds_put_format(
3761 &match, "inport == %s && eth.src == %s && "
3762 "%s && udp.src == 68 && udp.dst == 67", op->json_key,
3763 op->lsp_addrs[i].ea_s, ds_cstr(&ipv4_addr_match));
3764
3765 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_OPTIONS,
3766 100, ds_cstr(&match),
3767 ds_cstr(&options_action));
3768 ds_clear(&match);
3769
3770 /* If REGBIT_DHCP_OPTS_RESULT is set, it means the
3771 * put_dhcp_opts action is successful. */
3772 ds_put_format(
3773 &match, "inport == %s && eth.src == %s && "
3774 "ip4 && udp.src == 68 && udp.dst == 67"
3775 " && "REGBIT_DHCP_OPTS_RESULT, op->json_key,
3776 op->lsp_addrs[i].ea_s);
3777 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_RESPONSE,
3778 100, ds_cstr(&match),
3779 ds_cstr(&response_action));
3780 ds_destroy(&options_action);
3781 ds_destroy(&response_action);
3782 ds_destroy(&ipv4_addr_match);
3783 break;
3784 }
3785 }
3786
3787 for (size_t j = 0; j < op->lsp_addrs[i].n_ipv6_addrs; j++) {
3788 struct ds options_action = DS_EMPTY_INITIALIZER;
3789 struct ds response_action = DS_EMPTY_INITIALIZER;
3790 if (build_dhcpv6_action(
3791 op, &op->lsp_addrs[i].ipv6_addrs[j].addr,
3792 &options_action, &response_action)) {
3793 ds_clear(&match);
3794 ds_put_format(
3795 &match, "inport == %s && eth.src == %s"
3796 " && ip6.dst == ff02::1:2 && udp.src == 546 &&"
3797 " udp.dst == 547", op->json_key,
3798 op->lsp_addrs[i].ea_s);
3799
3800 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_OPTIONS, 100,
3801 ds_cstr(&match), ds_cstr(&options_action));
3802
3803 /* If REGBIT_DHCP_OPTS_RESULT is set to 1, it means the
3804 * put_dhcpv6_opts action is successful */
3805 ds_put_cstr(&match, " && "REGBIT_DHCP_OPTS_RESULT);
3806 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_DHCP_RESPONSE, 100,
3807 ds_cstr(&match), ds_cstr(&response_action));
3808 ds_destroy(&options_action);
3809 ds_destroy(&response_action);
3810 break;
3811 }
3812 }
3813 }
3814 }
3815
3816 /* Logical switch ingress table 13 and 14: DNS lookup and response
3817 * priority 100 flows.
3818 */
3819 HMAP_FOR_EACH (od, key_node, datapaths) {
3820 if (!od->nbs || !ls_has_dns_records(od->nbs)) {
3821 continue;
3822 }
3823
3824 struct ds action = DS_EMPTY_INITIALIZER;
3825
3826 ds_clear(&match);
3827 ds_put_cstr(&match, "udp.dst == 53");
3828 ds_put_format(&action,
3829 REGBIT_DNS_LOOKUP_RESULT" = dns_lookup(); next;");
3830 ovn_lflow_add(lflows, od, S_SWITCH_IN_DNS_LOOKUP, 100,
3831 ds_cstr(&match), ds_cstr(&action));
3832 ds_clear(&action);
3833 ds_put_cstr(&match, " && "REGBIT_DNS_LOOKUP_RESULT);
3834 ds_put_format(&action, "eth.dst <-> eth.src; ip4.src <-> ip4.dst; "
3835 "udp.dst = udp.src; udp.src = 53; outport = inport; "
3836 "flags.loopback = 1; output;");
3837 ovn_lflow_add(lflows, od, S_SWITCH_IN_DNS_RESPONSE, 100,
3838 ds_cstr(&match), ds_cstr(&action));
3839 ds_clear(&action);
3840 ds_put_format(&action, "eth.dst <-> eth.src; ip6.src <-> ip6.dst; "
3841 "udp.dst = udp.src; udp.src = 53; outport = inport; "
3842 "flags.loopback = 1; output;");
3843 ovn_lflow_add(lflows, od, S_SWITCH_IN_DNS_RESPONSE, 100,
3844 ds_cstr(&match), ds_cstr(&action));
3845 ds_destroy(&action);
3846 }
3847
3848 /* Ingress table 11 and 12: DHCP options and response, by default goto next.
3849 * (priority 0).
3850 * Ingress table 13 and 14: DNS lookup and response, by default goto next.
3851 * (priority 0).*/
3852
3853 HMAP_FOR_EACH (od, key_node, datapaths) {
3854 if (!od->nbs) {
3855 continue;
3856 }
3857
3858 ovn_lflow_add(lflows, od, S_SWITCH_IN_DHCP_OPTIONS, 0, "1", "next;");
3859 ovn_lflow_add(lflows, od, S_SWITCH_IN_DHCP_RESPONSE, 0, "1", "next;");
3860 ovn_lflow_add(lflows, od, S_SWITCH_IN_DNS_LOOKUP, 0, "1", "next;");
3861 ovn_lflow_add(lflows, od, S_SWITCH_IN_DNS_RESPONSE, 0, "1", "next;");
3862 }
3863
3864 /* Ingress table 15: Destination lookup, broadcast and multicast handling
3865 * (priority 100). */
3866 HMAP_FOR_EACH (op, key_node, ports) {
3867 if (!op->nbsp) {
3868 continue;
3869 }
3870
3871 if (lsp_is_enabled(op->nbsp)) {
3872 ovn_multicast_add(mcgroups, &mc_flood, op);
3873 }
3874 }
3875 HMAP_FOR_EACH (od, key_node, datapaths) {
3876 if (!od->nbs) {
3877 continue;
3878 }
3879
3880 ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100, "eth.mcast",
3881 "outport = \""MC_FLOOD"\"; output;");
3882 }
3883
3884 /* Ingress table 15: Destination lookup, unicast handling (priority 50). */
3885 HMAP_FOR_EACH (op, key_node, ports) {
3886 if (!op->nbsp) {
3887 continue;
3888 }
3889
3890 for (size_t i = 0; i < op->nbsp->n_addresses; i++) {
3891 /* Addresses are owned by the logical port.
3892 * Ethernet address followed by zero or more IPv4
3893 * or IPv6 addresses (or both). */
3894 struct eth_addr mac;
3895 if (ovs_scan(op->nbsp->addresses[i],
3896 ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
3897 ds_clear(&match);
3898 ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
3899 ETH_ADDR_ARGS(mac));
3900
3901 ds_clear(&actions);
3902 ds_put_format(&actions, "outport = %s; output;", op->json_key);
3903 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
3904 ds_cstr(&match), ds_cstr(&actions));
3905 } else if (!strcmp(op->nbsp->addresses[i], "unknown")) {
3906 if (lsp_is_enabled(op->nbsp)) {
3907 ovn_multicast_add(mcgroups, &mc_unknown, op);
3908 op->od->has_unknown = true;
3909 }
3910 } else if (is_dynamic_lsp_address(op->nbsp->addresses[i])) {
3911 if (!op->nbsp->dynamic_addresses
3912 || !ovs_scan(op->nbsp->dynamic_addresses,
3913 ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
3914 continue;
3915 }
3916 ds_clear(&match);
3917 ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
3918 ETH_ADDR_ARGS(mac));
3919
3920 ds_clear(&actions);
3921 ds_put_format(&actions, "outport = %s; output;", op->json_key);
3922 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
3923 ds_cstr(&match), ds_cstr(&actions));
3924 } else if (!strcmp(op->nbsp->addresses[i], "router")) {
3925 if (!op->peer || !op->peer->nbrp
3926 || !ovs_scan(op->peer->nbrp->mac,
3927 ETH_ADDR_SCAN_FMT, ETH_ADDR_SCAN_ARGS(mac))) {
3928 continue;
3929 }
3930 ds_clear(&match);
3931 ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
3932 ETH_ADDR_ARGS(mac));
3933 if (op->peer->od->l3dgw_port
3934 && op->peer == op->peer->od->l3dgw_port
3935 && op->peer->od->l3redirect_port) {
3936 /* The destination lookup flow for the router's
3937 * distributed gateway port MAC address should only be
3938 * programmed on the "redirect-chassis". */
3939 ds_put_format(&match, " && is_chassis_resident(%s)",
3940 op->peer->od->l3redirect_port->json_key);
3941 }
3942
3943 ds_clear(&actions);
3944 ds_put_format(&actions, "outport = %s; output;", op->json_key);
3945 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
3946 ds_cstr(&match), ds_cstr(&actions));
3947
3948 /* Add ethernet addresses specified in NAT rules on
3949 * distributed logical routers. */
3950 if (op->peer->od->l3dgw_port
3951 && op->peer == op->peer->od->l3dgw_port) {
3952 for (int j = 0; j < op->peer->od->nbr->n_nat; j++) {
3953 const struct nbrec_nat *nat
3954 = op->peer->od->nbr->nat[j];
3955 if (!strcmp(nat->type, "dnat_and_snat")
3956 && nat->logical_port && nat->external_mac
3957 && eth_addr_from_string(nat->external_mac, &mac)) {
3958
3959 ds_clear(&match);
3960 ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT
3961 " && is_chassis_resident(\"%s\")",
3962 ETH_ADDR_ARGS(mac),
3963 nat->logical_port);
3964
3965 ds_clear(&actions);
3966 ds_put_format(&actions, "outport = %s; output;",
3967 op->json_key);
3968 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP,
3969 50, ds_cstr(&match),
3970 ds_cstr(&actions));
3971 }
3972 }
3973 }
3974 } else {
3975 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
3976
3977 VLOG_INFO_RL(&rl,
3978 "%s: invalid syntax '%s' in addresses column",
3979 op->nbsp->name, op->nbsp->addresses[i]);
3980 }
3981 }
3982 }
3983
3984 /* Ingress table 15: Destination lookup for unknown MACs (priority 0). */
3985 HMAP_FOR_EACH (od, key_node, datapaths) {
3986 if (!od->nbs) {
3987 continue;
3988 }
3989
3990 if (od->has_unknown) {
3991 ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 0, "1",
3992 "outport = \""MC_UNKNOWN"\"; output;");
3993 }
3994 }
3995
3996 /* Egress tables 6: Egress port security - IP (priority 0)
3997 * Egress table 7: Egress port security L2 - multicast/broadcast
3998 * (priority 100). */
3999 HMAP_FOR_EACH (od, key_node, datapaths) {
4000 if (!od->nbs) {
4001 continue;
4002 }
4003
4004 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_IP, 0, "1", "next;");
4005 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_L2, 100, "eth.mcast",
4006 "output;");
4007 }
4008
4009 /* Egress table 6: Egress port security - IP (priorities 90 and 80)
4010 * if port security enabled.
4011 *
4012 * Egress table 7: Egress port security - L2 (priorities 50 and 150).
4013 *
4014 * Priority 50 rules implement port security for enabled logical port.
4015 *
4016 * Priority 150 rules drop packets to disabled logical ports, so that they
4017 * don't even receive multicast or broadcast packets. */
4018 HMAP_FOR_EACH (op, key_node, ports) {
4019 if (!op->nbsp) {
4020 continue;
4021 }
4022
4023 ds_clear(&match);
4024 ds_put_format(&match, "outport == %s", op->json_key);
4025 if (lsp_is_enabled(op->nbsp)) {
4026 build_port_security_l2("eth.dst", op->ps_addrs, op->n_ps_addrs,
4027 &match);
4028 ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 50,
4029 ds_cstr(&match), "output;");
4030 } else {
4031 ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 150,
4032 ds_cstr(&match), "drop;");
4033 }
4034
4035 if (op->nbsp->n_port_security) {
4036 build_port_security_ip(P_OUT, op, lflows);
4037 }
4038 }
4039
4040 ds_destroy(&match);
4041 ds_destroy(&actions);
4042 }
4043
4044 static bool
4045 lrport_is_enabled(const struct nbrec_logical_router_port *lrport)
4046 {
4047 return !lrport->enabled || *lrport->enabled;
4048 }
4049
4050 /* Returns a string of the IP address of the router port 'op' that
4051 * overlaps with 'ip_s". If one is not found, returns NULL.
4052 *
4053 * The caller must not free the returned string. */
4054 static const char *
4055 find_lrp_member_ip(const struct ovn_port *op, const char *ip_s)
4056 {
4057 bool is_ipv4 = strchr(ip_s, '.') ? true : false;
4058
4059 if (is_ipv4) {
4060 ovs_be32 ip;
4061
4062 if (!ip_parse(ip_s, &ip)) {
4063 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4064 VLOG_WARN_RL(&rl, "bad ip address %s", ip_s);
4065 return NULL;
4066 }
4067
4068 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
4069 const struct ipv4_netaddr *na = &op->lrp_networks.ipv4_addrs[i];
4070
4071 if (!((na->network ^ ip) & na->mask)) {
4072 /* There should be only 1 interface that matches the
4073 * supplied IP. Otherwise, it's a configuration error,
4074 * because subnets of a router's interfaces should NOT
4075 * overlap. */
4076 return na->addr_s;
4077 }
4078 }
4079 } else {
4080 struct in6_addr ip6;
4081
4082 if (!ipv6_parse(ip_s, &ip6)) {
4083 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4084 VLOG_WARN_RL(&rl, "bad ipv6 address %s", ip_s);
4085 return NULL;
4086 }
4087
4088 for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
4089 const struct ipv6_netaddr *na = &op->lrp_networks.ipv6_addrs[i];
4090 struct in6_addr xor_addr = ipv6_addr_bitxor(&na->network, &ip6);
4091 struct in6_addr and_addr = ipv6_addr_bitand(&xor_addr, &na->mask);
4092
4093 if (ipv6_is_zero(&and_addr)) {
4094 /* There should be only 1 interface that matches the
4095 * supplied IP. Otherwise, it's a configuration error,
4096 * because subnets of a router's interfaces should NOT
4097 * overlap. */
4098 return na->addr_s;
4099 }
4100 }
4101 }
4102
4103 return NULL;
4104 }
4105
4106 static void
4107 add_route(struct hmap *lflows, const struct ovn_port *op,
4108 const char *lrp_addr_s, const char *network_s, int plen,
4109 const char *gateway, const char *policy)
4110 {
4111 bool is_ipv4 = strchr(network_s, '.') ? true : false;
4112 struct ds match = DS_EMPTY_INITIALIZER;
4113 const char *dir;
4114 uint16_t priority;
4115
4116 if (policy && !strcmp(policy, "src-ip")) {
4117 dir = "src";
4118 priority = plen * 2;
4119 } else {
4120 dir = "dst";
4121 priority = (plen * 2) + 1;
4122 }
4123
4124 /* IPv6 link-local addresses must be scoped to the local router port. */
4125 if (!is_ipv4) {
4126 struct in6_addr network;
4127 ovs_assert(ipv6_parse(network_s, &network));
4128 if (in6_is_lla(&network)) {
4129 ds_put_format(&match, "inport == %s && ", op->json_key);
4130 }
4131 }
4132 ds_put_format(&match, "ip%s.%s == %s/%d", is_ipv4 ? "4" : "6", dir,
4133 network_s, plen);
4134
4135 struct ds actions = DS_EMPTY_INITIALIZER;
4136 ds_put_format(&actions, "ip.ttl--; %sreg0 = ", is_ipv4 ? "" : "xx");
4137
4138 if (gateway) {
4139 ds_put_cstr(&actions, gateway);
4140 } else {
4141 ds_put_format(&actions, "ip%s.dst", is_ipv4 ? "4" : "6");
4142 }
4143 ds_put_format(&actions, "; "
4144 "%sreg1 = %s; "
4145 "eth.src = %s; "
4146 "outport = %s; "
4147 "flags.loopback = 1; "
4148 "next;",
4149 is_ipv4 ? "" : "xx",
4150 lrp_addr_s,
4151 op->lrp_networks.ea_s,
4152 op->json_key);
4153
4154 /* The priority here is calculated to implement longest-prefix-match
4155 * routing. */
4156 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_ROUTING, priority,
4157 ds_cstr(&match), ds_cstr(&actions));
4158 ds_destroy(&match);
4159 ds_destroy(&actions);
4160 }
4161
4162 static void
4163 build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
4164 struct hmap *ports,
4165 const struct nbrec_logical_router_static_route *route)
4166 {
4167 ovs_be32 nexthop;
4168 const char *lrp_addr_s = NULL;
4169 unsigned int plen;
4170 bool is_ipv4;
4171
4172 /* Verify that the next hop is an IP address with an all-ones mask. */
4173 char *error = ip_parse_cidr(route->nexthop, &nexthop, &plen);
4174 if (!error) {
4175 if (plen != 32) {
4176 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4177 VLOG_WARN_RL(&rl, "bad next hop mask %s", route->nexthop);
4178 return;
4179 }
4180 is_ipv4 = true;
4181 } else {
4182 free(error);
4183
4184 struct in6_addr ip6;
4185 error = ipv6_parse_cidr(route->nexthop, &ip6, &plen);
4186 if (!error) {
4187 if (plen != 128) {
4188 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4189 VLOG_WARN_RL(&rl, "bad next hop mask %s", route->nexthop);
4190 return;
4191 }
4192 is_ipv4 = false;
4193 } else {
4194 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4195 VLOG_WARN_RL(&rl, "bad next hop ip address %s", route->nexthop);
4196 free(error);
4197 return;
4198 }
4199 }
4200
4201 char *prefix_s;
4202 if (is_ipv4) {
4203 ovs_be32 prefix;
4204 /* Verify that ip prefix is a valid IPv4 address. */
4205 error = ip_parse_cidr(route->ip_prefix, &prefix, &plen);
4206 if (error) {
4207 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4208 VLOG_WARN_RL(&rl, "bad 'ip_prefix' in static routes %s",
4209 route->ip_prefix);
4210 free(error);
4211 return;
4212 }
4213 prefix_s = xasprintf(IP_FMT, IP_ARGS(prefix & be32_prefix_mask(plen)));
4214 } else {
4215 /* Verify that ip prefix is a valid IPv6 address. */
4216 struct in6_addr prefix;
4217 error = ipv6_parse_cidr(route->ip_prefix, &prefix, &plen);
4218 if (error) {
4219 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4220 VLOG_WARN_RL(&rl, "bad 'ip_prefix' in static routes %s",
4221 route->ip_prefix);
4222 free(error);
4223 return;
4224 }
4225 struct in6_addr mask = ipv6_create_mask(plen);
4226 struct in6_addr network = ipv6_addr_bitand(&prefix, &mask);
4227 prefix_s = xmalloc(INET6_ADDRSTRLEN);
4228 inet_ntop(AF_INET6, &network, prefix_s, INET6_ADDRSTRLEN);
4229 }
4230
4231 /* Find the outgoing port. */
4232 struct ovn_port *out_port = NULL;
4233 if (route->output_port) {
4234 out_port = ovn_port_find(ports, route->output_port);
4235 if (!out_port) {
4236 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4237 VLOG_WARN_RL(&rl, "Bad out port %s for static route %s",
4238 route->output_port, route->ip_prefix);
4239 goto free_prefix_s;
4240 }
4241 lrp_addr_s = find_lrp_member_ip(out_port, route->nexthop);
4242 if (!lrp_addr_s) {
4243 /* There are no IP networks configured on the router's port via
4244 * which 'route->nexthop' is theoretically reachable. But since
4245 * 'out_port' has been specified, we honor it by trying to reach
4246 * 'route->nexthop' via the first IP address of 'out_port'.
4247 * (There are cases, e.g in GCE, where each VM gets a /32 IP
4248 * address and the default gateway is still reachable from it.) */
4249 if (is_ipv4) {
4250 if (out_port->lrp_networks.n_ipv4_addrs) {
4251 lrp_addr_s = out_port->lrp_networks.ipv4_addrs[0].addr_s;
4252 }
4253 } else {
4254 if (out_port->lrp_networks.n_ipv6_addrs) {
4255 lrp_addr_s = out_port->lrp_networks.ipv6_addrs[0].addr_s;
4256 }
4257 }
4258 }
4259 } else {
4260 /* output_port is not specified, find the
4261 * router port matching the next hop. */
4262 int i;
4263 for (i = 0; i < od->nbr->n_ports; i++) {
4264 struct nbrec_logical_router_port *lrp = od->nbr->ports[i];
4265 out_port = ovn_port_find(ports, lrp->name);
4266 if (!out_port) {
4267 /* This should not happen. */
4268 continue;
4269 }
4270
4271 lrp_addr_s = find_lrp_member_ip(out_port, route->nexthop);
4272 if (lrp_addr_s) {
4273 break;
4274 }
4275 }
4276 }
4277
4278 if (!out_port || !lrp_addr_s) {
4279 /* There is no matched out port. */
4280 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4281 VLOG_WARN_RL(&rl, "No path for static route %s; next hop %s",
4282 route->ip_prefix, route->nexthop);
4283 goto free_prefix_s;
4284 }
4285
4286 char *policy = route->policy ? route->policy : "dst-ip";
4287 add_route(lflows, out_port, lrp_addr_s, prefix_s, plen, route->nexthop,
4288 policy);
4289
4290 free_prefix_s:
4291 free(prefix_s);
4292 }
4293
4294 static void
4295 op_put_v4_networks(struct ds *ds, const struct ovn_port *op, bool add_bcast)
4296 {
4297 if (!add_bcast && op->lrp_networks.n_ipv4_addrs == 1) {
4298 ds_put_format(ds, "%s", op->lrp_networks.ipv4_addrs[0].addr_s);
4299 return;
4300 }
4301
4302 ds_put_cstr(ds, "{");
4303 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
4304 ds_put_format(ds, "%s, ", op->lrp_networks.ipv4_addrs[i].addr_s);
4305 if (add_bcast) {
4306 ds_put_format(ds, "%s, ", op->lrp_networks.ipv4_addrs[i].bcast_s);
4307 }
4308 }
4309 ds_chomp(ds, ' ');
4310 ds_chomp(ds, ',');
4311 ds_put_cstr(ds, "}");
4312 }
4313
4314 static void
4315 op_put_v6_networks(struct ds *ds, const struct ovn_port *op)
4316 {
4317 if (op->lrp_networks.n_ipv6_addrs == 1) {
4318 ds_put_format(ds, "%s", op->lrp_networks.ipv6_addrs[0].addr_s);
4319 return;
4320 }
4321
4322 ds_put_cstr(ds, "{");
4323 for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
4324 ds_put_format(ds, "%s, ", op->lrp_networks.ipv6_addrs[i].addr_s);
4325 }
4326 ds_chomp(ds, ' ');
4327 ds_chomp(ds, ',');
4328 ds_put_cstr(ds, "}");
4329 }
4330
4331 static const char *
4332 get_force_snat_ip(struct ovn_datapath *od, const char *key_type, ovs_be32 *ip)
4333 {
4334 char *key = xasprintf("%s_force_snat_ip", key_type);
4335 const char *ip_address = smap_get(&od->nbr->options, key);
4336 free(key);
4337
4338 if (ip_address) {
4339 ovs_be32 mask;
4340 char *error = ip_parse_masked(ip_address, ip, &mask);
4341 if (error || mask != OVS_BE32_MAX) {
4342 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4343 VLOG_WARN_RL(&rl, "bad ip %s in options of router "UUID_FMT"",
4344 ip_address, UUID_ARGS(&od->key));
4345 free(error);
4346 *ip = 0;
4347 return NULL;
4348 }
4349 return ip_address;
4350 }
4351
4352 *ip = 0;
4353 return NULL;
4354 }
4355
4356 static void
4357 add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
4358 struct ds *match, struct ds *actions, int priority,
4359 const char *lb_force_snat_ip, char *backend_ips,
4360 bool is_udp, int addr_family)
4361 {
4362 /* A match and actions for new connections. */
4363 char *new_match = xasprintf("ct.new && %s", ds_cstr(match));
4364 if (lb_force_snat_ip) {
4365 char *new_actions = xasprintf("flags.force_snat_for_lb = 1; %s",
4366 ds_cstr(actions));
4367 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, priority, new_match,
4368 new_actions);
4369 free(new_actions);
4370 } else {
4371 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, priority, new_match,
4372 ds_cstr(actions));
4373 }
4374
4375 /* A match and actions for established connections. */
4376 char *est_match = xasprintf("ct.est && %s", ds_cstr(match));
4377 if (lb_force_snat_ip) {
4378 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, priority, est_match,
4379 "flags.force_snat_for_lb = 1; ct_dnat;");
4380 } else {
4381 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, priority, est_match,
4382 "ct_dnat;");
4383 }
4384
4385 free(new_match);
4386 free(est_match);
4387
4388 if (!od->l3dgw_port || !od->l3redirect_port || !backend_ips
4389 || addr_family != AF_INET) {
4390 return;
4391 }
4392
4393 /* Add logical flows to UNDNAT the load balanced reverse traffic in
4394 * the router egress pipleine stage - S_ROUTER_OUT_UNDNAT if the logical
4395 * router has a gateway router port associated.
4396 */
4397 struct ds undnat_match = DS_EMPTY_INITIALIZER;
4398 ds_put_cstr(&undnat_match, "ip4 && (");
4399 char *start, *next, *ip_str;
4400 start = next = xstrdup(backend_ips);
4401 ip_str = strsep(&next, ",");
4402 bool backend_ips_found = false;
4403 while (ip_str && ip_str[0]) {
4404 char *ip_address = NULL;
4405 uint16_t port = 0;
4406 int addr_family;
4407 ip_address_and_port_from_lb_key(ip_str, &ip_address, &port,
4408 &addr_family);
4409 if (!ip_address) {
4410 break;
4411 }
4412
4413 ds_put_format(&undnat_match, "(ip4.src == %s", ip_address);
4414 free(ip_address);
4415 if (port) {
4416 ds_put_format(&undnat_match, " && %s.src == %d) || ",
4417 is_udp ? "udp" : "tcp", port);
4418 } else {
4419 ds_put_cstr(&undnat_match, ") || ");
4420 }
4421 ip_str = strsep(&next, ",");
4422 backend_ips_found = true;
4423 }
4424
4425 free(start);
4426 if (!backend_ips_found) {
4427 ds_destroy(&undnat_match);
4428 return;
4429 }
4430 ds_chomp(&undnat_match, ' ');
4431 ds_chomp(&undnat_match, '|');
4432 ds_chomp(&undnat_match, '|');
4433 ds_chomp(&undnat_match, ' ');
4434 ds_put_format(&undnat_match, ") && outport == %s && "
4435 "is_chassis_resident(%s)", od->l3dgw_port->json_key,
4436 od->l3redirect_port->json_key);
4437 if (lb_force_snat_ip) {
4438 ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 120,
4439 ds_cstr(&undnat_match),
4440 "flags.force_snat_for_lb = 1; ct_dnat;");
4441 } else {
4442 ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 120,
4443 ds_cstr(&undnat_match), "ct_dnat;");
4444 }
4445
4446 ds_destroy(&undnat_match);
4447 }
4448
4449 static void
4450 build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
4451 struct hmap *lflows)
4452 {
4453 /* This flow table structure is documented in ovn-northd(8), so please
4454 * update ovn-northd.8.xml if you change anything. */
4455
4456 struct ds match = DS_EMPTY_INITIALIZER;
4457 struct ds actions = DS_EMPTY_INITIALIZER;
4458
4459 /* Logical router ingress table 0: Admission control framework. */
4460 struct ovn_datapath *od;
4461 HMAP_FOR_EACH (od, key_node, datapaths) {
4462 if (!od->nbr) {
4463 continue;
4464 }
4465
4466 /* Logical VLANs not supported.
4467 * Broadcast/multicast source address is invalid. */
4468 ovn_lflow_add(lflows, od, S_ROUTER_IN_ADMISSION, 100,
4469 "vlan.present || eth.src[40]", "drop;");
4470 }
4471
4472 /* Logical router ingress table 0: match (priority 50). */
4473 struct ovn_port *op;
4474 HMAP_FOR_EACH (op, key_node, ports) {
4475 if (!op->nbrp) {
4476 continue;
4477 }
4478
4479 if (!lrport_is_enabled(op->nbrp)) {
4480 /* Drop packets from disabled logical ports (since logical flow
4481 * tables are default-drop). */
4482 continue;
4483 }
4484
4485 if (op->derived) {
4486 /* No ingress packets should be received on a chassisredirect
4487 * port. */
4488 continue;
4489 }
4490
4491 ds_clear(&match);
4492 ds_put_format(&match, "eth.mcast && inport == %s", op->json_key);
4493 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 50,
4494 ds_cstr(&match), "next;");
4495
4496 ds_clear(&match);
4497 ds_put_format(&match, "eth.dst == %s && inport == %s",
4498 op->lrp_networks.ea_s, op->json_key);
4499 if (op->od->l3dgw_port && op == op->od->l3dgw_port
4500 && op->od->l3redirect_port) {
4501 /* Traffic with eth.dst = l3dgw_port->lrp_networks.ea_s
4502 * should only be received on the "redirect-chassis". */
4503 ds_put_format(&match, " && is_chassis_resident(%s)",
4504 op->od->l3redirect_port->json_key);
4505 }
4506 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 50,
4507 ds_cstr(&match), "next;");
4508 }
4509
4510 /* Logical router ingress table 1: IP Input. */
4511 HMAP_FOR_EACH (od, key_node, datapaths) {
4512 if (!od->nbr) {
4513 continue;
4514 }
4515
4516 /* L3 admission control: drop multicast and broadcast source, localhost
4517 * source or destination, and zero network source or destination
4518 * (priority 100). */
4519 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
4520 "ip4.mcast || "
4521 "ip4.src == 255.255.255.255 || "
4522 "ip4.src == 127.0.0.0/8 || "
4523 "ip4.dst == 127.0.0.0/8 || "
4524 "ip4.src == 0.0.0.0/8 || "
4525 "ip4.dst == 0.0.0.0/8",
4526 "drop;");
4527
4528 /* ARP reply handling. Use ARP replies to populate the logical
4529 * router's ARP table. */
4530 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
4531 "put_arp(inport, arp.spa, arp.sha);");
4532
4533 /* Drop Ethernet local broadcast. By definition this traffic should
4534 * not be forwarded.*/
4535 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 50,
4536 "eth.bcast", "drop;");
4537
4538 /* TTL discard.
4539 *
4540 * XXX Need to send ICMP time exceeded if !ip.later_frag. */
4541 ds_clear(&match);
4542 ds_put_cstr(&match, "ip4 && ip.ttl == {0, 1}");
4543 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 30,
4544 ds_cstr(&match), "drop;");
4545
4546 /* ND advertisement handling. Use advertisements to populate
4547 * the logical router's ARP/ND table. */
4548 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "nd_na",
4549 "put_nd(inport, nd.target, nd.tll);");
4550
4551 /* Learn from neighbor solicitations that were not directed at
4552 * us. (A priority-90 flow will respond to requests to us and
4553 * learn the sender's mac address.) */
4554 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 80, "nd_ns",
4555 "put_nd(inport, ip6.src, nd.sll);");
4556
4557 /* Pass other traffic not already handled to the next table for
4558 * routing. */
4559 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 0, "1", "next;");
4560 }
4561
4562 /* Logical router ingress table 1: IP Input for IPv4. */
4563 HMAP_FOR_EACH (op, key_node, ports) {
4564 if (!op->nbrp) {
4565 continue;
4566 }
4567
4568 if (op->derived) {
4569 /* No ingress packets are accepted on a chassisredirect
4570 * port, so no need to program flows for that port. */
4571 continue;
4572 }
4573
4574 if (op->lrp_networks.n_ipv4_addrs) {
4575 /* L3 admission control: drop packets that originate from an
4576 * IPv4 address owned by the router or a broadcast address
4577 * known to the router (priority 100). */
4578 ds_clear(&match);
4579 ds_put_cstr(&match, "ip4.src == ");
4580 op_put_v4_networks(&match, op, true);
4581 ds_put_cstr(&match, " && "REGBIT_EGRESS_LOOPBACK" == 0");
4582 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 100,
4583 ds_cstr(&match), "drop;");
4584
4585 /* ICMP echo reply. These flows reply to ICMP echo requests
4586 * received for the router's IP address. Since packets only
4587 * get here as part of the logical router datapath, the inport
4588 * (i.e. the incoming locally attached net) does not matter.
4589 * The ip.ttl also does not matter (RFC1812 section 4.2.2.9) */
4590 ds_clear(&match);
4591 ds_put_cstr(&match, "ip4.dst == ");
4592 op_put_v4_networks(&match, op, false);
4593 ds_put_cstr(&match, " && icmp4.type == 8 && icmp4.code == 0");
4594
4595 ds_clear(&actions);
4596 ds_put_format(&actions,
4597 "ip4.dst <-> ip4.src; "
4598 "ip.ttl = 255; "
4599 "icmp4.type = 0; "
4600 "flags.loopback = 1; "
4601 "next; ");
4602 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
4603 ds_cstr(&match), ds_cstr(&actions));
4604 }
4605
4606 /* ARP reply. These flows reply to ARP requests for the router's own
4607 * IP address. */
4608 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
4609 ds_clear(&match);
4610 ds_put_format(&match,
4611 "inport == %s && arp.tpa == %s && arp.op == 1",
4612 op->json_key, op->lrp_networks.ipv4_addrs[i].addr_s);
4613 if (op->od->l3dgw_port && op == op->od->l3dgw_port
4614 && op->od->l3redirect_port) {
4615 /* Traffic with eth.src = l3dgw_port->lrp_networks.ea_s
4616 * should only be sent from the "redirect-chassis", so that
4617 * upstream MAC learning points to the "redirect-chassis".
4618 * Also need to avoid generation of multiple ARP responses
4619 * from different chassis. */
4620 ds_put_format(&match, " && is_chassis_resident(%s)",
4621 op->od->l3redirect_port->json_key);
4622 }
4623
4624 ds_clear(&actions);
4625 ds_put_format(&actions,
4626 "eth.dst = eth.src; "
4627 "eth.src = %s; "
4628 "arp.op = 2; /* ARP reply */ "
4629 "arp.tha = arp.sha; "
4630 "arp.sha = %s; "
4631 "arp.tpa = arp.spa; "
4632 "arp.spa = %s; "
4633 "outport = %s; "
4634 "flags.loopback = 1; "
4635 "output;",
4636 op->lrp_networks.ea_s,
4637 op->lrp_networks.ea_s,
4638 op->lrp_networks.ipv4_addrs[i].addr_s,
4639 op->json_key);
4640 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
4641 ds_cstr(&match), ds_cstr(&actions));
4642 }
4643
4644 /* A set to hold all load-balancer vips that need ARP responses. */
4645 struct sset all_ips = SSET_INITIALIZER(&all_ips);
4646 int addr_family;
4647 get_router_load_balancer_ips(op->od, &all_ips, &addr_family);
4648
4649 const char *ip_address;
4650 SSET_FOR_EACH(ip_address, &all_ips) {
4651 ds_clear(&match);
4652 if (addr_family == AF_INET) {
4653 ds_put_format(&match,
4654 "inport == %s && arp.tpa == %s && arp.op == 1",
4655 op->json_key, ip_address);
4656 } else {
4657 ds_put_format(&match,
4658 "inport == %s && nd_ns && nd.target == %s",
4659 op->json_key, ip_address);
4660 }
4661
4662 ds_clear(&actions);
4663 if (addr_family == AF_INET) {
4664 ds_put_format(&actions,
4665 "eth.dst = eth.src; "
4666 "eth.src = %s; "
4667 "arp.op = 2; /* ARP reply */ "
4668 "arp.tha = arp.sha; "
4669 "arp.sha = %s; "
4670 "arp.tpa = arp.spa; "
4671 "arp.spa = %s; "
4672 "outport = %s; "
4673 "flags.loopback = 1; "
4674 "output;",
4675 op->lrp_networks.ea_s,
4676 op->lrp_networks.ea_s,
4677 ip_address,
4678 op->json_key);
4679 } else {
4680 ds_put_format(&actions,
4681 "nd_na { "
4682 "eth.src = %s; "
4683 "ip6.src = %s; "
4684 "nd.target = %s; "
4685 "nd.tll = %s; "
4686 "outport = inport; "
4687 "flags.loopback = 1; "
4688 "output; "
4689 "};",
4690 op->lrp_networks.ea_s,
4691 ip_address,
4692 ip_address,
4693 op->lrp_networks.ea_s);
4694 }
4695 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
4696 ds_cstr(&match), ds_cstr(&actions));
4697 }
4698
4699 sset_destroy(&all_ips);
4700
4701 /* A gateway router can have 2 SNAT IP addresses to force DNATed and
4702 * LBed traffic respectively to be SNATed. In addition, there can be
4703 * a number of SNAT rules in the NAT table. */
4704 ovs_be32 *snat_ips = xmalloc(sizeof *snat_ips *
4705 (op->od->nbr->n_nat + 2));
4706 size_t n_snat_ips = 0;
4707
4708 ovs_be32 snat_ip;
4709 const char *dnat_force_snat_ip = get_force_snat_ip(op->od, "dnat",
4710 &snat_ip);
4711 if (dnat_force_snat_ip) {
4712 snat_ips[n_snat_ips++] = snat_ip;
4713 }
4714
4715 const char *lb_force_snat_ip = get_force_snat_ip(op->od, "lb",
4716 &snat_ip);
4717 if (lb_force_snat_ip) {
4718 snat_ips[n_snat_ips++] = snat_ip;
4719 }
4720
4721 for (int i = 0; i < op->od->nbr->n_nat; i++) {
4722 const struct nbrec_nat *nat;
4723
4724 nat = op->od->nbr->nat[i];
4725
4726 ovs_be32 ip;
4727 if (!ip_parse(nat->external_ip, &ip) || !ip) {
4728 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4729 VLOG_WARN_RL(&rl, "bad ip address %s in nat configuration "
4730 "for router %s", nat->external_ip, op->key);
4731 continue;
4732 }
4733
4734 if (!strcmp(nat->type, "snat")) {
4735 snat_ips[n_snat_ips++] = ip;
4736 continue;
4737 }
4738
4739 /* ARP handling for external IP addresses.
4740 *
4741 * DNAT IP addresses are external IP addresses that need ARP
4742 * handling. */
4743 ds_clear(&match);
4744 ds_put_format(&match,
4745 "inport == %s && arp.tpa == "IP_FMT" && arp.op == 1",
4746 op->json_key, IP_ARGS(ip));
4747
4748 ds_clear(&actions);
4749 ds_put_format(&actions,
4750 "eth.dst = eth.src; "
4751 "arp.op = 2; /* ARP reply */ "
4752 "arp.tha = arp.sha; ");
4753
4754 if (op->od->l3dgw_port && op == op->od->l3dgw_port) {
4755 struct eth_addr mac;
4756 if (nat->external_mac &&
4757 eth_addr_from_string(nat->external_mac, &mac)
4758 && nat->logical_port) {
4759 /* distributed NAT case, use nat->external_mac */
4760 ds_put_format(&actions,
4761 "eth.src = "ETH_ADDR_FMT"; "
4762 "arp.sha = "ETH_ADDR_FMT"; ",
4763 ETH_ADDR_ARGS(mac),
4764 ETH_ADDR_ARGS(mac));
4765 /* Traffic with eth.src = nat->external_mac should only be
4766 * sent from the chassis where nat->logical_port is
4767 * resident, so that upstream MAC learning points to the
4768 * correct chassis. Also need to avoid generation of
4769 * multiple ARP responses from different chassis. */
4770 ds_put_format(&match, " && is_chassis_resident(\"%s\")",
4771 nat->logical_port);
4772 } else {
4773 ds_put_format(&actions,
4774 "eth.src = %s; "
4775 "arp.sha = %s; ",
4776 op->lrp_networks.ea_s,
4777 op->lrp_networks.ea_s);
4778 /* Traffic with eth.src = l3dgw_port->lrp_networks.ea_s
4779 * should only be sent from the "redirect-chassis", so that
4780 * upstream MAC learning points to the "redirect-chassis".
4781 * Also need to avoid generation of multiple ARP responses
4782 * from different chassis. */
4783 if (op->od->l3redirect_port) {
4784 ds_put_format(&match, " && is_chassis_resident(%s)",
4785 op->od->l3redirect_port->json_key);
4786 }
4787 }
4788 } else {
4789 ds_put_format(&actions,
4790 "eth.src = %s; "
4791 "arp.sha = %s; ",
4792 op->lrp_networks.ea_s,
4793 op->lrp_networks.ea_s);
4794 }
4795 ds_put_format(&actions,
4796 "arp.tpa = arp.spa; "
4797 "arp.spa = "IP_FMT"; "
4798 "outport = %s; "
4799 "flags.loopback = 1; "
4800 "output;",
4801 IP_ARGS(ip),
4802 op->json_key);
4803 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
4804 ds_cstr(&match), ds_cstr(&actions));
4805 }
4806
4807 ds_clear(&match);
4808 ds_put_cstr(&match, "ip4.dst == {");
4809 bool has_drop_ips = false;
4810 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
4811 bool snat_ip_is_router_ip = false;
4812 for (int j = 0; j < n_snat_ips; j++) {
4813 /* Packets to SNAT IPs should not be dropped. */
4814 if (op->lrp_networks.ipv4_addrs[i].addr == snat_ips[j]) {
4815 snat_ip_is_router_ip = true;
4816 break;
4817 }
4818 }
4819 if (snat_ip_is_router_ip) {
4820 continue;
4821 }
4822 ds_put_format(&match, "%s, ",
4823 op->lrp_networks.ipv4_addrs[i].addr_s);
4824 has_drop_ips = true;
4825 }
4826 ds_chomp(&match, ' ');
4827 ds_chomp(&match, ',');
4828 ds_put_cstr(&match, "}");
4829
4830 if (has_drop_ips) {
4831 /* Drop IP traffic to this router. */
4832 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 60,
4833 ds_cstr(&match), "drop;");
4834 }
4835
4836 free(snat_ips);
4837 }
4838
4839 /* Logical router ingress table 1: IP Input for IPv6. */
4840 HMAP_FOR_EACH (op, key_node, ports) {
4841 if (!op->nbrp) {
4842 continue;
4843 }
4844
4845 if (op->derived) {
4846 /* No ingress packets are accepted on a chassisredirect
4847 * port, so no need to program flows for that port. */
4848 continue;
4849 }
4850
4851 if (op->lrp_networks.n_ipv6_addrs) {
4852 /* L3 admission control: drop packets that originate from an
4853 * IPv6 address owned by the router (priority 100). */
4854 ds_clear(&match);
4855 ds_put_cstr(&match, "ip6.src == ");
4856 op_put_v6_networks(&match, op);
4857 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 100,
4858 ds_cstr(&match), "drop;");
4859
4860 /* ICMPv6 echo reply. These flows reply to echo requests
4861 * received for the router's IP address. */
4862 ds_clear(&match);
4863 ds_put_cstr(&match, "ip6.dst == ");
4864 op_put_v6_networks(&match, op);
4865 ds_put_cstr(&match, " && icmp6.type == 128 && icmp6.code == 0");
4866
4867 ds_clear(&actions);
4868 ds_put_cstr(&actions,
4869 "ip6.dst <-> ip6.src; "
4870 "ip.ttl = 255; "
4871 "icmp6.type = 129; "
4872 "flags.loopback = 1; "
4873 "next; ");
4874 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
4875 ds_cstr(&match), ds_cstr(&actions));
4876
4877 /* Drop IPv6 traffic to this router. */
4878 ds_clear(&match);
4879 ds_put_cstr(&match, "ip6.dst == ");
4880 op_put_v6_networks(&match, op);
4881 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 60,
4882 ds_cstr(&match), "drop;");
4883 }
4884
4885 /* ND reply. These flows reply to ND solicitations for the
4886 * router's own IP address. */
4887 for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
4888 ds_clear(&match);
4889 ds_put_format(&match,
4890 "inport == %s && nd_ns && ip6.dst == {%s, %s} "
4891 "&& nd.target == %s",
4892 op->json_key,
4893 op->lrp_networks.ipv6_addrs[i].addr_s,
4894 op->lrp_networks.ipv6_addrs[i].sn_addr_s,
4895 op->lrp_networks.ipv6_addrs[i].addr_s);
4896 if (op->od->l3dgw_port && op == op->od->l3dgw_port
4897 && op->od->l3redirect_port) {
4898 /* Traffic with eth.src = l3dgw_port->lrp_networks.ea_s
4899 * should only be sent from the "redirect-chassis", so that
4900 * upstream MAC learning points to the "redirect-chassis".
4901 * Also need to avoid generation of multiple ND replies
4902 * from different chassis. */
4903 ds_put_format(&match, " && is_chassis_resident(%s)",
4904 op->od->l3redirect_port->json_key);
4905 }
4906
4907 ds_clear(&actions);
4908 ds_put_format(&actions,
4909 "put_nd(inport, ip6.src, nd.sll); "
4910 "nd_na { "
4911 "eth.src = %s; "
4912 "ip6.src = %s; "
4913 "nd.target = %s; "
4914 "nd.tll = %s; "
4915 "outport = inport; "
4916 "flags.loopback = 1; "
4917 "output; "
4918 "};",
4919 op->lrp_networks.ea_s,
4920 op->lrp_networks.ipv6_addrs[i].addr_s,
4921 op->lrp_networks.ipv6_addrs[i].addr_s,
4922 op->lrp_networks.ea_s);
4923 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
4924 ds_cstr(&match), ds_cstr(&actions));
4925 }
4926 }
4927
4928 /* NAT, Defrag and load balancing. */
4929 HMAP_FOR_EACH (od, key_node, datapaths) {
4930 if (!od->nbr) {
4931 continue;
4932 }
4933
4934 /* Packets are allowed by default. */
4935 ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 0, "1", "next;");
4936 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;");
4937 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;");
4938 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;");
4939 ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1", "next;");
4940 ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;");
4941
4942 /* NAT rules are only valid on Gateway routers and routers with
4943 * l3dgw_port (router has a port with "redirect-chassis"
4944 * specified). */
4945 if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) {
4946 continue;
4947 }
4948
4949 ovs_be32 snat_ip;
4950 const char *dnat_force_snat_ip = get_force_snat_ip(od, "dnat",
4951 &snat_ip);
4952 const char *lb_force_snat_ip = get_force_snat_ip(od, "lb",
4953 &snat_ip);
4954
4955 for (int i = 0; i < od->nbr->n_nat; i++) {
4956 const struct nbrec_nat *nat;
4957
4958 nat = od->nbr->nat[i];
4959
4960 ovs_be32 ip, mask;
4961
4962 char *error = ip_parse_masked(nat->external_ip, &ip, &mask);
4963 if (error || mask != OVS_BE32_MAX) {
4964 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
4965 VLOG_WARN_RL(&rl, "bad external ip %s for nat",
4966 nat->external_ip);
4967 free(error);
4968 continue;
4969 }
4970
4971 /* Check the validity of nat->logical_ip. 'logical_ip' can
4972 * be a subnet when the type is "snat". */
4973 error = ip_parse_masked(nat->logical_ip, &ip, &mask);
4974 if (!strcmp(nat->type, "snat")) {
4975 if (error) {
4976 static struct vlog_rate_limit rl =
4977 VLOG_RATE_LIMIT_INIT(5, 1);
4978 VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat "
4979 "in router "UUID_FMT"",
4980 nat->logical_ip, UUID_ARGS(&od->key));
4981 free(error);
4982 continue;
4983 }
4984 } else {
4985 if (error || mask != OVS_BE32_MAX) {
4986 static struct vlog_rate_limit rl =
4987 VLOG_RATE_LIMIT_INIT(5, 1);
4988 VLOG_WARN_RL(&rl, "bad ip %s for dnat in router "
4989 ""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key));
4990 free(error);
4991 continue;
4992 }
4993 }
4994
4995 /* For distributed router NAT, determine whether this NAT rule
4996 * satisfies the conditions for distributed NAT processing. */
4997 bool distributed = false;
4998 struct eth_addr mac;
4999 if (od->l3dgw_port && !strcmp(nat->type, "dnat_and_snat") &&
5000 nat->logical_port && nat->external_mac) {
5001 if (eth_addr_from_string(nat->external_mac, &mac)) {
5002 distributed = true;
5003 } else {
5004 static struct vlog_rate_limit rl =
5005 VLOG_RATE_LIMIT_INIT(5, 1);
5006 VLOG_WARN_RL(&rl, "bad mac %s for dnat in router "
5007 ""UUID_FMT"", nat->external_mac, UUID_ARGS(&od->key));
5008 continue;
5009 }
5010 }
5011
5012 /* Ingress UNSNAT table: It is for already established connections'
5013 * reverse traffic. i.e., SNAT has already been done in egress
5014 * pipeline and now the packet has entered the ingress pipeline as
5015 * part of a reply. We undo the SNAT here.
5016 *
5017 * Undoing SNAT has to happen before DNAT processing. This is
5018 * because when the packet was DNATed in ingress pipeline, it did
5019 * not know about the possibility of eventual additional SNAT in
5020 * egress pipeline. */
5021 if (!strcmp(nat->type, "snat")
5022 || !strcmp(nat->type, "dnat_and_snat")) {
5023 if (!od->l3dgw_port) {
5024 /* Gateway router. */
5025 ds_clear(&match);
5026 ds_put_format(&match, "ip && ip4.dst == %s",
5027 nat->external_ip);
5028 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 90,
5029 ds_cstr(&match), "ct_snat; next;");
5030 } else {
5031 /* Distributed router. */
5032
5033 /* Traffic received on l3dgw_port is subject to NAT. */
5034 ds_clear(&match);
5035 ds_put_format(&match, "ip && ip4.dst == %s"
5036 " && inport == %s",
5037 nat->external_ip,
5038 od->l3dgw_port->json_key);
5039 if (!distributed && od->l3redirect_port) {
5040 /* Flows for NAT rules that are centralized are only
5041 * programmed on the "redirect-chassis". */
5042 ds_put_format(&match, " && is_chassis_resident(%s)",
5043 od->l3redirect_port->json_key);
5044 }
5045 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 100,
5046 ds_cstr(&match), "ct_snat;");
5047
5048 /* Traffic received on other router ports must be
5049 * redirected to the central instance of the l3dgw_port
5050 * for NAT processing. */
5051 ds_clear(&match);
5052 ds_put_format(&match, "ip && ip4.dst == %s",
5053 nat->external_ip);
5054 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 50,
5055 ds_cstr(&match),
5056 REGBIT_NAT_REDIRECT" = 1; next;");
5057 }
5058 }
5059
5060 /* Ingress DNAT table: Packets enter the pipeline with destination
5061 * IP address that needs to be DNATted from an external IP address
5062 * to a logical IP address. */
5063 if (!strcmp(nat->type, "dnat")
5064 || !strcmp(nat->type, "dnat_and_snat")) {
5065 if (!od->l3dgw_port) {
5066 /* Gateway router. */
5067 /* Packet when it goes from the initiator to destination.
5068 * We need to set flags.loopback because the router can
5069 * send the packet back through the same interface. */
5070 ds_clear(&match);
5071 ds_put_format(&match, "ip && ip4.dst == %s",
5072 nat->external_ip);
5073 ds_clear(&actions);
5074 if (dnat_force_snat_ip) {
5075 /* Indicate to the future tables that a DNAT has taken
5076 * place and a force SNAT needs to be done in the
5077 * Egress SNAT table. */
5078 ds_put_format(&actions,
5079 "flags.force_snat_for_dnat = 1; ");
5080 }
5081 ds_put_format(&actions, "flags.loopback = 1; ct_dnat(%s);",
5082 nat->logical_ip);
5083 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 100,
5084 ds_cstr(&match), ds_cstr(&actions));
5085 } else {
5086 /* Distributed router. */
5087
5088 /* Traffic received on l3dgw_port is subject to NAT. */
5089 ds_clear(&match);
5090 ds_put_format(&match, "ip && ip4.dst == %s"
5091 " && inport == %s",
5092 nat->external_ip,
5093 od->l3dgw_port->json_key);
5094 if (!distributed && od->l3redirect_port) {
5095 /* Flows for NAT rules that are centralized are only
5096 * programmed on the "redirect-chassis". */
5097 ds_put_format(&match, " && is_chassis_resident(%s)",
5098 od->l3redirect_port->json_key);
5099 }
5100 ds_clear(&actions);
5101 ds_put_format(&actions, "ct_dnat(%s);",
5102 nat->logical_ip);
5103 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 100,
5104 ds_cstr(&match), ds_cstr(&actions));
5105
5106 /* Traffic received on other router ports must be
5107 * redirected to the central instance of the l3dgw_port
5108 * for NAT processing. */
5109 ds_clear(&match);
5110 ds_put_format(&match, "ip && ip4.dst == %s",
5111 nat->external_ip);
5112 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
5113 ds_cstr(&match),
5114 REGBIT_NAT_REDIRECT" = 1; next;");
5115 }
5116 }
5117
5118 /* Egress UNDNAT table: It is for already established connections'
5119 * reverse traffic. i.e., DNAT has already been done in ingress
5120 * pipeline and now the packet has entered the egress pipeline as
5121 * part of a reply. We undo the DNAT here.
5122 *
5123 * Note that this only applies for NAT on a distributed router.
5124 * Undoing DNAT on a gateway router is done in the ingress DNAT
5125 * pipeline stage. */
5126 if (od->l3dgw_port && (!strcmp(nat->type, "dnat")
5127 || !strcmp(nat->type, "dnat_and_snat"))) {
5128 ds_clear(&match);
5129 ds_put_format(&match, "ip && ip4.src == %s"
5130 " && outport == %s",
5131 nat->logical_ip,
5132 od->l3dgw_port->json_key);
5133 if (!distributed && od->l3redirect_port) {
5134 /* Flows for NAT rules that are centralized are only
5135 * programmed on the "redirect-chassis". */
5136 ds_put_format(&match, " && is_chassis_resident(%s)",
5137 od->l3redirect_port->json_key);
5138 }
5139 ds_clear(&actions);
5140 if (distributed) {
5141 ds_put_format(&actions, "eth.src = "ETH_ADDR_FMT"; ",
5142 ETH_ADDR_ARGS(mac));
5143 }
5144 ds_put_format(&actions, "ct_dnat;");
5145 ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 100,
5146 ds_cstr(&match), ds_cstr(&actions));
5147 }
5148
5149 /* Egress SNAT table: Packets enter the egress pipeline with
5150 * source ip address that needs to be SNATted to an external ip
5151 * address. */
5152 if (!strcmp(nat->type, "snat")
5153 || !strcmp(nat->type, "dnat_and_snat")) {
5154 if (!od->l3dgw_port) {
5155 /* Gateway router. */
5156 ds_clear(&match);
5157 ds_put_format(&match, "ip && ip4.src == %s",
5158 nat->logical_ip);
5159 ds_clear(&actions);
5160 ds_put_format(&actions, "ct_snat(%s);", nat->external_ip);
5161
5162 /* The priority here is calculated such that the
5163 * nat->logical_ip with the longest mask gets a higher
5164 * priority. */
5165 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT,
5166 count_1bits(ntohl(mask)) + 1,
5167 ds_cstr(&match), ds_cstr(&actions));
5168 } else {
5169 /* Distributed router. */
5170 ds_clear(&match);
5171 ds_put_format(&match, "ip && ip4.src == %s"
5172 " && outport == %s",
5173 nat->logical_ip,
5174 od->l3dgw_port->json_key);
5175 if (!distributed && od->l3redirect_port) {
5176 /* Flows for NAT rules that are centralized are only
5177 * programmed on the "redirect-chassis". */
5178 ds_put_format(&match, " && is_chassis_resident(%s)",
5179 od->l3redirect_port->json_key);
5180 }
5181 ds_clear(&actions);
5182 if (distributed) {
5183 ds_put_format(&actions, "eth.src = "ETH_ADDR_FMT"; ",
5184 ETH_ADDR_ARGS(mac));
5185 }
5186 ds_put_format(&actions, "ct_snat(%s);", nat->external_ip);
5187
5188 /* The priority here is calculated such that the
5189 * nat->logical_ip with the longest mask gets a higher
5190 * priority. */
5191 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT,
5192 count_1bits(ntohl(mask)) + 1,
5193 ds_cstr(&match), ds_cstr(&actions));
5194 }
5195 }
5196
5197 /* Logical router ingress table 0:
5198 * For NAT on a distributed router, add rules allowing
5199 * ingress traffic with eth.dst matching nat->external_mac
5200 * on the l3dgw_port instance where nat->logical_port is
5201 * resident. */
5202 if (distributed) {
5203 ds_clear(&match);
5204 ds_put_format(&match,
5205 "eth.dst == "ETH_ADDR_FMT" && inport == %s"
5206 " && is_chassis_resident(\"%s\")",
5207 ETH_ADDR_ARGS(mac),
5208 od->l3dgw_port->json_key,
5209 nat->logical_port);
5210 ovn_lflow_add(lflows, od, S_ROUTER_IN_ADMISSION, 50,
5211 ds_cstr(&match), "next;");
5212 }
5213
5214 /* Ingress Gateway Redirect Table: For NAT on a distributed
5215 * router, add flows that are specific to a NAT rule. These
5216 * flows indicate the presence of an applicable NAT rule that
5217 * can be applied in a distributed manner. */
5218 if (distributed) {
5219 ds_clear(&match);
5220 ds_put_format(&match, "ip4.src == %s && outport == %s",
5221 nat->logical_ip,
5222 od->l3dgw_port->json_key);
5223 ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 100,
5224 ds_cstr(&match), "next;");
5225 }
5226
5227 /* Egress Loopback table: For NAT on a distributed router.
5228 * If packets in the egress pipeline on the distributed
5229 * gateway port have ip.dst matching a NAT external IP, then
5230 * loop a clone of the packet back to the beginning of the
5231 * ingress pipeline with inport = outport. */
5232 if (od->l3dgw_port) {
5233 /* Distributed router. */
5234 ds_clear(&match);
5235 ds_put_format(&match, "ip4.dst == %s && outport == %s",
5236 nat->external_ip,
5237 od->l3dgw_port->json_key);
5238 ds_clear(&actions);
5239 ds_put_format(&actions,
5240 "clone { ct_clear; "
5241 "inport = outport; outport = \"\"; "
5242 "flags = 0; flags.loopback = 1; ");
5243 for (int j = 0; j < MFF_N_LOG_REGS; j++) {
5244 ds_put_format(&actions, "reg%d = 0; ", j);
5245 }
5246 ds_put_format(&actions, REGBIT_EGRESS_LOOPBACK" = 1; "
5247 "next(pipeline=ingress, table=0); };");
5248 ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 100,
5249 ds_cstr(&match), ds_cstr(&actions));
5250 }
5251 }
5252
5253 /* Handle force SNAT options set in the gateway router. */
5254 if (dnat_force_snat_ip && !od->l3dgw_port) {
5255 /* If a packet with destination IP address as that of the
5256 * gateway router (as set in options:dnat_force_snat_ip) is seen,
5257 * UNSNAT it. */
5258 ds_clear(&match);
5259 ds_put_format(&match, "ip && ip4.dst == %s", dnat_force_snat_ip);
5260 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 110,
5261 ds_cstr(&match), "ct_snat; next;");
5262
5263 /* Higher priority rules to force SNAT with the IP addresses
5264 * configured in the Gateway router. This only takes effect
5265 * when the packet has already been DNATed once. */
5266 ds_clear(&match);
5267 ds_put_format(&match, "flags.force_snat_for_dnat == 1 && ip");
5268 ds_clear(&actions);
5269 ds_put_format(&actions, "ct_snat(%s);", dnat_force_snat_ip);
5270 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 100,
5271 ds_cstr(&match), ds_cstr(&actions));
5272 }
5273 if (lb_force_snat_ip && !od->l3dgw_port) {
5274 /* If a packet with destination IP address as that of the
5275 * gateway router (as set in options:lb_force_snat_ip) is seen,
5276 * UNSNAT it. */
5277 ds_clear(&match);
5278 ds_put_format(&match, "ip && ip4.dst == %s", lb_force_snat_ip);
5279 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 100,
5280 ds_cstr(&match), "ct_snat; next;");
5281
5282 /* Load balanced traffic will have flags.force_snat_for_lb set.
5283 * Force SNAT it. */
5284 ds_clear(&match);
5285 ds_put_format(&match, "flags.force_snat_for_lb == 1 && ip");
5286 ds_clear(&actions);
5287 ds_put_format(&actions, "ct_snat(%s);", lb_force_snat_ip);
5288 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 100,
5289 ds_cstr(&match), ds_cstr(&actions));
5290 }
5291
5292 if (!od->l3dgw_port) {
5293 /* For gateway router, re-circulate every packet through
5294 * the DNAT zone. This helps with two things.
5295 *
5296 * 1. Any packet that needs to be unDNATed in the reverse
5297 * direction gets unDNATed. Ideally this could be done in
5298 * the egress pipeline. But since the gateway router
5299 * does not have any feature that depends on the source
5300 * ip address being external IP address for IP routing,
5301 * we can do it here, saving a future re-circulation.
5302 *
5303 * 2. Any packet that was sent through SNAT zone in the
5304 * previous table automatically gets re-circulated to get
5305 * back the new destination IP address that is needed for
5306 * routing in the openflow pipeline. */
5307 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
5308 "ip", "flags.loopback = 1; ct_dnat;");
5309 } else {
5310 /* For NAT on a distributed router, add flows to Ingress
5311 * IP Routing table, Ingress ARP Resolution table, and
5312 * Ingress Gateway Redirect Table that are not specific to a
5313 * NAT rule. */
5314
5315 /* The highest priority IN_IP_ROUTING rule matches packets
5316 * with REGBIT_NAT_REDIRECT (set in DNAT or UNSNAT stages),
5317 * with action "ip.ttl--; next;". The IN_GW_REDIRECT table
5318 * will take care of setting the outport. */
5319 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 300,
5320 REGBIT_NAT_REDIRECT" == 1", "ip.ttl--; next;");
5321
5322 /* The highest priority IN_ARP_RESOLVE rule matches packets
5323 * with REGBIT_NAT_REDIRECT (set in DNAT or UNSNAT stages),
5324 * then sets eth.dst to the distributed gateway port's
5325 * ethernet address. */
5326 ds_clear(&actions);
5327 ds_put_format(&actions, "eth.dst = %s; next;",
5328 od->l3dgw_port->lrp_networks.ea_s);
5329 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 200,
5330 REGBIT_NAT_REDIRECT" == 1", ds_cstr(&actions));
5331
5332 /* The highest priority IN_GW_REDIRECT rule redirects packets
5333 * with REGBIT_NAT_REDIRECT (set in DNAT or UNSNAT stages) to
5334 * the central instance of the l3dgw_port for NAT processing. */
5335 ds_clear(&actions);
5336 ds_put_format(&actions, "outport = %s; next;",
5337 od->l3redirect_port->json_key);
5338 ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 200,
5339 REGBIT_NAT_REDIRECT" == 1", ds_cstr(&actions));
5340 }
5341
5342 /* Load balancing and packet defrag are only valid on
5343 * Gateway routers or router with gateway port. */
5344 if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) {
5345 continue;
5346 }
5347
5348 /* A set to hold all ips that need defragmentation and tracking. */
5349 struct sset all_ips = SSET_INITIALIZER(&all_ips);
5350
5351 for (int i = 0; i < od->nbr->n_load_balancer; i++) {
5352 struct nbrec_load_balancer *lb = od->nbr->load_balancer[i];
5353 struct smap *vips = &lb->vips;
5354 struct smap_node *node;
5355
5356 SMAP_FOR_EACH (node, vips) {
5357 uint16_t port = 0;
5358 int addr_family;
5359
5360 /* node->key contains IP:port or just IP. */
5361 char *ip_address = NULL;
5362 ip_address_and_port_from_lb_key(node->key, &ip_address, &port,
5363 &addr_family);
5364 if (!ip_address) {
5365 continue;
5366 }
5367
5368 if (!sset_contains(&all_ips, ip_address)) {
5369 sset_add(&all_ips, ip_address);
5370 /* If there are any load balancing rules, we should send
5371 * the packet to conntrack for defragmentation and
5372 * tracking. This helps with two things.
5373 *
5374 * 1. With tracking, we can send only new connections to
5375 * pick a DNAT ip address from a group.
5376 * 2. If there are L4 ports in load balancing rules, we
5377 * need the defragmentation to match on L4 ports. */
5378 ds_clear(&match);
5379 if (addr_family == AF_INET) {
5380 ds_put_format(&match, "ip && ip4.dst == %s",
5381 ip_address);
5382 } else {
5383 ds_put_format(&match, "ip && ip6.dst == %s",
5384 ip_address);
5385 }
5386 ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG,
5387 100, ds_cstr(&match), "ct_next;");
5388 }
5389
5390 /* Higher priority rules are added for load-balancing in DNAT
5391 * table. For every match (on a VIP[:port]), we add two flows
5392 * via add_router_lb_flow(). One flow is for specific matching
5393 * on ct.new with an action of "ct_lb($targets);". The other
5394 * flow is for ct.est with an action of "ct_dnat;". */
5395 ds_clear(&actions);
5396 ds_put_format(&actions, "ct_lb(%s);", node->value);
5397
5398 ds_clear(&match);
5399 if (addr_family == AF_INET) {
5400 ds_put_format(&match, "ip && ip4.dst == %s",
5401 ip_address);
5402 } else {
5403 ds_put_format(&match, "ip && ip6.dst == %s",
5404 ip_address);
5405 }
5406 free(ip_address);
5407
5408 int prio = 110;
5409 bool is_udp = lb->protocol && !strcmp(lb->protocol, "udp") ?
5410 true : false;
5411 if (port) {
5412 if (is_udp) {
5413 ds_put_format(&match, " && udp && udp.dst == %d",
5414 port);
5415 } else {
5416 ds_put_format(&match, " && tcp && tcp.dst == %d",
5417 port);
5418 }
5419 prio = 120;
5420 }
5421
5422 if (od->l3redirect_port) {
5423 ds_put_format(&match, " && is_chassis_resident(%s)",
5424 od->l3redirect_port->json_key);
5425 }
5426 add_router_lb_flow(lflows, od, &match, &actions, prio,
5427 lb_force_snat_ip, node->value, is_udp,
5428 addr_family);
5429 }
5430 }
5431 sset_destroy(&all_ips);
5432 }
5433
5434 /* Logical router ingress table 5: IP Routing.
5435 *
5436 * A packet that arrives at this table is an IP packet that should be
5437 * routed to the address in 'ip[46].dst'. This table sets outport to
5438 * the correct output port, eth.src to the output port's MAC
5439 * address, and '[xx]reg0' to the next-hop IP address (leaving
5440 * 'ip[46].dst', the packet’s final destination, unchanged), and
5441 * advances to the next table for ARP/ND resolution. */
5442 HMAP_FOR_EACH (op, key_node, ports) {
5443 if (!op->nbrp) {
5444 continue;
5445 }
5446
5447 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
5448 add_route(lflows, op, op->lrp_networks.ipv4_addrs[i].addr_s,
5449 op->lrp_networks.ipv4_addrs[i].network_s,
5450 op->lrp_networks.ipv4_addrs[i].plen, NULL, NULL);
5451 }
5452
5453 for (int i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
5454 add_route(lflows, op, op->lrp_networks.ipv6_addrs[i].addr_s,
5455 op->lrp_networks.ipv6_addrs[i].network_s,
5456 op->lrp_networks.ipv6_addrs[i].plen, NULL, NULL);
5457 }
5458 }
5459
5460 /* Convert the static routes to flows. */
5461 HMAP_FOR_EACH (od, key_node, datapaths) {
5462 if (!od->nbr) {
5463 continue;
5464 }
5465
5466 for (int i = 0; i < od->nbr->n_static_routes; i++) {
5467 const struct nbrec_logical_router_static_route *route;
5468
5469 route = od->nbr->static_routes[i];
5470 build_static_route_flow(lflows, od, ports, route);
5471 }
5472 }
5473
5474 /* XXX destination unreachable */
5475
5476 /* Logical router ingress table 6: ARP Resolution.
5477 *
5478 * Any packet that reaches this table is an IP packet whose next-hop IP
5479 * address is in reg0. (ip4.dst is the final destination.) This table
5480 * resolves the IP address in reg0 into an output port in outport and an
5481 * Ethernet address in eth.dst. */
5482 HMAP_FOR_EACH (op, key_node, ports) {
5483 if (op->nbsp && !lsp_is_enabled(op->nbsp)) {
5484 continue;
5485 }
5486
5487 if (op->nbrp) {
5488 /* This is a logical router port. If next-hop IP address in
5489 * '[xx]reg0' matches IP address of this router port, then
5490 * the packet is intended to eventually be sent to this
5491 * logical port. Set the destination mac address using this
5492 * port's mac address.
5493 *
5494 * The packet is still in peer's logical pipeline. So the match
5495 * should be on peer's outport. */
5496 if (op->peer && op->nbrp->peer) {
5497 if (op->lrp_networks.n_ipv4_addrs) {
5498 ds_clear(&match);
5499 ds_put_format(&match, "outport == %s && reg0 == ",
5500 op->peer->json_key);
5501 op_put_v4_networks(&match, op, false);
5502
5503 ds_clear(&actions);
5504 ds_put_format(&actions, "eth.dst = %s; next;",
5505 op->lrp_networks.ea_s);
5506 ovn_lflow_add(lflows, op->peer->od, S_ROUTER_IN_ARP_RESOLVE,
5507 100, ds_cstr(&match), ds_cstr(&actions));
5508 }
5509
5510 if (op->lrp_networks.n_ipv6_addrs) {
5511 ds_clear(&match);
5512 ds_put_format(&match, "outport == %s && xxreg0 == ",
5513 op->peer->json_key);
5514 op_put_v6_networks(&match, op);
5515
5516 ds_clear(&actions);
5517 ds_put_format(&actions, "eth.dst = %s; next;",
5518 op->lrp_networks.ea_s);
5519 ovn_lflow_add(lflows, op->peer->od, S_ROUTER_IN_ARP_RESOLVE,
5520 100, ds_cstr(&match), ds_cstr(&actions));
5521 }
5522 }
5523 } else if (op->od->n_router_ports && strcmp(op->nbsp->type, "router")) {
5524 /* This is a logical switch port that backs a VM or a container.
5525 * Extract its addresses. For each of the address, go through all
5526 * the router ports attached to the switch (to which this port
5527 * connects) and if the address in question is reachable from the
5528 * router port, add an ARP/ND entry in that router's pipeline. */
5529
5530 for (size_t i = 0; i < op->n_lsp_addrs; i++) {
5531 const char *ea_s = op->lsp_addrs[i].ea_s;
5532 for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
5533 const char *ip_s = op->lsp_addrs[i].ipv4_addrs[j].addr_s;
5534 for (size_t k = 0; k < op->od->n_router_ports; k++) {
5535 /* Get the Logical_Router_Port that the
5536 * Logical_Switch_Port is connected to, as
5537 * 'peer'. */
5538 const char *peer_name = smap_get(
5539 &op->od->router_ports[k]->nbsp->options,
5540 "router-port");
5541 if (!peer_name) {
5542 continue;
5543 }
5544
5545 struct ovn_port *peer = ovn_port_find(ports, peer_name);
5546 if (!peer || !peer->nbrp) {
5547 continue;
5548 }
5549
5550 if (!find_lrp_member_ip(peer, ip_s)) {
5551 continue;
5552 }
5553
5554 ds_clear(&match);
5555 ds_put_format(&match, "outport == %s && reg0 == %s",
5556 peer->json_key, ip_s);
5557
5558 ds_clear(&actions);
5559 ds_put_format(&actions, "eth.dst = %s; next;", ea_s);
5560 ovn_lflow_add(lflows, peer->od,
5561 S_ROUTER_IN_ARP_RESOLVE, 100,
5562 ds_cstr(&match), ds_cstr(&actions));
5563 }
5564 }
5565
5566 for (size_t j = 0; j < op->lsp_addrs[i].n_ipv6_addrs; j++) {
5567 const char *ip_s = op->lsp_addrs[i].ipv6_addrs[j].addr_s;
5568 for (size_t k = 0; k < op->od->n_router_ports; k++) {
5569 /* Get the Logical_Router_Port that the
5570 * Logical_Switch_Port is connected to, as
5571 * 'peer'. */
5572 const char *peer_name = smap_get(
5573 &op->od->router_ports[k]->nbsp->options,
5574 "router-port");
5575 if (!peer_name) {
5576 continue;
5577 }
5578
5579 struct ovn_port *peer = ovn_port_find(ports, peer_name);
5580 if (!peer || !peer->nbrp) {
5581 continue;
5582 }
5583
5584 if (!find_lrp_member_ip(peer, ip_s)) {
5585 continue;
5586 }
5587
5588 ds_clear(&match);
5589 ds_put_format(&match, "outport == %s && xxreg0 == %s",
5590 peer->json_key, ip_s);
5591
5592 ds_clear(&actions);
5593 ds_put_format(&actions, "eth.dst = %s; next;", ea_s);
5594 ovn_lflow_add(lflows, peer->od,
5595 S_ROUTER_IN_ARP_RESOLVE, 100,
5596 ds_cstr(&match), ds_cstr(&actions));
5597 }
5598 }
5599 }
5600 } else if (!strcmp(op->nbsp->type, "router")) {
5601 /* This is a logical switch port that connects to a router. */
5602
5603 /* The peer of this switch port is the router port for which
5604 * we need to add logical flows such that it can resolve
5605 * ARP entries for all the other router ports connected to
5606 * the switch in question. */
5607
5608 const char *peer_name = smap_get(&op->nbsp->options,
5609 "router-port");
5610 if (!peer_name) {
5611 continue;
5612 }
5613
5614 struct ovn_port *peer = ovn_port_find(ports, peer_name);
5615 if (!peer || !peer->nbrp) {
5616 continue;
5617 }
5618
5619 for (size_t i = 0; i < op->od->n_router_ports; i++) {
5620 const char *router_port_name = smap_get(
5621 &op->od->router_ports[i]->nbsp->options,
5622 "router-port");
5623 struct ovn_port *router_port = ovn_port_find(ports,
5624 router_port_name);
5625 if (!router_port || !router_port->nbrp) {
5626 continue;
5627 }
5628
5629 /* Skip the router port under consideration. */
5630 if (router_port == peer) {
5631 continue;
5632 }
5633
5634 if (router_port->lrp_networks.n_ipv4_addrs) {
5635 ds_clear(&match);
5636 ds_put_format(&match, "outport == %s && reg0 == ",
5637 peer->json_key);
5638 op_put_v4_networks(&match, router_port, false);
5639
5640 ds_clear(&actions);
5641 ds_put_format(&actions, "eth.dst = %s; next;",
5642 router_port->lrp_networks.ea_s);
5643 ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
5644 100, ds_cstr(&match), ds_cstr(&actions));
5645 }
5646
5647 if (router_port->lrp_networks.n_ipv6_addrs) {
5648 ds_clear(&match);
5649 ds_put_format(&match, "outport == %s && xxreg0 == ",
5650 peer->json_key);
5651 op_put_v6_networks(&match, router_port);
5652
5653 ds_clear(&actions);
5654 ds_put_format(&actions, "eth.dst = %s; next;",
5655 router_port->lrp_networks.ea_s);
5656 ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
5657 100, ds_cstr(&match), ds_cstr(&actions));
5658 }
5659 }
5660 }
5661 }
5662
5663 HMAP_FOR_EACH (od, key_node, datapaths) {
5664 if (!od->nbr) {
5665 continue;
5666 }
5667
5668 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 0, "ip4",
5669 "get_arp(outport, reg0); next;");
5670
5671 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 0, "ip6",
5672 "get_nd(outport, xxreg0); next;");
5673 }
5674
5675 /* Logical router ingress table 7: Gateway redirect.
5676 *
5677 * For traffic with outport equal to the l3dgw_port
5678 * on a distributed router, this table redirects a subset
5679 * of the traffic to the l3redirect_port which represents
5680 * the central instance of the l3dgw_port.
5681 */
5682 HMAP_FOR_EACH (od, key_node, datapaths) {
5683 if (!od->nbr) {
5684 continue;
5685 }
5686 if (od->l3dgw_port && od->l3redirect_port) {
5687 /* For traffic with outport == l3dgw_port, if the
5688 * packet did not match any higher priority redirect
5689 * rule, then the traffic is redirected to the central
5690 * instance of the l3dgw_port. */
5691 ds_clear(&match);
5692 ds_put_format(&match, "outport == %s",
5693 od->l3dgw_port->json_key);
5694 ds_clear(&actions);
5695 ds_put_format(&actions, "outport = %s; next;",
5696 od->l3redirect_port->json_key);
5697 ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 50,
5698 ds_cstr(&match), ds_cstr(&actions));
5699
5700 /* If the Ethernet destination has not been resolved,
5701 * redirect to the central instance of the l3dgw_port.
5702 * Such traffic will be replaced by an ARP request or ND
5703 * Neighbor Solicitation in the ARP request ingress
5704 * table, before being redirected to the central instance.
5705 */
5706 ds_put_format(&match, " && eth.dst == 00:00:00:00:00:00");
5707 ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 150,
5708 ds_cstr(&match), ds_cstr(&actions));
5709 }
5710
5711 /* Packets are allowed by default. */
5712 ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 0, "1", "next;");
5713 }
5714
5715 /* Local router ingress table 8: ARP request.
5716 *
5717 * In the common case where the Ethernet destination has been resolved,
5718 * this table outputs the packet (priority 0). Otherwise, it composes
5719 * and sends an ARP request (priority 100). */
5720 HMAP_FOR_EACH (od, key_node, datapaths) {
5721 if (!od->nbr) {
5722 continue;
5723 }
5724
5725 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 100,
5726 "eth.dst == 00:00:00:00:00:00",
5727 "arp { "
5728 "eth.dst = ff:ff:ff:ff:ff:ff; "
5729 "arp.spa = reg1; "
5730 "arp.tpa = reg0; "
5731 "arp.op = 1; " /* ARP request */
5732 "output; "
5733 "};");
5734 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
5735 }
5736
5737 /* Logical router egress table 1: Delivery (priority 100).
5738 *
5739 * Priority 100 rules deliver packets to enabled logical ports. */
5740 HMAP_FOR_EACH (op, key_node, ports) {
5741 if (!op->nbrp) {
5742 continue;
5743 }
5744
5745 if (!lrport_is_enabled(op->nbrp)) {
5746 /* Drop packets to disabled logical ports (since logical flow
5747 * tables are default-drop). */
5748 continue;
5749 }
5750
5751 if (op->derived) {
5752 /* No egress packets should be processed in the context of
5753 * a chassisredirect port. The chassisredirect port should
5754 * be replaced by the l3dgw port in the local output
5755 * pipeline stage before egress processing. */
5756 continue;
5757 }
5758
5759 ds_clear(&match);
5760 ds_put_format(&match, "outport == %s", op->json_key);
5761 ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
5762 ds_cstr(&match), "output;");
5763 }
5764
5765 ds_destroy(&match);
5766 ds_destroy(&actions);
5767 }
5768
5769 /* Updates the Logical_Flow and Multicast_Group tables in the OVN_SB database,
5770 * constructing their contents based on the OVN_NB database. */
5771 static void
5772 build_lflows(struct northd_context *ctx, struct hmap *datapaths,
5773 struct hmap *ports)
5774 {
5775 struct hmap lflows = HMAP_INITIALIZER(&lflows);
5776 struct hmap mcgroups = HMAP_INITIALIZER(&mcgroups);
5777
5778 build_lswitch_flows(datapaths, ports, &lflows, &mcgroups);
5779 build_lrouter_flows(datapaths, ports, &lflows);
5780
5781 /* Push changes to the Logical_Flow table to database. */
5782 const struct sbrec_logical_flow *sbflow, *next_sbflow;
5783 SBREC_LOGICAL_FLOW_FOR_EACH_SAFE (sbflow, next_sbflow, ctx->ovnsb_idl) {
5784 struct ovn_datapath *od
5785 = ovn_datapath_from_sbrec(datapaths, sbflow->logical_datapath);
5786 if (!od) {
5787 sbrec_logical_flow_delete(sbflow);
5788 continue;
5789 }
5790
5791 enum ovn_datapath_type dp_type = od->nbs ? DP_SWITCH : DP_ROUTER;
5792 enum ovn_pipeline pipeline
5793 = !strcmp(sbflow->pipeline, "ingress") ? P_IN : P_OUT;
5794 struct ovn_lflow *lflow = ovn_lflow_find(
5795 &lflows, od, ovn_stage_build(dp_type, pipeline, sbflow->table_id),
5796 sbflow->priority, sbflow->match, sbflow->actions);
5797 if (lflow) {
5798 ovn_lflow_destroy(&lflows, lflow);
5799 } else {
5800 sbrec_logical_flow_delete(sbflow);
5801 }
5802 }
5803 struct ovn_lflow *lflow, *next_lflow;
5804 HMAP_FOR_EACH_SAFE (lflow, next_lflow, hmap_node, &lflows) {
5805 enum ovn_pipeline pipeline = ovn_stage_get_pipeline(lflow->stage);
5806 uint8_t table = ovn_stage_get_table(lflow->stage);
5807
5808 sbflow = sbrec_logical_flow_insert(ctx->ovnsb_txn);
5809 sbrec_logical_flow_set_logical_datapath(sbflow, lflow->od->sb);
5810 sbrec_logical_flow_set_pipeline(
5811 sbflow, pipeline == P_IN ? "ingress" : "egress");
5812 sbrec_logical_flow_set_table_id(sbflow, table);
5813 sbrec_logical_flow_set_priority(sbflow, lflow->priority);
5814 sbrec_logical_flow_set_match(sbflow, lflow->match);
5815 sbrec_logical_flow_set_actions(sbflow, lflow->actions);
5816
5817 /* Trim the source locator lflow->where, which looks something like
5818 * "ovn/northd/ovn-northd.c:1234", down to just the part following the
5819 * last slash, e.g. "ovn-northd.c:1234". */
5820 const char *slash = strrchr(lflow->where, '/');
5821 #if _WIN32
5822 const char *backslash = strrchr(lflow->where, '\\');
5823 if (!slash || backslash > slash) {
5824 slash = backslash;
5825 }
5826 #endif
5827 const char *where = slash ? slash + 1 : lflow->where;
5828
5829 struct smap ids = SMAP_INITIALIZER(&ids);
5830 smap_add(&ids, "stage-name", ovn_stage_to_str(lflow->stage));
5831 smap_add(&ids, "source", where);
5832 if (lflow->stage_hint) {
5833 smap_add(&ids, "stage-hint", lflow->stage_hint);
5834 }
5835 sbrec_logical_flow_set_external_ids(sbflow, &ids);
5836 smap_destroy(&ids);
5837
5838 ovn_lflow_destroy(&lflows, lflow);
5839 }
5840 hmap_destroy(&lflows);
5841
5842 /* Push changes to the Multicast_Group table to database. */
5843 const struct sbrec_multicast_group *sbmc, *next_sbmc;
5844 SBREC_MULTICAST_GROUP_FOR_EACH_SAFE (sbmc, next_sbmc, ctx->ovnsb_idl) {
5845 struct ovn_datapath *od = ovn_datapath_from_sbrec(datapaths,
5846 sbmc->datapath);
5847 if (!od) {
5848 sbrec_multicast_group_delete(sbmc);
5849 continue;
5850 }
5851
5852 struct multicast_group group = { .name = sbmc->name,
5853 .key = sbmc->tunnel_key };
5854 struct ovn_multicast *mc = ovn_multicast_find(&mcgroups, od, &group);
5855 if (mc) {
5856 ovn_multicast_update_sbrec(mc, sbmc);
5857 ovn_multicast_destroy(&mcgroups, mc);
5858 } else {
5859 sbrec_multicast_group_delete(sbmc);
5860 }
5861 }
5862 struct ovn_multicast *mc, *next_mc;
5863 HMAP_FOR_EACH_SAFE (mc, next_mc, hmap_node, &mcgroups) {
5864 sbmc = sbrec_multicast_group_insert(ctx->ovnsb_txn);
5865 sbrec_multicast_group_set_datapath(sbmc, mc->datapath->sb);
5866 sbrec_multicast_group_set_name(sbmc, mc->group->name);
5867 sbrec_multicast_group_set_tunnel_key(sbmc, mc->group->key);
5868 ovn_multicast_update_sbrec(mc, sbmc);
5869 ovn_multicast_destroy(&mcgroups, mc);
5870 }
5871 hmap_destroy(&mcgroups);
5872 }
5873
5874 /* OVN_Northbound and OVN_Southbound have an identical Address_Set table.
5875 * We always update OVN_Southbound to match the current data in
5876 * OVN_Northbound, so that the address sets used in Logical_Flows in
5877 * OVN_Southbound is checked against the proper set.*/
5878 static void
5879 sync_address_sets(struct northd_context *ctx)
5880 {
5881 struct shash sb_address_sets = SHASH_INITIALIZER(&sb_address_sets);
5882
5883 const struct sbrec_address_set *sb_address_set;
5884 SBREC_ADDRESS_SET_FOR_EACH (sb_address_set, ctx->ovnsb_idl) {
5885 shash_add(&sb_address_sets, sb_address_set->name, sb_address_set);
5886 }
5887
5888 const struct nbrec_address_set *nb_address_set;
5889 NBREC_ADDRESS_SET_FOR_EACH (nb_address_set, ctx->ovnnb_idl) {
5890 sb_address_set = shash_find_and_delete(&sb_address_sets,
5891 nb_address_set->name);
5892 if (!sb_address_set) {
5893 sb_address_set = sbrec_address_set_insert(ctx->ovnsb_txn);
5894 sbrec_address_set_set_name(sb_address_set, nb_address_set->name);
5895 }
5896
5897 sbrec_address_set_set_addresses(sb_address_set,
5898 /* "char **" is not compatible with "const char **" */
5899 (const char **) nb_address_set->addresses,
5900 nb_address_set->n_addresses);
5901 }
5902
5903 struct shash_node *node, *next;
5904 SHASH_FOR_EACH_SAFE (node, next, &sb_address_sets) {
5905 sbrec_address_set_delete(node->data);
5906 shash_delete(&sb_address_sets, node);
5907 }
5908 shash_destroy(&sb_address_sets);
5909 }
5910
/*
 * struct 'dns_info' is used to sync the DNS records between the OVN
 * Northbound db and the Southbound db.  Each instance pairs one Northbound
 * DNS row with its Southbound counterpart (if any) and collects the
 * Southbound datapaths of the logical switches that reference the row.
 */
struct dns_info {
    struct hmap_node hmap_node;     /* Hashed on the Northbound row's UUID. */
    const struct nbrec_dns *nb_dns; /* DNS record in the Northbound db. */
    const struct sbrec_dns *sb_dns; /* DNS record in the Southbound db. */

    /* Datapaths with which the DNS entry is associated. */
    const struct sbrec_datapath_binding **sbs;
    size_t n_sbs;                   /* Number of elements in 'sbs'. */
};
5924
5925 static inline struct dns_info *
5926 get_dns_info_from_hmap(struct hmap *dns_map, struct uuid *uuid)
5927 {
5928 struct dns_info *dns_info;
5929 size_t hash = uuid_hash(uuid);
5930 HMAP_FOR_EACH_WITH_HASH (dns_info, hmap_node, hash, dns_map) {
5931 if (uuid_equals(&dns_info->nb_dns->header_.uuid, uuid)) {
5932 return dns_info;
5933 }
5934 }
5935
5936 return NULL;
5937 }
5938
5939 static void
5940 sync_dns_entries(struct northd_context *ctx, struct hmap *datapaths)
5941 {
5942 struct hmap dns_map = HMAP_INITIALIZER(&dns_map);
5943 struct ovn_datapath *od;
5944 HMAP_FOR_EACH (od, key_node, datapaths) {
5945 if (!od->nbs || !od->nbs->n_dns_records) {
5946 continue;
5947 }
5948
5949 for (size_t i = 0; i < od->nbs->n_dns_records; i++) {
5950 struct dns_info *dns_info = get_dns_info_from_hmap(
5951 &dns_map, &od->nbs->dns_records[i]->header_.uuid);
5952 if (!dns_info) {
5953 size_t hash = uuid_hash(
5954 &od->nbs->dns_records[i]->header_.uuid);
5955 dns_info = xzalloc(sizeof *dns_info);;
5956 dns_info->nb_dns = od->nbs->dns_records[i];
5957 hmap_insert(&dns_map, &dns_info->hmap_node, hash);
5958 }
5959
5960 dns_info->n_sbs++;
5961 dns_info->sbs = xrealloc(dns_info->sbs,
5962 dns_info->n_sbs * sizeof *dns_info->sbs);
5963 dns_info->sbs[dns_info->n_sbs - 1] = od->sb;
5964 }
5965 }
5966
5967 const struct sbrec_dns *sbrec_dns, *next;
5968 SBREC_DNS_FOR_EACH_SAFE (sbrec_dns, next, ctx->ovnsb_idl) {
5969 const char *nb_dns_uuid = smap_get(&sbrec_dns->external_ids, "dns_id");
5970 struct uuid dns_uuid;
5971 if (!nb_dns_uuid || !uuid_from_string(&dns_uuid, nb_dns_uuid)) {
5972 sbrec_dns_delete(sbrec_dns);
5973 continue;
5974 }
5975
5976 struct dns_info *dns_info =
5977 get_dns_info_from_hmap(&dns_map, &dns_uuid);
5978 if (dns_info) {
5979 dns_info->sb_dns = sbrec_dns;
5980 } else {
5981 sbrec_dns_delete(sbrec_dns);
5982 }
5983 }
5984
5985 struct dns_info *dns_info;
5986 HMAP_FOR_EACH_POP (dns_info, hmap_node, &dns_map) {
5987 if (!dns_info->sb_dns) {
5988 sbrec_dns = sbrec_dns_insert(ctx->ovnsb_txn);
5989 dns_info->sb_dns = sbrec_dns;
5990 char *dns_id = xasprintf(
5991 UUID_FMT, UUID_ARGS(&dns_info->nb_dns->header_.uuid));
5992 const struct smap external_ids =
5993 SMAP_CONST1(&external_ids, "dns_id", dns_id);
5994 sbrec_dns_set_external_ids(sbrec_dns, &external_ids);
5995 free(dns_id);
5996 }
5997
5998 /* Set the datapaths and records. If nothing has changed, then
5999 * this will be a no-op.
6000 */
6001 sbrec_dns_set_datapaths(
6002 dns_info->sb_dns,
6003 (struct sbrec_datapath_binding **)dns_info->sbs,
6004 dns_info->n_sbs);
6005 sbrec_dns_set_records(dns_info->sb_dns, &dns_info->nb_dns->records);
6006 free(dns_info->sbs);
6007 free(dns_info);
6008 }
6009 hmap_destroy(&dns_map);
6010 }
6011
6012 \f
6013 static void
6014 ovnnb_db_run(struct northd_context *ctx, struct chassis_index *chassis_index,
6015 struct ovsdb_idl_loop *sb_loop)
6016 {
6017 if (!ctx->ovnsb_txn || !ctx->ovnnb_txn) {
6018 return;
6019 }
6020 struct hmap datapaths, ports;
6021 build_datapaths(ctx, &datapaths);
6022 build_ports(ctx, &datapaths, chassis_index, &ports);
6023 build_ipam(&datapaths, &ports);
6024 build_lflows(ctx, &datapaths, &ports);
6025
6026 sync_address_sets(ctx);
6027 sync_dns_entries(ctx, &datapaths);
6028
6029 struct ovn_datapath *dp, *next_dp;
6030 HMAP_FOR_EACH_SAFE (dp, next_dp, key_node, &datapaths) {
6031 ovn_datapath_destroy(&datapaths, dp);
6032 }
6033 hmap_destroy(&datapaths);
6034
6035 struct ovn_port *port, *next_port;
6036 HMAP_FOR_EACH_SAFE (port, next_port, key_node, &ports) {
6037 ovn_port_destroy(&ports, port);
6038 }
6039 hmap_destroy(&ports);
6040
6041 /* Copy nb_cfg from northbound to southbound database.
6042 *
6043 * Also set up to update sb_cfg once our southbound transaction commits. */
6044 const struct nbrec_nb_global *nb = nbrec_nb_global_first(ctx->ovnnb_idl);
6045 if (!nb) {
6046 nb = nbrec_nb_global_insert(ctx->ovnnb_txn);
6047 }
6048 const struct sbrec_sb_global *sb = sbrec_sb_global_first(ctx->ovnsb_idl);
6049 if (!sb) {
6050 sb = sbrec_sb_global_insert(ctx->ovnsb_txn);
6051 }
6052 sbrec_sb_global_set_nb_cfg(sb, nb->nb_cfg);
6053 sb_loop->next_cfg = nb->nb_cfg;
6054
6055 cleanup_macam(&macam);
6056 }
6057
6058 /* Handle changes to the 'chassis' column of the 'Port_Binding' table. When
6059 * this column is not empty, it means we need to set the corresponding logical
6060 * port as 'up' in the northbound DB. */
6061 static void
6062 update_logical_port_status(struct northd_context *ctx)
6063 {
6064 struct hmap lports_hmap;
6065 const struct sbrec_port_binding *sb;
6066 const struct nbrec_logical_switch_port *nbsp;
6067
6068 struct lport_hash_node {
6069 struct hmap_node node;
6070 const struct nbrec_logical_switch_port *nbsp;
6071 } *hash_node;
6072
6073 hmap_init(&lports_hmap);
6074
6075 NBREC_LOGICAL_SWITCH_PORT_FOR_EACH(nbsp, ctx->ovnnb_idl) {
6076 hash_node = xzalloc(sizeof *hash_node);
6077 hash_node->nbsp = nbsp;
6078 hmap_insert(&lports_hmap, &hash_node->node, hash_string(nbsp->name, 0));
6079 }
6080
6081 SBREC_PORT_BINDING_FOR_EACH(sb, ctx->ovnsb_idl) {
6082 nbsp = NULL;
6083 HMAP_FOR_EACH_WITH_HASH(hash_node, node,
6084 hash_string(sb->logical_port, 0),
6085 &lports_hmap) {
6086 if (!strcmp(sb->logical_port, hash_node->nbsp->name)) {
6087 nbsp = hash_node->nbsp;
6088 break;
6089 }
6090 }
6091
6092 if (!nbsp) {
6093 /* The logical port doesn't exist for this port binding. This can
6094 * happen under normal circumstances when ovn-northd hasn't gotten
6095 * around to pruning the Port_Binding yet. */
6096 continue;
6097 }
6098
6099 if (sb->chassis && (!nbsp->up || !*nbsp->up)) {
6100 bool up = true;
6101 nbrec_logical_switch_port_set_up(nbsp, &up, 1);
6102 } else if (!sb->chassis && (!nbsp->up || *nbsp->up)) {
6103 bool up = false;
6104 nbrec_logical_switch_port_set_up(nbsp, &up, 1);
6105 }
6106 }
6107
6108 HMAP_FOR_EACH_POP(hash_node, node, &lports_hmap) {
6109 free(hash_node);
6110 }
6111 hmap_destroy(&lports_hmap);
6112 }
6113
/* DHCPv4 options that check_and_add_supported_dhcp_opts_to_sb_db() keeps in
 * the Southbound DHCP_Options table.
 *
 * The array is deliberately not 'const': that function links each element
 * into a temporary hmap through the element's embedded 'hmap_node'. */
static struct gen_opts_map supported_dhcp_opts[] = {
    OFFERIP,
    DHCP_OPT_NETMASK,
    DHCP_OPT_ROUTER,
    DHCP_OPT_DNS_SERVER,
    DHCP_OPT_LOG_SERVER,
    DHCP_OPT_LPR_SERVER,
    DHCP_OPT_SWAP_SERVER,
    DHCP_OPT_POLICY_FILTER,
    DHCP_OPT_ROUTER_SOLICITATION,
    DHCP_OPT_NIS_SERVER,
    DHCP_OPT_NTP_SERVER,
    DHCP_OPT_SERVER_ID,
    DHCP_OPT_TFTP_SERVER,
    DHCP_OPT_CLASSLESS_STATIC_ROUTE,
    DHCP_OPT_MS_CLASSLESS_STATIC_ROUTE,
    DHCP_OPT_IP_FORWARD_ENABLE,
    DHCP_OPT_ROUTER_DISCOVERY,
    DHCP_OPT_ETHERNET_ENCAP,
    DHCP_OPT_DEFAULT_TTL,
    DHCP_OPT_TCP_TTL,
    DHCP_OPT_MTU,
    DHCP_OPT_LEASE_TIME,
    DHCP_OPT_T1,
    DHCP_OPT_T2
};
6140
/* DHCPv6 options that check_and_add_supported_dhcpv6_opts_to_sb_db() keeps
 * in the Southbound DHCPv6_Options table.
 *
 * The array is deliberately not 'const': that function links each element
 * into a temporary hmap through the element's embedded 'hmap_node'. */
static struct gen_opts_map supported_dhcpv6_opts[] = {
    DHCPV6_OPT_IA_ADDR,
    DHCPV6_OPT_SERVER_ID,
    DHCPV6_OPT_DOMAIN_SEARCH,
    DHCPV6_OPT_DNS_SERVER
};
6147
6148 static void
6149 check_and_add_supported_dhcp_opts_to_sb_db(struct northd_context *ctx)
6150 {
6151 struct hmap dhcp_opts_to_add = HMAP_INITIALIZER(&dhcp_opts_to_add);
6152 for (size_t i = 0; (i < sizeof(supported_dhcp_opts) /
6153 sizeof(supported_dhcp_opts[0])); i++) {
6154 hmap_insert(&dhcp_opts_to_add, &supported_dhcp_opts[i].hmap_node,
6155 dhcp_opt_hash(supported_dhcp_opts[i].name));
6156 }
6157
6158 const struct sbrec_dhcp_options *opt_row, *opt_row_next;
6159 SBREC_DHCP_OPTIONS_FOR_EACH_SAFE(opt_row, opt_row_next, ctx->ovnsb_idl) {
6160 struct gen_opts_map *dhcp_opt =
6161 dhcp_opts_find(&dhcp_opts_to_add, opt_row->name);
6162 if (dhcp_opt) {
6163 hmap_remove(&dhcp_opts_to_add, &dhcp_opt->hmap_node);
6164 } else {
6165 sbrec_dhcp_options_delete(opt_row);
6166 }
6167 }
6168
6169 struct gen_opts_map *opt;
6170 HMAP_FOR_EACH (opt, hmap_node, &dhcp_opts_to_add) {
6171 struct sbrec_dhcp_options *sbrec_dhcp_option =
6172 sbrec_dhcp_options_insert(ctx->ovnsb_txn);
6173 sbrec_dhcp_options_set_name(sbrec_dhcp_option, opt->name);
6174 sbrec_dhcp_options_set_code(sbrec_dhcp_option, opt->code);
6175 sbrec_dhcp_options_set_type(sbrec_dhcp_option, opt->type);
6176 }
6177
6178 hmap_destroy(&dhcp_opts_to_add);
6179 }
6180
6181 static void
6182 check_and_add_supported_dhcpv6_opts_to_sb_db(struct northd_context *ctx)
6183 {
6184 struct hmap dhcpv6_opts_to_add = HMAP_INITIALIZER(&dhcpv6_opts_to_add);
6185 for (size_t i = 0; (i < sizeof(supported_dhcpv6_opts) /
6186 sizeof(supported_dhcpv6_opts[0])); i++) {
6187 hmap_insert(&dhcpv6_opts_to_add, &supported_dhcpv6_opts[i].hmap_node,
6188 dhcp_opt_hash(supported_dhcpv6_opts[i].name));
6189 }
6190
6191 const struct sbrec_dhcpv6_options *opt_row, *opt_row_next;
6192 SBREC_DHCPV6_OPTIONS_FOR_EACH_SAFE(opt_row, opt_row_next, ctx->ovnsb_idl) {
6193 struct gen_opts_map *dhcp_opt =
6194 dhcp_opts_find(&dhcpv6_opts_to_add, opt_row->name);
6195 if (dhcp_opt) {
6196 hmap_remove(&dhcpv6_opts_to_add, &dhcp_opt->hmap_node);
6197 } else {
6198 sbrec_dhcpv6_options_delete(opt_row);
6199 }
6200 }
6201
6202 struct gen_opts_map *opt;
6203 HMAP_FOR_EACH(opt, hmap_node, &dhcpv6_opts_to_add) {
6204 struct sbrec_dhcpv6_options *sbrec_dhcpv6_option =
6205 sbrec_dhcpv6_options_insert(ctx->ovnsb_txn);
6206 sbrec_dhcpv6_options_set_name(sbrec_dhcpv6_option, opt->name);
6207 sbrec_dhcpv6_options_set_code(sbrec_dhcpv6_option, opt->code);
6208 sbrec_dhcpv6_options_set_type(sbrec_dhcpv6_option, opt->type);
6209 }
6210
6211 hmap_destroy(&dhcpv6_opts_to_add);
6212 }
6213
/* Per-table column lists referenced by rbac_perm_cfg[]: for each southbound
 * table, the '*_auth' array supplies the RBAC permission's "authorization"
 * set and the '*_update' array supplies its "update" set. */

/* Chassis table. */
static const char *rbac_chassis_auth[] =
    {"name"};
static const char *rbac_chassis_update[] =
    {"nb_cfg", "external_ids", "encaps", "vtep_logical_switches"};

/* Encap table. */
static const char *rbac_encap_auth[] =
    {"chassis_name"};
static const char *rbac_encap_update[] =
    {"type", "options", "ip"};

/* Port_Binding table.
 * NOTE(review): the single empty authorization string presumably means that
 * no per-row ownership check is applied -- confirm against the OVSDB RBAC
 * documentation. */
static const char *rbac_port_binding_auth[] =
    {""};
static const char *rbac_port_binding_update[] =
    {"chassis"};

/* MAC_Binding table (same empty authorization string as Port_Binding). */
static const char *rbac_mac_binding_auth[] =
    {""};
static const char *rbac_mac_binding_update[] =
    {"logical_port", "ip", "mac", "datapath"};
6233
6234 static struct rbac_perm_cfg {
6235 const char *table;
6236 const char **auth;
6237 int n_auth;
6238 bool insdel;
6239 const char **update;
6240 int n_update;
6241 const struct sbrec_rbac_permission *row;
6242 } rbac_perm_cfg[] = {
6243 {
6244 .table = "Chassis",
6245 .auth = rbac_chassis_auth,
6246 .n_auth = ARRAY_SIZE(rbac_chassis_auth),
6247 .insdel = true,
6248 .update = rbac_chassis_update,
6249 .n_update = ARRAY_SIZE(rbac_chassis_update),
6250 .row = NULL
6251 },{
6252 .table = "Encap",
6253 .auth = rbac_encap_auth,
6254 .n_auth = ARRAY_SIZE(rbac_encap_auth),
6255 .insdel = true,
6256 .update = rbac_encap_update,
6257 .n_update = ARRAY_SIZE(rbac_encap_update),
6258 .row = NULL
6259 },{
6260 .table = "Port_Binding",
6261 .auth = rbac_port_binding_auth,
6262 .n_auth = ARRAY_SIZE(rbac_port_binding_auth),
6263 .insdel = false,
6264 .update = rbac_port_binding_update,
6265 .n_update = ARRAY_SIZE(rbac_port_binding_update),
6266 .row = NULL
6267 },{
6268 .table = "MAC_Binding",
6269 .auth = rbac_mac_binding_auth,
6270 .n_auth = ARRAY_SIZE(rbac_mac_binding_auth),
6271 .insdel = true,
6272 .update = rbac_mac_binding_update,
6273 .n_update = ARRAY_SIZE(rbac_mac_binding_update),
6274 .row = NULL
6275 },{
6276 .table = NULL,
6277 .auth = NULL,
6278 .n_auth = 0,
6279 .insdel = false,
6280 .update = NULL,
6281 .n_update = 0,
6282 .row = NULL
6283 }
6284 };
6285
6286 static bool
6287 ovn_rbac_validate_perm(const struct sbrec_rbac_permission *perm)
6288 {
6289 struct rbac_perm_cfg *pcfg;
6290 int i, j, n_found;
6291
6292 for (pcfg = rbac_perm_cfg; pcfg->table; pcfg++) {
6293 if (!strcmp(perm->table, pcfg->table)) {
6294 break;
6295 }
6296 }
6297 if (!pcfg->table) {
6298 return false;
6299 }
6300 if (perm->n_authorization != pcfg->n_auth ||
6301 perm->n_update != pcfg->n_update) {
6302 return false;
6303 }
6304 if (perm->insert_delete != pcfg->insdel) {
6305 return false;
6306 }
6307 /* verify perm->authorization vs. pcfg->auth */
6308 n_found = 0;
6309 for (i = 0; i < pcfg->n_auth; i++) {
6310 for (j = 0; j < perm->n_authorization; j++) {
6311 if (!strcmp(pcfg->auth[i], perm->authorization[j])) {
6312 n_found++;
6313 break;
6314 }
6315 }
6316 }
6317 if (n_found != pcfg->n_auth) {
6318 return false;
6319 }
6320
6321 /* verify perm->update vs. pcfg->update */
6322 n_found = 0;
6323 for (i = 0; i < pcfg->n_update; i++) {
6324 for (j = 0; j < perm->n_update; j++) {
6325 if (!strcmp(pcfg->update[i], perm->update[j])) {
6326 n_found++;
6327 break;
6328 }
6329 }
6330 }
6331 if (n_found != pcfg->n_update) {
6332 return false;
6333 }
6334
6335 /* Success, db state matches expected state */
6336 pcfg->row = perm;
6337 return true;
6338 }
6339
6340 static void
6341 ovn_rbac_create_perm(struct rbac_perm_cfg *pcfg,
6342 struct northd_context *ctx,
6343 const struct sbrec_rbac_role *rbac_role)
6344 {
6345 struct sbrec_rbac_permission *rbac_perm;
6346
6347 rbac_perm = sbrec_rbac_permission_insert(ctx->ovnsb_txn);
6348 sbrec_rbac_permission_set_table(rbac_perm, pcfg->table);
6349 sbrec_rbac_permission_set_authorization(rbac_perm,
6350 pcfg->auth,
6351 pcfg->n_auth);
6352 sbrec_rbac_permission_set_insert_delete(rbac_perm, pcfg->insdel);
6353 sbrec_rbac_permission_set_update(rbac_perm,
6354 pcfg->update,
6355 pcfg->n_update);
6356 sbrec_rbac_role_update_permissions_setkey(rbac_role, pcfg->table,
6357 rbac_perm);
6358 }
6359
6360 static void
6361 check_and_update_rbac(struct northd_context *ctx)
6362 {
6363 const struct sbrec_rbac_role *rbac_role = NULL;
6364 const struct sbrec_rbac_permission *perm_row, *perm_next;
6365 const struct sbrec_rbac_role *role_row, *role_row_next;
6366 struct rbac_perm_cfg *pcfg;
6367
6368 for (pcfg = rbac_perm_cfg; pcfg->table; pcfg++) {
6369 pcfg->row = NULL;
6370 }
6371
6372 SBREC_RBAC_PERMISSION_FOR_EACH_SAFE (perm_row, perm_next, ctx->ovnsb_idl) {
6373 if (!ovn_rbac_validate_perm(perm_row)) {
6374 sbrec_rbac_permission_delete(perm_row);
6375 }
6376 }
6377 SBREC_RBAC_ROLE_FOR_EACH_SAFE (role_row, role_row_next, ctx->ovnsb_idl) {
6378 if (strcmp(role_row->name, "ovn-controller")) {
6379 sbrec_rbac_role_delete(role_row);
6380 } else {
6381 rbac_role = role_row;
6382 }
6383 }
6384
6385 if (!rbac_role) {
6386 rbac_role = sbrec_rbac_role_insert(ctx->ovnsb_txn);
6387 sbrec_rbac_role_set_name(rbac_role, "ovn-controller");
6388 }
6389
6390 for (pcfg = rbac_perm_cfg; pcfg->table; pcfg++) {
6391 if (!pcfg->row) {
6392 ovn_rbac_create_perm(pcfg, ctx, rbac_role);
6393 }
6394 }
6395 }
6396
6397 /* Updates the sb_cfg and hv_cfg columns in the northbound NB_Global table. */
6398 static void
6399 update_northbound_cfg(struct northd_context *ctx,
6400 struct ovsdb_idl_loop *sb_loop)
6401 {
6402 /* Update northbound sb_cfg if appropriate. */
6403 const struct nbrec_nb_global *nbg = nbrec_nb_global_first(ctx->ovnnb_idl);
6404 int64_t sb_cfg = sb_loop->cur_cfg;
6405 if (nbg && sb_cfg && nbg->sb_cfg != sb_cfg) {
6406 nbrec_nb_global_set_sb_cfg(nbg, sb_cfg);
6407 }
6408
6409 /* Update northbound hv_cfg if appropriate. */
6410 if (nbg) {
6411 /* Find minimum nb_cfg among all chassis. */
6412 const struct sbrec_chassis *chassis;
6413 int64_t hv_cfg = nbg->nb_cfg;
6414 SBREC_CHASSIS_FOR_EACH (chassis, ctx->ovnsb_idl) {
6415 if (chassis->nb_cfg < hv_cfg) {
6416 hv_cfg = chassis->nb_cfg;
6417 }
6418 }
6419
6420 /* Update hv_cfg. */
6421 if (nbg->hv_cfg != hv_cfg) {
6422 nbrec_nb_global_set_hv_cfg(nbg, hv_cfg);
6423 }
6424 }
6425 }
6426
6427 /* Handle a fairly small set of changes in the southbound database. */
6428 static void
6429 ovnsb_db_run(struct northd_context *ctx, struct ovsdb_idl_loop *sb_loop)
6430 {
6431 if (!ctx->ovnnb_txn || !ovsdb_idl_has_ever_connected(ctx->ovnsb_idl)) {
6432 return;
6433 }
6434
6435 update_logical_port_status(ctx);
6436 update_northbound_cfg(ctx, sb_loop);
6437 }
6438 \f
6439 static void
6440 parse_options(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
6441 {
6442 enum {
6443 DAEMON_OPTION_ENUMS,
6444 VLOG_OPTION_ENUMS,
6445 SSL_OPTION_ENUMS,
6446 };
6447 static const struct option long_options[] = {
6448 {"ovnsb-db", required_argument, NULL, 'd'},
6449 {"ovnnb-db", required_argument, NULL, 'D'},
6450 {"help", no_argument, NULL, 'h'},
6451 {"options", no_argument, NULL, 'o'},
6452 {"version", no_argument, NULL, 'V'},
6453 DAEMON_LONG_OPTIONS,
6454 VLOG_LONG_OPTIONS,
6455 STREAM_SSL_LONG_OPTIONS,
6456 {NULL, 0, NULL, 0},
6457 };
6458 char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
6459
6460 for (;;) {
6461 int c;
6462
6463 c = getopt_long(argc, argv, short_options, long_options, NULL);
6464 if (c == -1) {
6465 break;
6466 }
6467
6468 switch (c) {
6469 DAEMON_OPTION_HANDLERS;
6470 VLOG_OPTION_HANDLERS;
6471 STREAM_SSL_OPTION_HANDLERS;
6472
6473 case 'd':
6474 ovnsb_db = optarg;
6475 break;
6476
6477 case 'D':
6478 ovnnb_db = optarg;
6479 break;
6480
6481 case 'h':
6482 usage();
6483 exit(EXIT_SUCCESS);
6484
6485 case 'o':
6486 ovs_cmdl_print_options(long_options);
6487 exit(EXIT_SUCCESS);
6488
6489 case 'V':
6490 ovs_print_version(0, 0);
6491 exit(EXIT_SUCCESS);
6492
6493 default:
6494 break;
6495 }
6496 }
6497
6498 if (!ovnsb_db) {
6499 ovnsb_db = default_sb_db();
6500 }
6501
6502 if (!ovnnb_db) {
6503 ovnnb_db = default_nb_db();
6504 }
6505
6506 free(short_options);
6507 }
6508
/* Registers 'column' with 'idl' through ovsdb_idl_add_column() and then
 * calls ovsdb_idl_omit_alert() on it.
 *
 * NOTE(review): the intent appears to be that the column is replicated
 * without its changes waking up the main loop -- confirm against the
 * ovsdb-idl documentation. */
static void
add_column_noalert(struct ovsdb_idl *idl,
                   const struct ovsdb_idl_column *column)
{
    ovsdb_idl_add_column(idl, column);
    ovsdb_idl_omit_alert(idl, column);
}
6516
6517 int
6518 main(int argc, char *argv[])
6519 {
6520 int res = EXIT_SUCCESS;
6521 struct unixctl_server *unixctl;
6522 int retval;
6523 bool exiting;
6524
6525 fatal_ignore_sigpipe();
6526 ovs_cmdl_proctitle_init(argc, argv);
6527 set_program_name(argv[0]);
6528 service_start(&argc, &argv);
6529 parse_options(argc, argv);
6530
6531 daemonize_start(false);
6532
6533 retval = unixctl_server_create(NULL, &unixctl);
6534 if (retval) {
6535 exit(EXIT_FAILURE);
6536 }
6537 unixctl_command_register("exit", "", 0, 0, ovn_northd_exit, &exiting);
6538
6539 daemonize_complete();
6540
6541 /* We want to detect (almost) all changes to the ovn-nb db. */
6542 struct ovsdb_idl_loop ovnnb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
6543 ovsdb_idl_create(ovnnb_db, &nbrec_idl_class, true, true));
6544 ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_sb_cfg);
6545 ovsdb_idl_omit_alert(ovnnb_idl_loop.idl, &nbrec_nb_global_col_hv_cfg);
6546
6547 /* We want to detect only selected changes to the ovn-sb db. */
6548 struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
6549 ovsdb_idl_create(ovnsb_db, &sbrec_idl_class, false, true));
6550
6551 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_sb_global);
6552 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_nb_cfg);
6553
6554 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_logical_flow);
6555 add_column_noalert(ovnsb_idl_loop.idl,
6556 &sbrec_logical_flow_col_logical_datapath);
6557 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_pipeline);
6558 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_table_id);
6559 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_priority);
6560 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_match);
6561 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_actions);
6562
6563 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_multicast_group);
6564 add_column_noalert(ovnsb_idl_loop.idl,
6565 &sbrec_multicast_group_col_datapath);
6566 add_column_noalert(ovnsb_idl_loop.idl,
6567 &sbrec_multicast_group_col_tunnel_key);
6568 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_name);
6569 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_ports);
6570
6571 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_datapath_binding);
6572 add_column_noalert(ovnsb_idl_loop.idl,
6573 &sbrec_datapath_binding_col_tunnel_key);
6574 add_column_noalert(ovnsb_idl_loop.idl,
6575 &sbrec_datapath_binding_col_external_ids);
6576
6577 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_port_binding);
6578 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_datapath);
6579 add_column_noalert(ovnsb_idl_loop.idl,
6580 &sbrec_port_binding_col_logical_port);
6581 add_column_noalert(ovnsb_idl_loop.idl,
6582 &sbrec_port_binding_col_tunnel_key);
6583 add_column_noalert(ovnsb_idl_loop.idl,
6584 &sbrec_port_binding_col_parent_port);
6585 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_tag);
6586 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_type);
6587 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_options);
6588 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_mac);
6589 add_column_noalert(ovnsb_idl_loop.idl,
6590 &sbrec_port_binding_col_nat_addresses);
6591 ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_port_binding_col_chassis);
6592 ovsdb_idl_add_column(ovnsb_idl_loop.idl,
6593 &sbrec_port_binding_col_gateway_chassis);
6594 ovsdb_idl_add_column(ovnsb_idl_loop.idl,
6595 &sbrec_gateway_chassis_col_chassis);
6596 ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_gateway_chassis_col_name);
6597 ovsdb_idl_add_column(ovnsb_idl_loop.idl,
6598 &sbrec_gateway_chassis_col_priority);
6599 ovsdb_idl_add_column(ovnsb_idl_loop.idl,
6600 &sbrec_gateway_chassis_col_external_ids);
6601 ovsdb_idl_add_column(ovnsb_idl_loop.idl,
6602 &sbrec_gateway_chassis_col_options);
6603 add_column_noalert(ovnsb_idl_loop.idl,
6604 &sbrec_port_binding_col_external_ids);
6605 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_mac_binding);
6606 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_datapath);
6607 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_ip);
6608 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_mac_binding_col_mac);
6609 add_column_noalert(ovnsb_idl_loop.idl,
6610 &sbrec_mac_binding_col_logical_port);
6611 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_dhcp_options);
6612 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_code);
6613 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_type);
6614 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcp_options_col_name);
6615 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_dhcpv6_options);
6616 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_code);
6617 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_type);
6618 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dhcpv6_options_col_name);
6619 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_address_set);
6620 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_name);
6621 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_addresses);
6622
6623 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_dns);
6624 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dns_col_datapaths);
6625 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dns_col_records);
6626 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_dns_col_external_ids);
6627
6628 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_rbac_role);
6629 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_rbac_role_col_name);
6630 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_rbac_role_col_permissions);
6631
6632 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_rbac_permission);
6633 add_column_noalert(ovnsb_idl_loop.idl,
6634 &sbrec_rbac_permission_col_table);
6635 add_column_noalert(ovnsb_idl_loop.idl,
6636 &sbrec_rbac_permission_col_authorization);
6637 add_column_noalert(ovnsb_idl_loop.idl,
6638 &sbrec_rbac_permission_col_insert_delete);
6639 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_rbac_permission_col_update);
6640
6641 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_chassis);
6642 ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
6643 ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_name);
6644
6645 /* Ensure that only a single ovn-northd is active in the deployment by
6646 * acquiring a lock called "ovn_northd" on the southbound database
6647 * and then only performing DB transactions if the lock is held. */
6648 ovsdb_idl_set_lock(ovnsb_idl_loop.idl, "ovn_northd");
6649 bool had_lock = false;
6650
6651 /* Main loop. */
6652 exiting = false;
6653 while (!exiting) {
6654 struct northd_context ctx = {
6655 .ovnnb_idl = ovnnb_idl_loop.idl,
6656 .ovnnb_txn = ovsdb_idl_loop_run(&ovnnb_idl_loop),
6657 .ovnsb_idl = ovnsb_idl_loop.idl,
6658 .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
6659 };
6660
6661 if (!had_lock && ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
6662 VLOG_INFO("ovn-northd lock acquired. "
6663 "This ovn-northd instance is now active.");
6664 had_lock = true;
6665 } else if (had_lock && !ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
6666 VLOG_INFO("ovn-northd lock lost. "
6667 "This ovn-northd instance is now on standby.");
6668 had_lock = false;
6669 }
6670
6671 struct chassis_index chassis_index;
6672 bool destroy_chassis_index = false;
6673 if (ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
6674 chassis_index_init(&chassis_index, ctx.ovnsb_idl);
6675 destroy_chassis_index = true;
6676
6677 ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
6678 ovnsb_db_run(&ctx, &ovnsb_idl_loop);
6679 if (ctx.ovnsb_txn) {
6680 check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
6681 check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
6682 check_and_update_rbac(&ctx);
6683 }
6684 }
6685
6686 unixctl_server_run(unixctl);
6687 unixctl_server_wait(unixctl);
6688 if (exiting) {
6689 poll_immediate_wake();
6690 }
6691 ovsdb_idl_loop_commit_and_wait(&ovnnb_idl_loop);
6692 ovsdb_idl_loop_commit_and_wait(&ovnsb_idl_loop);
6693
6694 poll_block();
6695 if (should_service_stop()) {
6696 exiting = true;
6697 }
6698
6699 if (destroy_chassis_index) {
6700 chassis_index_destroy(&chassis_index);
6701 }
6702 }
6703
6704 unixctl_server_destroy(unixctl);
6705 ovsdb_idl_loop_destroy(&ovnnb_idl_loop);
6706 ovsdb_idl_loop_destroy(&ovnsb_idl_loop);
6707 service_stop();
6708
6709 exit(res);
6710 }
6711
6712 static void
6713 ovn_northd_exit(struct unixctl_conn *conn, int argc OVS_UNUSED,
6714 const char *argv[] OVS_UNUSED, void *exiting_)
6715 {
6716 bool *exiting = exiting_;
6717 *exiting = true;
6718
6719 unixctl_command_reply(conn, NULL);
6720 }