]> git.proxmox.com Git - mirror_frr.git/blob - zebra/kernel_netlink.c
Merge pull request #11272 from AbhishekNR/flag_removal
[mirror_frr.git] / zebra / kernel_netlink.c
1 /* Kernel communication using netlink interface.
2 * Copyright (C) 1999 Kunihiro Ishiguro
3 *
4 * This file is part of GNU Zebra.
5 *
6 * GNU Zebra is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * GNU Zebra is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <zebra.h>
22
23 #ifdef HAVE_NETLINK
24
25 #include "linklist.h"
26 #include "if.h"
27 #include "log.h"
28 #include "prefix.h"
29 #include "connected.h"
30 #include "table.h"
31 #include "memory.h"
32 #include "rib.h"
33 #include "thread.h"
34 #include "privs.h"
35 #include "nexthop.h"
36 #include "vrf.h"
37 #include "mpls.h"
38 #include "lib_errors.h"
39 #include "hash.h"
40
41 #include "zebra/zebra_router.h"
42 #include "zebra/zebra_ns.h"
43 #include "zebra/zebra_vrf.h"
44 #include "zebra/rt.h"
45 #include "zebra/debug.h"
46 #include "zebra/kernel_netlink.h"
47 #include "zebra/rt_netlink.h"
48 #include "zebra/if_netlink.h"
49 #include "zebra/rule_netlink.h"
50 #include "zebra/netconf_netlink.h"
51 #include "zebra/zebra_errors.h"
52
/* Fallback definition for system headers that do not provide SO_RCVBUFFORCE. */
#ifndef SO_RCVBUFFORCE
#define SO_RCVBUFFORCE (33)
#endif

/* Hack for GNU libc version 2. */
#ifndef MSG_TRUNC
#define MSG_TRUNC 0x20
#endif /* MSG_TRUNC */

/*
 * Pointer (typed as struct rtattr *) to the first aligned byte past the
 * current end of netlink message 'nmsg', i.e. where the next attribute goes.
 */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg)                                                       \
	((struct rtattr *)(((uint8_t *)(nmsg))                                 \
			   + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif

/*
 * Pointer to the first aligned byte past the current end of attribute 'rta',
 * i.e. where the next nested sub-attribute goes.
 */
#ifndef RTA_TAIL
#define RTA_TAIL(rta)                                                          \
	((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
#endif

/* Fallback value used when the system headers do not define it. */
#ifndef RTNL_FAMILY_IP6MR
#define RTNL_FAMILY_IP6MR 129
#endif

/* Fallback value used when the system headers do not define it. */
#ifndef RTPROT_MROUTED
#define RTPROT_MROUTED 17
#endif
80
/* Default value of nl_batch_bufsize (the configurable batch buffer size). */
#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)

/*
 * We limit the batch's size to a number smaller than the length of the
 * underlying buffer since the last message that wouldn't fit the batch would go
 * over the upper boundary and then it would have to be encoded again into a new
 * buffer. If the difference between the limit and the length of the buffer is
 * big enough (bigger than the biggest Netlink message) then this situation
 * won't occur.
 *
 * This is the default value of nl_batch_send_threshold.
 */
#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
92
/*
 * Netlink message type -> printable name; consulted by nl_msg_type_to_str().
 * The table is terminated by a zeroed entry.
 */
static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
					   {RTM_DELROUTE, "RTM_DELROUTE"},
					   {RTM_GETROUTE, "RTM_GETROUTE"},
					   {RTM_NEWLINK, "RTM_NEWLINK"},
					   {RTM_SETLINK, "RTM_SETLINK"},
					   {RTM_DELLINK, "RTM_DELLINK"},
					   {RTM_GETLINK, "RTM_GETLINK"},
					   {RTM_NEWADDR, "RTM_NEWADDR"},
					   {RTM_DELADDR, "RTM_DELADDR"},
					   {RTM_GETADDR, "RTM_GETADDR"},
					   {RTM_NEWNEIGH, "RTM_NEWNEIGH"},
					   {RTM_DELNEIGH, "RTM_DELNEIGH"},
					   {RTM_GETNEIGH, "RTM_GETNEIGH"},
					   {RTM_NEWRULE, "RTM_NEWRULE"},
					   {RTM_DELRULE, "RTM_DELRULE"},
					   {RTM_GETRULE, "RTM_GETRULE"},
					   {RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP"},
					   {RTM_DELNEXTHOP, "RTM_DELNEXTHOP"},
					   {RTM_GETNEXTHOP, "RTM_GETNEXTHOP"},
					   {RTM_NEWNETCONF, "RTM_NEWNETCONF"},
					   {RTM_DELNETCONF, "RTM_DELNETCONF"},
					   {0}};
115
/*
 * Route protocol (RTPROT_*) -> printable name; consulted by
 * nl_rtproto_to_str(). Note that both RTPROT_STATIC and RTPROT_ZSTATIC map
 * to "static". The table is terminated by a zeroed entry.
 */
static const struct message rtproto_str[] = {
	{RTPROT_REDIRECT, "redirect"},
	{RTPROT_KERNEL, "kernel"},
	{RTPROT_BOOT, "boot"},
	{RTPROT_STATIC, "static"},
	{RTPROT_GATED, "GateD"},
	{RTPROT_RA, "router advertisement"},
	{RTPROT_MRT, "MRT"},
	{RTPROT_ZEBRA, "Zebra"},
#ifdef RTPROT_BIRD
	{RTPROT_BIRD, "BIRD"},
#endif /* RTPROT_BIRD */
	{RTPROT_MROUTED, "mroute"},
	{RTPROT_BGP, "BGP"},
	{RTPROT_OSPF, "OSPF"},
	{RTPROT_ISIS, "IS-IS"},
	{RTPROT_RIP, "RIP"},
	{RTPROT_RIPNG, "RIPNG"},
	{RTPROT_ZSTATIC, "static"},
	{0}};
136
/*
 * Address family -> printable name; consulted by nl_family_to_str().
 * The table is terminated by a zeroed entry.
 */
static const struct message family_str[] = {{AF_INET, "ipv4"},
					    {AF_INET6, "ipv6"},
					    {AF_BRIDGE, "bridge"},
					    {RTNL_FAMILY_IPMR, "ipv4MR"},
					    {RTNL_FAMILY_IP6MR, "ipv6MR"},
					    {0}};
143
/*
 * Route type (RTN_*) -> printable name; consulted by nl_rttype_to_str().
 * The table is terminated by a zeroed entry.
 */
static const struct message rttype_str[] = {{RTN_UNSPEC, "none"},
					    {RTN_UNICAST, "unicast"},
					    {RTN_LOCAL, "local"},
					    {RTN_BROADCAST, "broadcast"},
					    {RTN_ANYCAST, "anycast"},
					    {RTN_MULTICAST, "multicast"},
					    {RTN_BLACKHOLE, "blackhole"},
					    {RTN_UNREACHABLE, "unreachable"},
					    {RTN_PROHIBIT, "prohibited"},
					    {RTN_THROW, "throw"},
					    {RTN_NAT, "nat"},
					    {RTN_XRESOLVE, "resolver"},
					    {0}};
157
/* Defined elsewhere in the project. */
extern struct thread_master *master;

/* Privilege context used with frr_with_privs() around socket operations. */
extern struct zebra_privs_t zserv_privs;

/* Memory type used for the per-socket netlink receive buffers (nl->buf). */
DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers");

/* Hashtable and mutex to allow lookup of nlsock structs by socket/fd value.
 * We have both the main and dplane pthreads using these structs, so we have
 * to protect the hash with a lock.
 */
static struct hash *nlsock_hash;
pthread_mutex_t nlsock_mutex;

/* Lock and unlock wrappers for nlsock hash */
#define NLSOCK_LOCK() pthread_mutex_lock(&nlsock_mutex)
#define NLSOCK_UNLOCK() pthread_mutex_unlock(&nlsock_mutex)

/*
 * NOTE(review): presumably the batch transmit buffer and its allocated size;
 * their users are outside the part of the file visible here -- confirm.
 */
size_t nl_batch_tx_bufsize;
char *nl_batch_tx_buf;

/*
 * Configurable batch buffer size and send threshold. They are read and
 * written with relaxed atomic operations by netlink_config_write_helper()
 * and netlink_set_batch_buffer_size().
 */
_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
180
/*
 * State for a batch of netlink messages.
 *
 * NOTE(review): the code that fills and sends a batch is outside the part of
 * the file visible here; the per-field notes below are inferred from the
 * field names and should be confirmed against that code.
 */
struct nl_batch {
	/* presumably the backing buffer, its size and the send limit */
	void *buf;
	size_t bufsiz;
	size_t limit;

	/* presumably the current write position, bytes used, messages queued */
	void *buf_head;
	size_t curlen;
	size_t msgcnt;

	/* dataplane/namespace info associated with the batch */
	const struct zebra_dplane_info *zns;

	/* queue of dataplane contexts belonging to this batch */
	struct dplane_ctx_q ctx_list;

	/*
	 * Pointer to the queue of completed contexts outbound back
	 * towards the dataplane module.
	 */
	struct dplane_ctx_q *ctx_out_q;
};
200
201 int netlink_config_write_helper(struct vty *vty)
202 {
203 uint32_t size =
204 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
205 uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
206 memory_order_relaxed);
207
208 if (size != NL_DEFAULT_BATCH_BUFSIZE
209 || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
210 vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
211 threshold);
212
213 if (if_netlink_frr_protodown_r_bit_is_set())
214 vty_out(vty, "zebra protodown reason-bit %u\n",
215 if_netlink_get_frr_protodown_r_bit());
216
217 return 0;
218 }
219
220 void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
221 {
222 if (!set) {
223 size = NL_DEFAULT_BATCH_BUFSIZE;
224 threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
225 }
226
227 atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
228 atomic_store_explicit(&nl_batch_send_threshold, threshold,
229 memory_order_relaxed);
230 }
231
232 int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
233 {
234 /*
235 * This is an error condition that must be handled during
236 * development.
237 *
238 * The netlink_talk_filter function is used for communication
239 * down the netlink_cmd pipe and we are expecting
240 * an ack being received. So if we get here
241 * then we did not receive the ack and instead
242 * received some other message in an unexpected
243 * way.
244 */
245 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", __func__,
246 h->nlmsg_type, nl_msg_type_to_str(h->nlmsg_type), ns_id);
247 return 0;
248 }
249
250 static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
251 {
252 uint32_t oldsize;
253 socklen_t newlen = sizeof(newsize);
254 socklen_t oldlen = sizeof(oldsize);
255 int ret;
256
257 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &oldsize, &oldlen);
258 if (ret < 0) {
259 flog_err_sys(EC_LIB_SOCKET,
260 "Can't get %s receive buffer size: %s", nl->name,
261 safe_strerror(errno));
262 return -1;
263 }
264
265 /* Try force option (linux >= 2.6.14) and fall back to normal set */
266 frr_with_privs(&zserv_privs) {
267 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUFFORCE,
268 &rcvbufsize, sizeof(rcvbufsize));
269 }
270 if (ret < 0)
271 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &rcvbufsize,
272 sizeof(rcvbufsize));
273 if (ret < 0) {
274 flog_err_sys(EC_LIB_SOCKET,
275 "Can't set %s receive buffer size: %s", nl->name,
276 safe_strerror(errno));
277 return -1;
278 }
279
280 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &newsize, &newlen);
281 if (ret < 0) {
282 flog_err_sys(EC_LIB_SOCKET,
283 "Can't get %s receive buffer size: %s", nl->name,
284 safe_strerror(errno));
285 return -1;
286 }
287 return 0;
288 }
289
290 /* Make socket for Linux netlink interface. */
291 static int netlink_socket(struct nlsock *nl, unsigned long groups,
292 unsigned long ext_groups, ns_id_t ns_id)
293 {
294 int ret;
295 struct sockaddr_nl snl;
296 int sock;
297 int namelen;
298
299 frr_with_privs(&zserv_privs) {
300 sock = ns_socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, ns_id);
301 if (sock < 0) {
302 zlog_err("Can't open %s socket: %s", nl->name,
303 safe_strerror(errno));
304 return -1;
305 }
306
307 memset(&snl, 0, sizeof(snl));
308 snl.nl_family = AF_NETLINK;
309 snl.nl_groups = groups;
310
311 #if defined SOL_NETLINK
312 if (ext_groups) {
313 ret = setsockopt(sock, SOL_NETLINK,
314 NETLINK_ADD_MEMBERSHIP, &ext_groups,
315 sizeof(ext_groups));
316 if (ret < 0) {
317 zlog_notice(
318 "can't setsockopt NETLINK_ADD_MEMBERSHIP: %s(%d)",
319 safe_strerror(errno), errno);
320 }
321 }
322 #endif
323
324 /* Bind the socket to the netlink structure for anything. */
325 ret = bind(sock, (struct sockaddr *)&snl, sizeof(snl));
326 }
327
328 if (ret < 0) {
329 zlog_err("Can't bind %s socket to group 0x%x: %s", nl->name,
330 snl.nl_groups, safe_strerror(errno));
331 close(sock);
332 return -1;
333 }
334
335 /* multiple netlink sockets will have different nl_pid */
336 namelen = sizeof(snl);
337 ret = getsockname(sock, (struct sockaddr *)&snl, (socklen_t *)&namelen);
338 if (ret < 0 || namelen != sizeof(snl)) {
339 flog_err_sys(EC_LIB_SOCKET, "Can't get %s socket name: %s",
340 nl->name, safe_strerror(errno));
341 close(sock);
342 return -1;
343 }
344
345 nl->snl = snl;
346 nl->sock = sock;
347 nl->buflen = NL_RCV_PKT_BUF_SIZE;
348 nl->buf = XMALLOC(MTYPE_NL_BUF, nl->buflen);
349
350 return ret;
351 }
352
353 /*
354 * Dispatch an incoming netlink message; used by the zebra main pthread's
355 * netlink event reader.
356 */
357 static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
358 int startup)
359 {
360 /*
361 * When we handle new message types here
362 * because we are starting to install them
363 * then lets check the netlink_install_filter
364 * and see if we should add the corresponding
365 * allow through entry there.
366 * Probably not needed to do but please
367 * think about it.
368 */
369 switch (h->nlmsg_type) {
370 case RTM_NEWROUTE:
371 return netlink_route_change(h, ns_id, startup);
372 case RTM_DELROUTE:
373 return netlink_route_change(h, ns_id, startup);
374 case RTM_NEWLINK:
375 return netlink_link_change(h, ns_id, startup);
376 case RTM_DELLINK:
377 return netlink_link_change(h, ns_id, startup);
378 case RTM_NEWNEIGH:
379 case RTM_DELNEIGH:
380 case RTM_GETNEIGH:
381 return netlink_neigh_change(h, ns_id);
382 case RTM_NEWRULE:
383 return netlink_rule_change(h, ns_id, startup);
384 case RTM_DELRULE:
385 return netlink_rule_change(h, ns_id, startup);
386 case RTM_NEWNEXTHOP:
387 return netlink_nexthop_change(h, ns_id, startup);
388 case RTM_DELNEXTHOP:
389 return netlink_nexthop_change(h, ns_id, startup);
390
391 /* Messages handled in the dplane thread */
392 case RTM_NEWADDR:
393 case RTM_DELADDR:
394 case RTM_NEWNETCONF:
395 case RTM_DELNETCONF:
396 return 0;
397
398 default:
399 /*
400 * If we have received this message then
401 * we have made a mistake during development
402 * and we need to write some code to handle
403 * this message type or not ask for
404 * it to be sent up to us
405 */
406 flog_err(EC_ZEBRA_UNKNOWN_NLMSG,
407 "Unknown netlink nlmsg_type %s(%d) vrf %u",
408 nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
409 ns_id);
410 break;
411 }
412 return 0;
413 }
414
415 /*
416 * Dispatch an incoming netlink message; used by the dataplane pthread's
417 * netlink event reader code.
418 */
419 static int dplane_netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
420 int startup)
421 {
422 /*
423 * Dispatch the incoming messages that the dplane pthread handles
424 */
425 switch (h->nlmsg_type) {
426 case RTM_NEWADDR:
427 case RTM_DELADDR:
428 return netlink_interface_addr_dplane(h, ns_id, startup);
429
430 case RTM_NEWNETCONF:
431 case RTM_DELNETCONF:
432 return netlink_netconf_change(h, ns_id, startup);
433
434 /* TODO -- other messages for the dplane socket and pthread */
435
436 case RTM_NEWLINK:
437 case RTM_DELLINK:
438
439 default:
440 break;
441 }
442
443 return 0;
444 }
445
446 static void kernel_read(struct thread *thread)
447 {
448 struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
449 struct zebra_dplane_info dp_info;
450
451 /* Capture key info from ns struct */
452 zebra_dplane_info_from_zns(&dp_info, zns, false);
453
454 netlink_parse_info(netlink_information_fetch, &zns->netlink, &dp_info,
455 5, false);
456
457 thread_add_read(zrouter.master, kernel_read, zns, zns->netlink.sock,
458 &zns->t_netlink);
459 }
460
461 /*
462 * Called by the dplane pthread to read incoming OS messages and dispatch them.
463 */
464 int kernel_dplane_read(struct zebra_dplane_info *info)
465 {
466 struct nlsock *nl = kernel_netlink_nlsock_lookup(info->sock);
467
468 netlink_parse_info(dplane_netlink_information_fetch, nl, info, 5,
469 false);
470
471 return 0;
472 }
473
/*
 * Filter out messages from self that occur on listener socket,
 * caused by our actions on the command socket(s)
 *
 * When we add new Netlink message types we probably
 * do not need to add them here as that we are filtering
 * on the routes we actually care to receive( which is rarer
 * than the normal course of operations). We are intentionally
 * allowing some messages from ourselves through
 * ( I'm looking at you Interface based netlink messages )
 * so that we only have to write one way to handle incoming
 * address add/delete and xxxNETCONF changes.
 *
 * sock       -> socket the filter is attached to
 * pid        -> netlink pid compared first against nlmsg_pid
 * dplane_pid -> second netlink pid compared against nlmsg_pid
 *
 * A failure to attach the filter is logged; nothing is returned.
 */
static void netlink_install_filter(int sock, uint32_t pid, uint32_t dplane_pid)
{
	/*
	 * BPF_JUMP instructions and where you jump to are based upon
	 * 0 as being the next statement.  So count from 0.  Writing
	 * this down because every time I look at this I have to
	 * re-remember it.
	 */
	struct sock_filter filter[] = {
		/*
		 * Logic:
		 *   if (nlmsg_pid == pid ||
		 *       nlmsg_pid == dplane_pid) {
		 *       if (the incoming nlmsg_type ==
		 *           RTM_NEWADDR || RTM_DELADDR || RTM_NEWNETCONF ||
		 *           RTM_DELNETCONF)
		 *           keep this message
		 *       else
		 *           skip this message
		 *   } else
		 *       keep this netlink message
		 */
		/*
		 * 0: Load the nlmsg_pid into the BPF register
		 */
		BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
			 offsetof(struct nlmsghdr, nlmsg_pid)),
		/*
		 * 1: Compare to pid
		 *    (match: skip one instruction, landing on 3)
		 */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
		/*
		 * 2: Compare to dplane pid
		 *    (match: continue at 3; no match: skip six, landing on 9)
		 */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 6),
		/*
		 * 3: Load the nlmsg_type into BPF register
		 */
		BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
			 offsetof(struct nlmsghdr, nlmsg_type)),
		/*
		 * 4: Compare to RTM_NEWADDR (match: jump to 9)
		 */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 4, 0),
		/*
		 * 5: Compare to RTM_DELADDR (match: jump to 9)
		 */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 3, 0),
		/*
		 * 6: Compare to RTM_NEWNETCONF (match: jump to 9)
		 */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWNETCONF), 2,
			 0),
		/*
		 * 7: Compare to RTM_DELNETCONF (match: jump to 9)
		 */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELNETCONF), 1,
			 0),
		/*
		 * 8: This is the end state if we want to skip the
		 *    message
		 */
		BPF_STMT(BPF_RET | BPF_K, 0),
		/* 9: This is the end state if we want to keep
		 *     the message
		 */
		BPF_STMT(BPF_RET | BPF_K, 0xffff),
	};

	struct sock_fprog prog = {
		.len = array_size(filter), .filter = filter,
	};

	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))
	    < 0)
		flog_err_sys(EC_LIB_SOCKET, "Can't install socket filter: %s",
			     safe_strerror(errno));
}
565
/*
 * Index a run of route attributes by type, masking flag bits out of the type.
 *
 * tb    -> array of max + 1 pointers; cleared first, then filled
 * max   -> highest attribute type that is stored
 * rta   -> first attribute of the run
 * len   -> length in bytes of the run
 * flags -> bits cleared from rta_type before it is used as the index
 *
 * Unlike netlink_parse_rtattr(), the FIRST attribute of a given type wins;
 * later duplicates are skipped.
 */
void netlink_parse_rtattr_flags(struct rtattr **tb, int max, struct rtattr *rta,
				int len, unsigned short flags)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		unsigned short type = rta->rta_type & ~flags;

		/* Out of range, or this slot has already been claimed. */
		if (type > max || tb[type])
			continue;

		tb[type] = rta;
	}
}
579
/*
 * Index a run of route attributes by type.
 *
 * tb  -> array of max + 1 pointers; cleared first, then filled
 * max -> highest attribute type that is stored
 * rta -> first attribute of the run
 * len -> length in bytes of the run
 *
 * Attributes whose type exceeds 'max' are skipped. When a type occurs more
 * than once, the LAST occurrence is the one left in tb.
 */
void netlink_parse_rtattr(struct rtattr **tb, int max, struct rtattr *rta,
			  int len)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		if (rta->rta_type > max)
			continue;

		tb[rta->rta_type] = rta;
	}
}
590
/**
 * netlink_parse_rtattr_nested() - Parses a nested route attribute
 * @tb:  Pointer to array for storing rtattr in.
 * @max: Max number to store.
 * @rta: Pointer to rtattr to look for nested items in.
 *
 * The payload of @rta is treated as a run of attributes and indexed with
 * netlink_parse_rtattr().
 */
void netlink_parse_rtattr_nested(struct rtattr **tb, int max,
				 struct rtattr *rta)
{
	struct rtattr *inner = RTA_DATA(rta);
	int inner_len = RTA_PAYLOAD(rta);

	netlink_parse_rtattr(tb, max, inner, inner_len);
}
602
603 bool nl_addraw_l(struct nlmsghdr *n, unsigned int maxlen, const void *data,
604 unsigned int len)
605 {
606 if (NLMSG_ALIGN(n->nlmsg_len) + NLMSG_ALIGN(len) > maxlen) {
607 zlog_err("ERROR message exceeded bound of %d", maxlen);
608 return false;
609 }
610
611 memcpy(NLMSG_TAIL(n), data, len);
612 memset((uint8_t *)NLMSG_TAIL(n) + len, 0, NLMSG_ALIGN(len) - len);
613 n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + NLMSG_ALIGN(len);
614
615 return true;
616 }
617
/*
 * Append one attribute to the end of netlink message 'n'.
 *
 * n      -> message being built; nlmsg_len is advanced on success
 * maxlen -> size of the buffer holding 'n'
 * type   -> attribute type
 * data   -> attribute payload; may be NULL only when 'alen' is 0
 * alen   -> payload length in bytes
 *
 * Returns true on success, false when the aligned attribute would not fit
 * within 'maxlen' (the message is left untouched).
 */
bool nl_attr_put(struct nlmsghdr *n, unsigned int maxlen, int type,
		 const void *data, unsigned int alen)
{
	const int attr_len = RTA_LENGTH(alen);
	const unsigned int offset = NLMSG_ALIGN(n->nlmsg_len);
	struct rtattr *attr;

	if (offset + RTA_ALIGN(attr_len) > maxlen)
		return false;

	/* The new attribute starts at the aligned end of the message. */
	attr = (struct rtattr *)((char *)n + offset);
	attr->rta_type = type;
	attr->rta_len = attr_len;

	if (data == NULL)
		assert(alen == 0);
	else
		memcpy(RTA_DATA(attr), data, alen);

	n->nlmsg_len = offset + RTA_ALIGN(attr_len);

	return true;
}
642
/*
 * Append an attribute with a one-byte payload to message 'n'.
 * Returns the result of nl_attr_put().
 */
bool nl_attr_put8(struct nlmsghdr *n, unsigned int maxlen, int type,
		  uint8_t data)
{
	const bool added = nl_attr_put(n, maxlen, type, &data, sizeof(data));

	return added;
}
648
/*
 * Append an attribute with a two-byte payload to message 'n'.
 * Returns the result of nl_attr_put().
 */
bool nl_attr_put16(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint16_t data)
{
	const bool added = nl_attr_put(n, maxlen, type, &data, sizeof(data));

	return added;
}
654
/*
 * Append an attribute with a four-byte payload to message 'n'.
 * Returns the result of nl_attr_put().
 */
bool nl_attr_put32(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint32_t data)
{
	const bool added = nl_attr_put(n, maxlen, type, &data, sizeof(data));

	return added;
}
660
661 struct rtattr *nl_attr_nest(struct nlmsghdr *n, unsigned int maxlen, int type)
662 {
663 struct rtattr *nest = NLMSG_TAIL(n);
664
665 if (!nl_attr_put(n, maxlen, type, NULL, 0))
666 return NULL;
667
668 nest->rta_type |= NLA_F_NESTED;
669 return nest;
670 }
671
672 int nl_attr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
673 {
674 nest->rta_len = (uint8_t *)NLMSG_TAIL(n) - (uint8_t *)nest;
675 return n->nlmsg_len;
676 }
677
678 struct rtnexthop *nl_attr_rtnh(struct nlmsghdr *n, unsigned int maxlen)
679 {
680 struct rtnexthop *rtnh = (struct rtnexthop *)NLMSG_TAIL(n);
681
682 if (NLMSG_ALIGN(n->nlmsg_len) + RTNH_ALIGN(sizeof(struct rtnexthop))
683 > maxlen)
684 return NULL;
685
686 memset(rtnh, 0, sizeof(struct rtnexthop));
687 n->nlmsg_len =
688 NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(sizeof(struct rtnexthop));
689
690 return rtnh;
691 }
692
693 void nl_attr_rtnh_end(struct nlmsghdr *n, struct rtnexthop *rtnh)
694 {
695 rtnh->rtnh_len = (uint8_t *)NLMSG_TAIL(n) - (uint8_t *)rtnh;
696 }
697
698 bool nl_rta_put(struct rtattr *rta, unsigned int maxlen, int type,
699 const void *data, int alen)
700 {
701 struct rtattr *subrta;
702 int len = RTA_LENGTH(alen);
703
704 if (RTA_ALIGN(rta->rta_len) + RTA_ALIGN(len) > maxlen) {
705 zlog_err("ERROR max allowed bound %d exceeded for rtattr",
706 maxlen);
707 return false;
708 }
709 subrta = (struct rtattr *)(((char *)rta) + RTA_ALIGN(rta->rta_len));
710 subrta->rta_type = type;
711 subrta->rta_len = len;
712 if (alen)
713 memcpy(RTA_DATA(subrta), data, alen);
714 rta->rta_len = NLMSG_ALIGN(rta->rta_len) + RTA_ALIGN(len);
715
716 return true;
717 }
718
/*
 * Append a sub-attribute with a two-byte payload to attribute 'rta'.
 * Returns the result of nl_rta_put().
 */
bool nl_rta_put16(struct rtattr *rta, unsigned int maxlen, int type,
		  uint16_t data)
{
	const bool added = nl_rta_put(rta, maxlen, type, &data, sizeof(data));

	return added;
}
724
/*
 * Append a sub-attribute with an eight-byte payload to attribute 'rta'.
 * Returns the result of nl_rta_put().
 */
bool nl_rta_put64(struct rtattr *rta, unsigned int maxlen, int type,
		  uint64_t data)
{
	const bool added = nl_rta_put(rta, maxlen, type, &data, sizeof(data));

	return added;
}
730
731 struct rtattr *nl_rta_nest(struct rtattr *rta, unsigned int maxlen, int type)
732 {
733 struct rtattr *nest = RTA_TAIL(rta);
734
735 if (nl_rta_put(rta, maxlen, type, NULL, 0))
736 return NULL;
737
738 nest->rta_type |= NLA_F_NESTED;
739
740 return nest;
741 }
742
743 int nl_rta_nest_end(struct rtattr *rta, struct rtattr *nest)
744 {
745 nest->rta_len = (uint8_t *)RTA_TAIL(rta) - (uint8_t *)nest;
746
747 return rta->rta_len;
748 }
749
750 const char *nl_msg_type_to_str(uint16_t msg_type)
751 {
752 return lookup_msg(nlmsg_str, msg_type, "");
753 }
754
755 const char *nl_rtproto_to_str(uint8_t rtproto)
756 {
757 return lookup_msg(rtproto_str, rtproto, "");
758 }
759
760 const char *nl_family_to_str(uint8_t family)
761 {
762 return lookup_msg(family_str, family, "");
763 }
764
765 const char *nl_rttype_to_str(uint8_t rttype)
766 {
767 return lookup_msg(rttype_str, rttype, "");
768 }
769
/*
 * Helpers for walking struct nlattr runs, mirroring the RTA_* macros.
 */

/* True when 'nla' has a sane length and fits in the 'len' remaining bytes. */
#define NLA_OK(nla, len)                                                       \
	((len) >= (int)sizeof(struct nlattr)                                   \
	 && (nla)->nla_len >= sizeof(struct nlattr)                            \
	 && (nla)->nla_len <= (len))
/*
 * Step to the attribute after 'nla'; 'attrlen' is decremented by the aligned
 * length of 'nla' as a side effect (both arguments are evaluated twice).
 */
#define NLA_NEXT(nla, attrlen)                                                 \
	((attrlen) -= NLA_ALIGN((nla)->nla_len),                               \
	 (struct nlattr *)(((char *)(nla)) + NLA_ALIGN((nla)->nla_len)))
/* Total attribute length for a payload of 'len' bytes. */
#define NLA_LENGTH(len) (NLA_ALIGN(sizeof(struct nlattr)) + (len))
/*
 * Address of the payload of 'nla'. Note the result is typed struct nlattr *,
 * so callers cast it to the payload type they expect.
 */
#define NLA_DATA(nla) ((struct nlattr *)(((char *)(nla)) + NLA_LENGTH(0)))

/*
 * Address of the first extended-ack attribute in a netlink error message:
 * skips the (aligned) struct nlmsgerr at 'err' plus 'inner_len' further
 * (aligned) bytes.
 */
#define ERR_NLA(err, inner_len)                                                \
	((struct nlattr *)(((char *)(err))                                     \
			   + NLMSG_ALIGN(sizeof(struct nlmsgerr))              \
			   + NLMSG_ALIGN((inner_len))))
784
785 static void netlink_parse_nlattr(struct nlattr **tb, int max,
786 struct nlattr *nla, int len)
787 {
788 while (NLA_OK(nla, len)) {
789 if (nla->nla_type <= max)
790 tb[nla->nla_type] = nla;
791 nla = NLA_NEXT(nla, len);
792 }
793 }
794
795 static void netlink_parse_extended_ack(struct nlmsghdr *h)
796 {
797 struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
798 const struct nlmsgerr *err = (const struct nlmsgerr *)NLMSG_DATA(h);
799 const struct nlmsghdr *err_nlh = NULL;
800 /* Length not including nlmsghdr */
801 uint32_t len = 0;
802 /* Inner error netlink message length */
803 uint32_t inner_len = 0;
804 const char *msg = NULL;
805 uint32_t off = 0;
806
807 if (!(h->nlmsg_flags & NLM_F_CAPPED))
808 inner_len = (uint32_t)NLMSG_PAYLOAD(&err->msg, 0);
809
810 len = (uint32_t)(NLMSG_PAYLOAD(h, sizeof(struct nlmsgerr)) - inner_len);
811
812 netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX, ERR_NLA(err, inner_len),
813 len);
814
815 if (tb[NLMSGERR_ATTR_MSG])
816 msg = (const char *)NLA_DATA(tb[NLMSGERR_ATTR_MSG]);
817
818 if (tb[NLMSGERR_ATTR_OFFS]) {
819 off = *(uint32_t *)NLA_DATA(tb[NLMSGERR_ATTR_OFFS]);
820
821 if (off > h->nlmsg_len) {
822 zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS");
823 } else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
824 /*
825 * Header of failed message
826 * we are not doing anything currently with it
827 * but noticing it for later.
828 */
829 err_nlh = &err->msg;
830 zlog_debug("%s: Received %s extended Ack", __func__,
831 nl_msg_type_to_str(err_nlh->nlmsg_type));
832 }
833 }
834
835 if (msg && *msg != '\0') {
836 bool is_err = !!err->error;
837
838 if (is_err)
839 zlog_err("Extended Error: %s", msg);
840 else
841 flog_warn(EC_ZEBRA_NETLINK_EXTENDED_WARNING,
842 "Extended Warning: %s", msg);
843 }
844 }
845
846 /*
847 * netlink_send_msg - send a netlink message of a certain size.
848 *
849 * Returns -1 on error. Otherwise, it returns the number of bytes sent.
850 */
851 static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
852 size_t buflen)
853 {
854 struct sockaddr_nl snl = {};
855 struct iovec iov = {};
856 struct msghdr msg = {};
857 ssize_t status;
858 int save_errno = 0;
859
860 iov.iov_base = buf;
861 iov.iov_len = buflen;
862 msg.msg_name = &snl;
863 msg.msg_namelen = sizeof(snl);
864 msg.msg_iov = &iov;
865 msg.msg_iovlen = 1;
866
867 snl.nl_family = AF_NETLINK;
868
869 /* Send message to netlink interface. */
870 frr_with_privs(&zserv_privs) {
871 status = sendmsg(nl->sock, &msg, 0);
872 save_errno = errno;
873 }
874
875 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_SEND) {
876 zlog_debug("%s: >> netlink message dump [sent]", __func__);
877 #ifdef NETLINK_DEBUG
878 nl_dump(buf, buflen);
879 #else
880 zlog_hexdump(buf, buflen);
881 #endif /* NETLINK_DEBUG */
882 }
883
884 if (status == -1) {
885 flog_err_sys(EC_LIB_SOCKET, "%s error: %s", __func__,
886 safe_strerror(save_errno));
887 return -1;
888 }
889
890 return status;
891 }
892
893 /*
894 * netlink_recv_msg - receive a netlink message.
895 *
896 * Returns -1 on error, 0 if read would block or the number of bytes received.
897 */
898 static int netlink_recv_msg(struct nlsock *nl, struct msghdr *msg)
899 {
900 struct iovec iov;
901 int status;
902
903 iov.iov_base = nl->buf;
904 iov.iov_len = nl->buflen;
905 msg->msg_iov = &iov;
906 msg->msg_iovlen = 1;
907
908 do {
909 int bytes;
910
911 bytes = recv(nl->sock, NULL, 0, MSG_PEEK | MSG_TRUNC);
912
913 if (bytes >= 0 && (size_t)bytes > nl->buflen) {
914 nl->buf = XREALLOC(MTYPE_NL_BUF, nl->buf, bytes);
915 nl->buflen = bytes;
916 iov.iov_base = nl->buf;
917 iov.iov_len = nl->buflen;
918 }
919
920 status = recvmsg(nl->sock, msg, 0);
921 } while (status == -1 && errno == EINTR);
922
923 if (status == -1) {
924 if (errno == EWOULDBLOCK || errno == EAGAIN)
925 return 0;
926 flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s",
927 nl->name, safe_strerror(errno));
928 /*
929 * In this case we are screwed. There is no good way to recover
930 * zebra at this point.
931 */
932 exit(-1);
933 }
934
935 if (status == 0) {
936 flog_err_sys(EC_LIB_SOCKET, "%s EOF", nl->name);
937 return -1;
938 }
939
940 if (msg->msg_namelen != sizeof(struct sockaddr_nl)) {
941 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
942 "%s sender address length error: length %d", nl->name,
943 msg->msg_namelen);
944 return -1;
945 }
946
947 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
948 zlog_debug("%s: << netlink message dump [recv]", __func__);
949 #ifdef NETLINK_DEBUG
950 nl_dump(nl->buf, status);
951 #else
952 zlog_hexdump(nl->buf, status);
953 #endif /* NETLINK_DEBUG */
954 }
955
956 return status;
957 }
958
959 /*
960 * netlink_parse_error - parse a netlink error message
961 *
962 * Returns 1 if this message is acknowledgement, 0 if this error should be
963 * ignored, -1 otherwise.
964 */
965 static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
966 bool is_cmd, bool startup)
967 {
968 struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(h);
969 int errnum = err->error;
970 int msg_type = err->msg.nlmsg_type;
971
972 if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
973 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
974 "%s error: message truncated", nl->name);
975 return -1;
976 }
977
978 /*
979 * Parse the extended information before we actually handle it. At this
980 * point in time we do not do anything other than report the issue.
981 */
982 if (h->nlmsg_flags & NLM_F_ACK_TLVS)
983 netlink_parse_extended_ack(h);
984
985 /* If the error field is zero, then this is an ACK. */
986 if (err->error == 0) {
987 if (IS_ZEBRA_DEBUG_KERNEL) {
988 zlog_debug("%s: %s ACK: type=%s(%u), seq=%u, pid=%u",
989 __func__, nl->name,
990 nl_msg_type_to_str(err->msg.nlmsg_type),
991 err->msg.nlmsg_type, err->msg.nlmsg_seq,
992 err->msg.nlmsg_pid);
993 }
994
995 return 1;
996 }
997
998 /* Deal with errors that occur because of races in link handling. */
999 if (is_cmd
1000 && ((msg_type == RTM_DELROUTE
1001 && (-errnum == ENODEV || -errnum == ESRCH))
1002 || (msg_type == RTM_NEWROUTE
1003 && (-errnum == ENETDOWN || -errnum == EEXIST)))) {
1004 if (IS_ZEBRA_DEBUG_KERNEL)
1005 zlog_debug("%s: error: %s type=%s(%u), seq=%u, pid=%u",
1006 nl->name, safe_strerror(-errnum),
1007 nl_msg_type_to_str(msg_type), msg_type,
1008 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
1009 return 0;
1010 }
1011
1012 /*
1013 * We see RTM_DELNEIGH when shutting down an interface with an IPv4
1014 * link-local. The kernel should have already deleted the neighbor so
1015 * do not log these as an error.
1016 */
1017 if (msg_type == RTM_DELNEIGH
1018 || (is_cmd && msg_type == RTM_NEWROUTE
1019 && (-errnum == ESRCH || -errnum == ENETUNREACH))) {
1020 /*
1021 * This is known to happen in some situations, don't log as
1022 * error.
1023 */
1024 if (IS_ZEBRA_DEBUG_KERNEL)
1025 zlog_debug("%s error: %s, type=%s(%u), seq=%u, pid=%u",
1026 nl->name, safe_strerror(-errnum),
1027 nl_msg_type_to_str(msg_type), msg_type,
1028 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
1029 } else {
1030 if ((msg_type != RTM_GETNEXTHOP) || !startup)
1031 flog_err(EC_ZEBRA_UNEXPECTED_MESSAGE,
1032 "%s error: %s, type=%s(%u), seq=%u, pid=%u",
1033 nl->name, safe_strerror(-errnum),
1034 nl_msg_type_to_str(msg_type), msg_type,
1035 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
1036 }
1037
1038 return -1;
1039 }
1040
/*
 * netlink_parse_info
 *
 * Receive message from netlink interface and pass those information
 * to the given function.
 *
 * filter  -> Function to call to read the results
 * nl      -> netlink socket information
 * zns     -> The zebra namespace data
 * count   -> How many we should read in, 0 means as much as possible
 * startup -> Are we reading in under startup conditions? passed to
 *            the filter.
 *
 * Returns 0 once 'count' reads have been done or a single-part ACK was
 * seen, -1 on a receive error or when unparsed bytes remain after a read,
 * the netlink_parse_error() result for a non-ACK NLMSG_ERROR, and otherwise
 * (NLMSG_DONE, or a read that would block) the last negative value returned
 * by 'filter', or 0 if there was none.
 */
int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
		       struct nlsock *nl, const struct zebra_dplane_info *zns,
		       int count, bool startup)
{
	int status;
	int ret = 0;
	int error;
	int read_in = 0;

	while (1) {
		/* Sender address of the datagram, filled in by recvmsg(). */
		struct sockaddr_nl snl;
		struct msghdr msg = {.msg_name = (void *)&snl,
				     .msg_namelen = sizeof(snl)};
		struct nlmsghdr *h;

		/* Stop once the requested number of reads has been done. */
		if (count && read_in >= count)
			return 0;

		status = netlink_recv_msg(nl, &msg);
		if (status == -1)
			return -1;
		else if (status == 0)
			break;

		read_in++;
		/*
		 * Walk every message in the datagram; NLMSG_NEXT decrements
		 * 'status' by the length of each message it steps over.
		 */
		for (h = (struct nlmsghdr *)nl->buf;
		     (status >= 0 && NLMSG_OK(h, (unsigned int)status));
		     h = NLMSG_NEXT(h, status)) {
			/* Finish of reading. */
			if (h->nlmsg_type == NLMSG_DONE)
				return ret;

			/* Error handling. */
			if (h->nlmsg_type == NLMSG_ERROR) {
				int err = netlink_parse_error(
					nl, h, zns->is_cmd, startup);

				if (err == 1) {
					/*
					 * An ACK ends the exchange unless it
					 * is part of a multipart message.
					 */
					if (!(h->nlmsg_flags & NLM_F_MULTI))
						return 0;
					continue;
				} else
					return err;
			}

			/*
			 * What is the right thing to do?  The kernel
			 * is telling us that the dump request was interrupted
			 * and we more than likely are out of luck and have
			 * missed data from the kernel.  At this point in time
			 * lets just note that this is happening.
			 */
			if (h->nlmsg_flags & NLM_F_DUMP_INTR)
				flog_err(
					EC_ZEBRA_NETLINK_BAD_SEQUENCE,
					"netlink recvmsg: The Dump request was interrupted");

			/* OK we got netlink message. */
			if (IS_ZEBRA_DEBUG_KERNEL)
				zlog_debug(
					"%s: %s type %s(%u), len=%d, seq=%u, pid=%u",
					__func__, nl->name,
					nl_msg_type_to_str(h->nlmsg_type),
					h->nlmsg_type, h->nlmsg_len,
					h->nlmsg_seq, h->nlmsg_pid);


			/*
			 * Ignore messages that maybe sent from
			 * other actors besides the kernel
			 */
			if (snl.nl_pid != 0) {
				zlog_debug("Ignoring message from pid %u",
					   snl.nl_pid);
				continue;
			}

			/*
			 * A filter failure is remembered in 'ret' but does
			 * not stop the processing of later messages.
			 */
			error = (*filter)(h, zns->ns_id, startup);
			if (error < 0) {
				zlog_debug("%s filter function error",
					   nl->name);
				ret = error;
			}
		}

		/* After error care. */
		if (msg.msg_flags & MSG_TRUNC) {
			flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
				 "%s error: message truncated", nl->name);
			continue;
		}
		/* Bytes left over that did not form a complete message. */
		if (status) {
			flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
				 "%s error: data remnant size %d", nl->name,
				 status);
			return -1;
		}
	}
	return ret;
}
1154
1155 /*
1156 * netlink_talk_info
1157 *
1158 * sendmsg() to netlink socket then recvmsg().
1159 * Calls netlink_parse_info to parse returned data
1160 *
1161 * filter -> The filter to read final results from kernel
1162 * nlmsghdr -> The data to send to the kernel
1163 * dp_info -> The dataplane and netlink socket information
1164 * startup -> Are we reading in under startup conditions
1165 * This is passed through eventually to filter.
1166 */
1167 static int netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t,
1168 int startup),
1169 struct nlmsghdr *n,
1170 struct zebra_dplane_info *dp_info, bool startup)
1171 {
1172 struct nlsock *nl;
1173
1174 nl = kernel_netlink_nlsock_lookup(dp_info->sock);
1175 n->nlmsg_seq = dp_info->seq;
1176 n->nlmsg_pid = nl->snl.nl_pid;
1177
1178 if (IS_ZEBRA_DEBUG_KERNEL)
1179 zlog_debug(
1180 "netlink_talk: %s type %s(%u), len=%d seq=%u flags 0x%x",
1181 nl->name, nl_msg_type_to_str(n->nlmsg_type),
1182 n->nlmsg_type, n->nlmsg_len, n->nlmsg_seq,
1183 n->nlmsg_flags);
1184
1185 if (netlink_send_msg(nl, n, n->nlmsg_len) == -1)
1186 return -1;
1187
1188 /*
1189 * Get reply from netlink socket.
1190 * The reply should either be an acknowlegement or an error.
1191 */
1192 return netlink_parse_info(filter, nl, dp_info, 0, startup);
1193 }
1194
1195 /*
1196 * Synchronous version of netlink_talk_info. Converts args to suit the
1197 * common version, which is suitable for both sync and async use.
1198 */
1199 int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1200 struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
1201 bool startup)
1202 {
1203 struct zebra_dplane_info dp_info;
1204
1205 /* Increment sequence number before capturing snapshot of ns socket
1206 * info.
1207 */
1208 nl->seq++;
1209
1210 /* Capture info in intermediate info struct */
1211 zebra_dplane_info_from_zns(&dp_info, zns, (nl == &(zns->netlink_cmd)));
1212
1213 return netlink_talk_info(filter, n, &dp_info, startup);
1214 }
1215
1216 /* Issue request message to kernel via netlink socket. GET messages
1217 * are issued through this interface.
1218 */
1219 int netlink_request(struct nlsock *nl, void *req)
1220 {
1221 struct nlmsghdr *n = (struct nlmsghdr *)req;
1222
1223 /* Check netlink socket. */
1224 if (nl->sock < 0) {
1225 flog_err_sys(EC_LIB_SOCKET, "%s socket isn't active.",
1226 nl->name);
1227 return -1;
1228 }
1229
1230 /* Fill common fields for all requests. */
1231 n->nlmsg_pid = nl->snl.nl_pid;
1232 n->nlmsg_seq = ++nl->seq;
1233
1234 if (netlink_send_msg(nl, req, n->nlmsg_len) == -1)
1235 return -1;
1236
1237 return 0;
1238 }
1239
1240 static int nl_batch_read_resp(struct nl_batch *bth)
1241 {
1242 struct nlmsghdr *h;
1243 struct sockaddr_nl snl;
1244 struct msghdr msg = {};
1245 int status, seq;
1246 struct nlsock *nl;
1247 struct zebra_dplane_ctx *ctx;
1248 bool ignore_msg;
1249
1250 nl = kernel_netlink_nlsock_lookup(bth->zns->sock);
1251
1252 msg.msg_name = (void *)&snl;
1253 msg.msg_namelen = sizeof(snl);
1254
1255 /*
1256 * The responses are not batched, so we need to read and process one
1257 * message at a time.
1258 */
1259 while (true) {
1260 status = netlink_recv_msg(nl, &msg);
1261 /*
1262 * status == -1 is a full on failure somewhere
1263 * since we don't know where the problem happened
1264 * we must mark all as failed
1265 *
1266 * Else we mark everything as worked
1267 *
1268 */
1269 if (status == -1 || status == 0) {
1270 while ((ctx = dplane_ctx_dequeue(&(bth->ctx_list))) !=
1271 NULL) {
1272 if (status == -1)
1273 dplane_ctx_set_status(
1274 ctx,
1275 ZEBRA_DPLANE_REQUEST_FAILURE);
1276 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1277 }
1278 return status;
1279 }
1280
1281 h = (struct nlmsghdr *)nl->buf;
1282 ignore_msg = false;
1283 seq = h->nlmsg_seq;
1284 /*
1285 * Find the corresponding context object. Received responses are
1286 * in the same order as requests we sent, so we can simply
1287 * iterate over the context list and match responses with
1288 * requests at same time.
1289 */
1290 while (true) {
1291 ctx = dplane_ctx_get_head(&(bth->ctx_list));
1292 if (ctx == NULL) {
1293 /*
1294 * This is a situation where we have gotten
1295 * into a bad spot. We need to know that
1296 * this happens( does it? )
1297 */
1298 zlog_err(
1299 "%s:WARNING Received netlink Response for an error and no Contexts to associate with it",
1300 __func__);
1301 break;
1302 }
1303
1304 /*
1305 * 'update' context objects take two consecutive
1306 * sequence numbers.
1307 */
1308 if (dplane_ctx_is_update(ctx) &&
1309 dplane_ctx_get_ns(ctx)->seq + 1 == seq) {
1310 /*
1311 * This is the situation where we get a response
1312 * to a message that should be ignored.
1313 */
1314 ignore_msg = true;
1315 break;
1316 }
1317
1318 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1319 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1320
1321 /* We have found corresponding context object. */
1322 if (dplane_ctx_get_ns(ctx)->seq == seq)
1323 break;
1324
1325 if (dplane_ctx_get_ns(ctx)->seq > seq)
1326 zlog_warn(
1327 "%s:WARNING Received %u is less than any context on the queue ctx->seq %u",
1328 __func__, seq,
1329 dplane_ctx_get_ns(ctx)->seq);
1330 }
1331
1332 if (ignore_msg) {
1333 /*
1334 * If we ignore the message due to an update
1335 * above we should still fricking decode the
1336 * message for our operator to understand
1337 * what is going on
1338 */
1339 int err = netlink_parse_error(nl, h, bth->zns->is_cmd,
1340 false);
1341
1342 zlog_debug("%s: netlink error message seq=%d %d",
1343 __func__, h->nlmsg_seq, err);
1344 continue;
1345 }
1346
1347 /*
1348 * We received a message with the sequence number that isn't
1349 * associated with any dplane context object.
1350 */
1351 if (ctx == NULL) {
1352 if (IS_ZEBRA_DEBUG_KERNEL)
1353 zlog_debug(
1354 "%s: skipping unassociated response, seq number %d NS %u",
1355 __func__, h->nlmsg_seq,
1356 bth->zns->ns_id);
1357 continue;
1358 }
1359
1360 if (h->nlmsg_type == NLMSG_ERROR) {
1361 int err = netlink_parse_error(nl, h, bth->zns->is_cmd,
1362 false);
1363
1364 if (err == -1)
1365 dplane_ctx_set_status(
1366 ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
1367
1368 if (IS_ZEBRA_DEBUG_KERNEL)
1369 zlog_debug("%s: netlink error message seq=%d ",
1370 __func__, h->nlmsg_seq);
1371 continue;
1372 }
1373
1374 /*
1375 * If we get here then we did not receive neither the ack nor
1376 * the error and instead received some other message in an
1377 * unexpected way.
1378 */
1379 if (IS_ZEBRA_DEBUG_KERNEL)
1380 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
1381 __func__, h->nlmsg_type,
1382 nl_msg_type_to_str(h->nlmsg_type),
1383 bth->zns->ns_id);
1384 }
1385
1386 return 0;
1387 }
1388
1389 static void nl_batch_reset(struct nl_batch *bth)
1390 {
1391 bth->buf_head = bth->buf;
1392 bth->curlen = 0;
1393 bth->msgcnt = 0;
1394 bth->zns = NULL;
1395
1396 TAILQ_INIT(&(bth->ctx_list));
1397 }
1398
1399 static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
1400 {
1401 /*
1402 * If the size of the buffer has changed, free and then allocate a new
1403 * one.
1404 */
1405 size_t bufsize =
1406 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
1407 if (bufsize != nl_batch_tx_bufsize) {
1408 if (nl_batch_tx_buf)
1409 XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
1410
1411 nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
1412 nl_batch_tx_bufsize = bufsize;
1413 }
1414
1415 bth->buf = nl_batch_tx_buf;
1416 bth->bufsiz = bufsize;
1417 bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
1418 memory_order_relaxed);
1419
1420 bth->ctx_out_q = ctx_out_q;
1421
1422 nl_batch_reset(bth);
1423 }
1424
1425 static void nl_batch_send(struct nl_batch *bth)
1426 {
1427 struct zebra_dplane_ctx *ctx;
1428 bool err = false;
1429
1430 if (bth->curlen != 0 && bth->zns != NULL) {
1431 struct nlsock *nl =
1432 kernel_netlink_nlsock_lookup(bth->zns->sock);
1433
1434 if (IS_ZEBRA_DEBUG_KERNEL)
1435 zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
1436 __func__, nl->name, bth->curlen,
1437 bth->msgcnt);
1438
1439 if (netlink_send_msg(nl, bth->buf, bth->curlen) == -1)
1440 err = true;
1441
1442 if (!err) {
1443 if (nl_batch_read_resp(bth) == -1)
1444 err = true;
1445 }
1446 }
1447
1448 /* Move remaining contexts to the outbound queue. */
1449 while (true) {
1450 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1451 if (ctx == NULL)
1452 break;
1453
1454 if (err)
1455 dplane_ctx_set_status(ctx,
1456 ZEBRA_DPLANE_REQUEST_FAILURE);
1457
1458 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1459 }
1460
1461 nl_batch_reset(bth);
1462 }
1463
1464 enum netlink_msg_status netlink_batch_add_msg(
1465 struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
1466 ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
1467 bool ignore_res)
1468 {
1469 int seq;
1470 ssize_t size;
1471 struct nlmsghdr *msgh;
1472 struct nlsock *nl;
1473
1474 size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
1475
1476 /*
1477 * If there was an error while encoding the message (other than buffer
1478 * overflow) then return an error.
1479 */
1480 if (size < 0)
1481 return FRR_NETLINK_ERROR;
1482
1483 /*
1484 * If the message doesn't fit entirely in the buffer then send the batch
1485 * and retry.
1486 */
1487 if (size == 0) {
1488 nl_batch_send(bth);
1489 size = (*msg_encoder)(ctx, bth->buf_head,
1490 bth->bufsiz - bth->curlen);
1491 /*
1492 * If the message doesn't fit in the empty buffer then just
1493 * return an error.
1494 */
1495 if (size <= 0)
1496 return FRR_NETLINK_ERROR;
1497 }
1498
1499 seq = dplane_ctx_get_ns(ctx)->seq;
1500 nl = kernel_netlink_nlsock_lookup(dplane_ctx_get_ns_sock(ctx));
1501
1502 if (ignore_res)
1503 seq++;
1504
1505 msgh = (struct nlmsghdr *)bth->buf_head;
1506 msgh->nlmsg_seq = seq;
1507 msgh->nlmsg_pid = nl->snl.nl_pid;
1508
1509 bth->zns = dplane_ctx_get_ns(ctx);
1510 bth->buf_head = ((char *)bth->buf_head) + size;
1511 bth->curlen += size;
1512 bth->msgcnt++;
1513
1514 return FRR_NETLINK_QUEUED;
1515 }
1516
1517 static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
1518 struct zebra_dplane_ctx *ctx)
1519 {
1520 if (dplane_ctx_is_skip_kernel(ctx))
1521 return FRR_NETLINK_SUCCESS;
1522
1523 switch (dplane_ctx_get_op(ctx)) {
1524
1525 case DPLANE_OP_ROUTE_INSTALL:
1526 case DPLANE_OP_ROUTE_UPDATE:
1527 case DPLANE_OP_ROUTE_DELETE:
1528 return netlink_put_route_update_msg(bth, ctx);
1529
1530 case DPLANE_OP_NH_INSTALL:
1531 case DPLANE_OP_NH_UPDATE:
1532 case DPLANE_OP_NH_DELETE:
1533 return netlink_put_nexthop_update_msg(bth, ctx);
1534
1535 case DPLANE_OP_LSP_INSTALL:
1536 case DPLANE_OP_LSP_UPDATE:
1537 case DPLANE_OP_LSP_DELETE:
1538 return netlink_put_lsp_update_msg(bth, ctx);
1539
1540 case DPLANE_OP_PW_INSTALL:
1541 case DPLANE_OP_PW_UNINSTALL:
1542 return netlink_put_pw_update_msg(bth, ctx);
1543
1544 case DPLANE_OP_ADDR_INSTALL:
1545 case DPLANE_OP_ADDR_UNINSTALL:
1546 return netlink_put_address_update_msg(bth, ctx);
1547
1548 case DPLANE_OP_MAC_INSTALL:
1549 case DPLANE_OP_MAC_DELETE:
1550 return netlink_put_mac_update_msg(bth, ctx);
1551
1552 case DPLANE_OP_NEIGH_INSTALL:
1553 case DPLANE_OP_NEIGH_UPDATE:
1554 case DPLANE_OP_NEIGH_DELETE:
1555 case DPLANE_OP_VTEP_ADD:
1556 case DPLANE_OP_VTEP_DELETE:
1557 case DPLANE_OP_NEIGH_DISCOVER:
1558 case DPLANE_OP_NEIGH_IP_INSTALL:
1559 case DPLANE_OP_NEIGH_IP_DELETE:
1560 case DPLANE_OP_NEIGH_TABLE_UPDATE:
1561 return netlink_put_neigh_update_msg(bth, ctx);
1562
1563 case DPLANE_OP_RULE_ADD:
1564 case DPLANE_OP_RULE_DELETE:
1565 case DPLANE_OP_RULE_UPDATE:
1566 return netlink_put_rule_update_msg(bth, ctx);
1567
1568 case DPLANE_OP_SYS_ROUTE_ADD:
1569 case DPLANE_OP_SYS_ROUTE_DELETE:
1570 case DPLANE_OP_ROUTE_NOTIFY:
1571 case DPLANE_OP_LSP_NOTIFY:
1572 case DPLANE_OP_BR_PORT_UPDATE:
1573 return FRR_NETLINK_SUCCESS;
1574
1575 case DPLANE_OP_IPTABLE_ADD:
1576 case DPLANE_OP_IPTABLE_DELETE:
1577 case DPLANE_OP_IPSET_ADD:
1578 case DPLANE_OP_IPSET_DELETE:
1579 case DPLANE_OP_IPSET_ENTRY_ADD:
1580 case DPLANE_OP_IPSET_ENTRY_DELETE:
1581 return FRR_NETLINK_ERROR;
1582
1583 case DPLANE_OP_GRE_SET:
1584 return netlink_put_gre_set_msg(bth, ctx);
1585
1586 case DPLANE_OP_INTF_ADDR_ADD:
1587 case DPLANE_OP_INTF_ADDR_DEL:
1588 case DPLANE_OP_INTF_NETCONFIG:
1589 case DPLANE_OP_NONE:
1590 return FRR_NETLINK_ERROR;
1591
1592 case DPLANE_OP_INTF_INSTALL:
1593 case DPLANE_OP_INTF_UPDATE:
1594 case DPLANE_OP_INTF_DELETE:
1595 return netlink_put_intf_update_msg(bth, ctx);
1596 }
1597
1598 return FRR_NETLINK_ERROR;
1599 }
1600
1601 void kernel_update_multi(struct dplane_ctx_q *ctx_list)
1602 {
1603 struct nl_batch batch;
1604 struct zebra_dplane_ctx *ctx;
1605 struct dplane_ctx_q handled_list;
1606 enum netlink_msg_status res;
1607
1608 TAILQ_INIT(&handled_list);
1609 nl_batch_init(&batch, &handled_list);
1610
1611 while (true) {
1612 ctx = dplane_ctx_dequeue(ctx_list);
1613 if (ctx == NULL)
1614 break;
1615
1616 if (batch.zns != NULL
1617 && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
1618 nl_batch_send(&batch);
1619
1620 /*
1621 * Assume all messages will succeed and then mark only the ones
1622 * that failed.
1623 */
1624 dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
1625
1626 res = nl_put_msg(&batch, ctx);
1627
1628 dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
1629 if (res == FRR_NETLINK_ERROR)
1630 dplane_ctx_set_status(ctx,
1631 ZEBRA_DPLANE_REQUEST_FAILURE);
1632
1633 if (batch.curlen > batch.limit)
1634 nl_batch_send(&batch);
1635 }
1636
1637 nl_batch_send(&batch);
1638
1639 TAILQ_INIT(ctx_list);
1640 dplane_ctx_list_append(ctx_list, &handled_list);
1641 }
1642
1643 struct nlsock *kernel_netlink_nlsock_lookup(int sock)
1644 {
1645 struct nlsock lookup, *retval;
1646
1647 lookup.sock = sock;
1648
1649 NLSOCK_LOCK();
1650 retval = hash_lookup(nlsock_hash, &lookup);
1651 NLSOCK_UNLOCK();
1652
1653 return retval;
1654 }
1655
1656 /* Insert nlsock entry into hash */
1657 static void kernel_netlink_nlsock_insert(struct nlsock *nls)
1658 {
1659 NLSOCK_LOCK();
1660 (void)hash_get(nlsock_hash, nls, hash_alloc_intern);
1661 NLSOCK_UNLOCK();
1662 }
1663
1664 /* Remove nlsock entry from hash */
1665 static void kernel_netlink_nlsock_remove(struct nlsock *nls)
1666 {
1667 NLSOCK_LOCK();
1668 (void)hash_release(nlsock_hash, nls);
1669 NLSOCK_UNLOCK();
1670 }
1671
1672 static uint32_t kernel_netlink_nlsock_key(const void *arg)
1673 {
1674 const struct nlsock *nl = arg;
1675
1676 return nl->sock;
1677 }
1678
1679 static bool kernel_netlink_nlsock_hash_equal(const void *arg1, const void *arg2)
1680 {
1681 const struct nlsock *nl1 = arg1;
1682 const struct nlsock *nl2 = arg2;
1683
1684 if (nl1->sock == nl2->sock)
1685 return true;
1686
1687 return false;
1688 }
1689
1690 /* Exported interface function. This function simply calls
1691 netlink_socket (). */
1692 void kernel_init(struct zebra_ns *zns)
1693 {
1694 uint32_t groups, dplane_groups, ext_groups;
1695 #if defined SOL_NETLINK
1696 int one, ret;
1697 #endif
1698
1699 /*
1700 * Initialize netlink sockets
1701 *
1702 * If RTMGRP_XXX exists use that, but at some point
1703 * I think the kernel developers realized that
1704 * keeping track of all the different values would
1705 * lead to confusion, so we need to convert the
1706 * RTNLGRP_XXX to a bit position for ourself
1707 */
1708 groups = RTMGRP_LINK |
1709 RTMGRP_IPV4_ROUTE |
1710 RTMGRP_IPV4_IFADDR |
1711 RTMGRP_IPV6_ROUTE |
1712 RTMGRP_IPV6_IFADDR |
1713 RTMGRP_IPV4_MROUTE |
1714 RTMGRP_NEIGH |
1715 ((uint32_t) 1 << (RTNLGRP_IPV4_RULE - 1)) |
1716 ((uint32_t) 1 << (RTNLGRP_IPV6_RULE - 1)) |
1717 ((uint32_t) 1 << (RTNLGRP_NEXTHOP - 1));
1718
1719 dplane_groups = (RTMGRP_LINK |
1720 RTMGRP_IPV4_IFADDR |
1721 RTMGRP_IPV6_IFADDR |
1722 ((uint32_t) 1 << (RTNLGRP_IPV4_NETCONF - 1)) |
1723 ((uint32_t) 1 << (RTNLGRP_IPV6_NETCONF - 1)) |
1724 ((uint32_t) 1 << (RTNLGRP_MPLS_NETCONF - 1)));
1725
1726 /* Use setsockopt for > 31 group */
1727 ext_groups = RTNLGRP_TUNNEL;
1728
1729 snprintf(zns->netlink.name, sizeof(zns->netlink.name),
1730 "netlink-listen (NS %u)", zns->ns_id);
1731 zns->netlink.sock = -1;
1732 if (netlink_socket(&zns->netlink, groups, ext_groups, zns->ns_id) < 0) {
1733 zlog_err("Failure to create %s socket",
1734 zns->netlink.name);
1735 exit(-1);
1736 }
1737
1738 kernel_netlink_nlsock_insert(&zns->netlink);
1739
1740 snprintf(zns->netlink_cmd.name, sizeof(zns->netlink_cmd.name),
1741 "netlink-cmd (NS %u)", zns->ns_id);
1742 zns->netlink_cmd.sock = -1;
1743 if (netlink_socket(&zns->netlink_cmd, 0, 0, zns->ns_id) < 0) {
1744 zlog_err("Failure to create %s socket",
1745 zns->netlink_cmd.name);
1746 exit(-1);
1747 }
1748
1749 kernel_netlink_nlsock_insert(&zns->netlink_cmd);
1750
1751 /* Outbound socket for dplane programming of the host OS. */
1752 snprintf(zns->netlink_dplane_out.name,
1753 sizeof(zns->netlink_dplane_out.name), "netlink-dp (NS %u)",
1754 zns->ns_id);
1755 zns->netlink_dplane_out.sock = -1;
1756 if (netlink_socket(&zns->netlink_dplane_out, 0, 0, zns->ns_id) < 0) {
1757 zlog_err("Failure to create %s socket",
1758 zns->netlink_dplane_out.name);
1759 exit(-1);
1760 }
1761
1762 kernel_netlink_nlsock_insert(&zns->netlink_dplane_out);
1763
1764 /* Inbound socket for OS events coming to the dplane. */
1765 snprintf(zns->netlink_dplane_in.name,
1766 sizeof(zns->netlink_dplane_in.name), "netlink-dp-in (NS %u)",
1767 zns->ns_id);
1768 zns->netlink_dplane_in.sock = -1;
1769 if (netlink_socket(&zns->netlink_dplane_in, dplane_groups, 0,
1770 zns->ns_id) < 0) {
1771 zlog_err("Failure to create %s socket",
1772 zns->netlink_dplane_in.name);
1773 exit(-1);
1774 }
1775
1776 kernel_netlink_nlsock_insert(&zns->netlink_dplane_in);
1777
1778 /*
1779 * SOL_NETLINK is not available on all platforms yet
1780 * apparently. It's in bits/socket.h which I am not
1781 * sure that we want to pull into our build system.
1782 */
1783 #if defined SOL_NETLINK
1784 /*
1785 * Let's tell the kernel that we want to receive extended
1786 * ACKS over our command socket(s)
1787 */
1788 one = 1;
1789 ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1790 &one, sizeof(one));
1791
1792 if (ret < 0)
1793 zlog_notice("Registration for extended cmd ACK failed : %d %s",
1794 errno, safe_strerror(errno));
1795
1796 one = 1;
1797 ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1798 NETLINK_EXT_ACK, &one, sizeof(one));
1799
1800 if (ret < 0)
1801 zlog_notice("Registration for extended dp ACK failed : %d %s",
1802 errno, safe_strerror(errno));
1803
1804 /*
1805 * Trim off the payload of the original netlink message in the
1806 * acknowledgment. This option is available since Linux 4.2, so if
1807 * setsockopt fails, ignore the error.
1808 */
1809 one = 1;
1810 ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1811 NETLINK_CAP_ACK, &one, sizeof(one));
1812 if (ret < 0)
1813 zlog_notice(
1814 "Registration for reduced ACK packet size failed, probably running an early kernel");
1815 #endif
1816
1817 /* Register kernel socket. */
1818 if (fcntl(zns->netlink.sock, F_SETFL, O_NONBLOCK) < 0)
1819 flog_err_sys(EC_LIB_SOCKET, "Can't set %s socket flags: %s",
1820 zns->netlink.name, safe_strerror(errno));
1821
1822 if (fcntl(zns->netlink_cmd.sock, F_SETFL, O_NONBLOCK) < 0)
1823 zlog_err("Can't set %s socket error: %s(%d)",
1824 zns->netlink_cmd.name, safe_strerror(errno), errno);
1825
1826 if (fcntl(zns->netlink_dplane_out.sock, F_SETFL, O_NONBLOCK) < 0)
1827 zlog_err("Can't set %s socket error: %s(%d)",
1828 zns->netlink_dplane_out.name, safe_strerror(errno),
1829 errno);
1830
1831 if (fcntl(zns->netlink_dplane_in.sock, F_SETFL, O_NONBLOCK) < 0)
1832 zlog_err("Can't set %s socket error: %s(%d)",
1833 zns->netlink_dplane_in.name, safe_strerror(errno),
1834 errno);
1835
1836 /* Set receive buffer size if it's set from command line */
1837 if (rcvbufsize) {
1838 netlink_recvbuf(&zns->netlink, rcvbufsize);
1839 netlink_recvbuf(&zns->netlink_cmd, rcvbufsize);
1840 netlink_recvbuf(&zns->netlink_dplane_out, rcvbufsize);
1841 netlink_recvbuf(&zns->netlink_dplane_in, rcvbufsize);
1842 }
1843
1844 /* Set filter for inbound sockets, to exclude events we've generated
1845 * ourselves.
1846 */
1847 netlink_install_filter(zns->netlink.sock, zns->netlink_cmd.snl.nl_pid,
1848 zns->netlink_dplane_out.snl.nl_pid);
1849
1850 netlink_install_filter(zns->netlink_dplane_in.sock,
1851 zns->netlink_cmd.snl.nl_pid,
1852 zns->netlink_dplane_out.snl.nl_pid);
1853
1854 zns->t_netlink = NULL;
1855
1856 thread_add_read(zrouter.master, kernel_read, zns,
1857 zns->netlink.sock, &zns->t_netlink);
1858
1859 rt_netlink_init();
1860 }
1861
1862 /* Helper to clean up an nlsock */
1863 static void kernel_nlsock_fini(struct nlsock *nls)
1864 {
1865 if (nls && nls->sock >= 0) {
1866 kernel_netlink_nlsock_remove(nls);
1867 close(nls->sock);
1868 nls->sock = -1;
1869 XFREE(MTYPE_NL_BUF, nls->buf);
1870 nls->buflen = 0;
1871 }
1872 }
1873
1874 void kernel_terminate(struct zebra_ns *zns, bool complete)
1875 {
1876 thread_cancel(&zns->t_netlink);
1877
1878 kernel_nlsock_fini(&zns->netlink);
1879
1880 kernel_nlsock_fini(&zns->netlink_cmd);
1881
1882 kernel_nlsock_fini(&zns->netlink_dplane_in);
1883
1884 /* During zebra shutdown, we need to leave the dataplane socket
1885 * around until all work is done.
1886 */
1887 if (complete)
1888 kernel_nlsock_fini(&zns->netlink_dplane_out);
1889 }
1890
1891 /*
1892 * Global init for platform-/OS-specific things
1893 */
1894 void kernel_router_init(void)
1895 {
1896 /* Init nlsock hash and lock */
1897 pthread_mutex_init(&nlsock_mutex, NULL);
1898 nlsock_hash = hash_create_size(8, kernel_netlink_nlsock_key,
1899 kernel_netlink_nlsock_hash_equal,
1900 "Netlink Socket Hash");
1901 }
1902
1903 /*
1904 * Global deinit for platform-/OS-specific things
1905 */
1906 void kernel_router_terminate(void)
1907 {
1908 pthread_mutex_destroy(&nlsock_mutex);
1909
1910 hash_free(nlsock_hash);
1911 nlsock_hash = NULL;
1912 }
1913
1914 #endif /* HAVE_NETLINK */