]> git.proxmox.com Git - mirror_frr.git/blame - zebra/kernel_netlink.c
Merge pull request #8643 from icosahedral/master
[mirror_frr.git] / zebra / kernel_netlink.c
CommitLineData
718e3744 1/* Kernel communication using netlink interface.
2 * Copyright (C) 1999 Kunihiro Ishiguro
3 *
4 * This file is part of GNU Zebra.
5 *
6 * GNU Zebra is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * GNU Zebra is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
896014f4
DL
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
718e3744 19 */
1fdc9eae 20
21#include <zebra.h>
22
ddfeb486
DL
23#ifdef HAVE_NETLINK
24
1fdc9eae 25#include "linklist.h"
26#include "if.h"
27#include "log.h"
28#include "prefix.h"
29#include "connected.h"
30#include "table.h"
31#include "memory.h"
1fdc9eae 32#include "rib.h"
33#include "thread.h"
34#include "privs.h"
35#include "nexthop.h"
36#include "vrf.h"
37#include "mpls.h"
174482ef 38#include "lib_errors.h"
1fdc9eae 39
3801e764
DS
40//#include "zebra/zserv.h"
41#include "zebra/zebra_router.h"
1fdc9eae 42#include "zebra/zebra_ns.h"
43#include "zebra/zebra_vrf.h"
05f7f5db 44#include "zebra/rt.h"
1fdc9eae 45#include "zebra/debug.h"
46#include "zebra/kernel_netlink.h"
47#include "zebra/rt_netlink.h"
48#include "zebra/if_netlink.h"
942bf97b 49#include "zebra/rule_netlink.h"
43e52561 50#include "zebra/zebra_errors.h"
1fdc9eae 51
/*
 * Fallback definitions for constants and helper macros that the system
 * headers may not provide. Each one is only defined when missing.
 */
#ifndef SO_RCVBUFFORCE
#define SO_RCVBUFFORCE (33)
#endif

/* Hack for GNU libc version 2. */
#ifndef MSG_TRUNC
#define MSG_TRUNC 0x20
#endif /* MSG_TRUNC */

/* First byte after the (aligned) end of netlink message 'nmsg'. */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg)                                                       \
	((struct rtattr *)(((uint8_t *)(nmsg))                                 \
			   + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif

/* First byte after the (aligned) end of route attribute 'rta'. */
#ifndef RTA_TAIL
#define RTA_TAIL(rta)                                                          \
	((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
#endif

#ifndef RTNL_FAMILY_IP6MR
#define RTNL_FAMILY_IP6MR 129
#endif

#ifndef RTPROT_MROUTED
#define RTPROT_MROUTED 17
#endif
79
/* Default size of the batch transmit buffer. */
#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)

/*
 * The batch is limited to fewer bytes than the underlying buffer holds.
 * Without that margin, the last message that does not fit would run past
 * the end of the buffer and would have to be encoded a second time into a
 * fresh buffer. As long as the gap between the limit and the buffer length
 * is larger than the biggest Netlink message, that cannot happen.
 */
#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)

/* Size of the buffer used to receive replies to a batch. */
#define NL_BATCH_RX_BUFSIZE NL_RCV_PKT_BUF_SIZE
e63c7622 93
d62a17ae 94static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
95 {RTM_DELROUTE, "RTM_DELROUTE"},
96 {RTM_GETROUTE, "RTM_GETROUTE"},
97 {RTM_NEWLINK, "RTM_NEWLINK"},
98 {RTM_DELLINK, "RTM_DELLINK"},
99 {RTM_GETLINK, "RTM_GETLINK"},
100 {RTM_NEWADDR, "RTM_NEWADDR"},
101 {RTM_DELADDR, "RTM_DELADDR"},
102 {RTM_GETADDR, "RTM_GETADDR"},
103 {RTM_NEWNEIGH, "RTM_NEWNEIGH"},
104 {RTM_DELNEIGH, "RTM_DELNEIGH"},
105 {RTM_GETNEIGH, "RTM_GETNEIGH"},
942bf97b 106 {RTM_NEWRULE, "RTM_NEWRULE"},
107 {RTM_DELRULE, "RTM_DELRULE"},
108 {RTM_GETRULE, "RTM_GETRULE"},
79580b5a
SW
109 {RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP"},
110 {RTM_DELNEXTHOP, "RTM_DELNEXTHOP"},
111 {RTM_GETNEXTHOP, "RTM_GETNEXTHOP"},
d62a17ae 112 {0}};
1fdc9eae 113
114static const struct message rtproto_str[] = {
d62a17ae 115 {RTPROT_REDIRECT, "redirect"},
116 {RTPROT_KERNEL, "kernel"},
117 {RTPROT_BOOT, "boot"},
118 {RTPROT_STATIC, "static"},
119 {RTPROT_GATED, "GateD"},
120 {RTPROT_RA, "router advertisement"},
121 {RTPROT_MRT, "MRT"},
122 {RTPROT_ZEBRA, "Zebra"},
1fdc9eae 123#ifdef RTPROT_BIRD
d62a17ae 124 {RTPROT_BIRD, "BIRD"},
1fdc9eae 125#endif /* RTPROT_BIRD */
d62a17ae 126 {RTPROT_MROUTED, "mroute"},
127 {RTPROT_BGP, "BGP"},
128 {RTPROT_OSPF, "OSPF"},
129 {RTPROT_ISIS, "IS-IS"},
130 {RTPROT_RIP, "RIP"},
131 {RTPROT_RIPNG, "RIPNG"},
d4d71f11 132 {RTPROT_ZSTATIC, "static"},
d62a17ae 133 {0}};
134
135static const struct message family_str[] = {{AF_INET, "ipv4"},
136 {AF_INET6, "ipv6"},
137 {AF_BRIDGE, "bridge"},
138 {RTNL_FAMILY_IPMR, "ipv4MR"},
139 {RTNL_FAMILY_IP6MR, "ipv6MR"},
140 {0}};
141
8c8f250b
DS
142static const struct message rttype_str[] = {{RTN_UNSPEC, "none"},
143 {RTN_UNICAST, "unicast"},
144 {RTN_LOCAL, "local"},
145 {RTN_BROADCAST, "broadcast"},
146 {RTN_ANYCAST, "anycast"},
d62a17ae 147 {RTN_MULTICAST, "multicast"},
8c8f250b
DS
148 {RTN_BLACKHOLE, "blackhole"},
149 {RTN_UNREACHABLE, "unreachable"},
150 {RTN_PROHIBIT, "prohibited"},
151 {RTN_THROW, "throw"},
152 {RTN_NAT, "nat"},
153 {RTN_XRESOLVE, "resolver"},
d62a17ae 154 {0}};
b339bde7 155
1fdc9eae 156extern struct thread_master *master;
d7c0a89a 157extern uint32_t nl_rcvbufsize;
1fdc9eae 158
159extern struct zebra_privs_t zserv_privs;
160
bf8d3d6a 161DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers");
531c92b8
JU
162
163size_t nl_batch_tx_bufsize;
164char *nl_batch_tx_buf;
165
e63c7622
JU
166char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE];
167
531c92b8
JU
168_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
169_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
170
e63c7622
JU
171struct nl_batch {
172 void *buf;
173 size_t bufsiz;
174 size_t limit;
175
176 void *buf_head;
177 size_t curlen;
178 size_t msgcnt;
179
180 const struct zebra_dplane_info *zns;
e63c7622 181
f6feb48b 182 struct dplane_ctx_q ctx_list;
e63c7622 183
f6feb48b
JU
184 /*
185 * Pointer to the queue of completed contexts outbound back
186 * towards the dataplane module.
187 */
188 struct dplane_ctx_q *ctx_out_q;
e63c7622
JU
189};
190
531c92b8
JU
191int netlink_config_write_helper(struct vty *vty)
192{
193 uint32_t size =
194 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
195 uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
196 memory_order_relaxed);
197
198 if (size != NL_DEFAULT_BATCH_BUFSIZE
199 || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
200 vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
201 threshold);
202
203 return 0;
204}
205
206void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
207{
208 if (!set) {
209 size = NL_DEFAULT_BATCH_BUFSIZE;
210 threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
211 }
212
213 atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
214 atomic_store_explicit(&nl_batch_send_threshold, threshold,
215 memory_order_relaxed);
216}
217
2414abd3 218int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
1fdc9eae 219{
3575d9e8
DS
220 /*
221 * This is an error condition that must be handled during
222 * development.
223 *
224 * The netlink_talk_filter function is used for communication
225 * down the netlink_cmd pipe and we are expecting
226 * an ack being received. So if we get here
227 * then we did not receive the ack and instead
228 * received some other message in an unexpected
229 * way.
230 */
43e52561
QY
231 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", __func__,
232 h->nlmsg_type, nl_msg_type_to_str(h->nlmsg_type), ns_id);
d62a17ae 233 return 0;
1fdc9eae 234}
235
d62a17ae 236static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
1fdc9eae 237{
d7c0a89a 238 uint32_t oldsize;
d62a17ae 239 socklen_t newlen = sizeof(newsize);
240 socklen_t oldlen = sizeof(oldsize);
241 int ret;
242
243 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &oldsize, &oldlen);
244 if (ret < 0) {
450971aa 245 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
246 "Can't get %s receive buffer size: %s", nl->name,
247 safe_strerror(errno));
d62a17ae 248 return -1;
249 }
250
251 /* Try force option (linux >= 2.6.14) and fall back to normal set */
0cf6db21 252 frr_with_privs(&zserv_privs) {
01b9e3fd
DL
253 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUFFORCE,
254 &nl_rcvbufsize,
255 sizeof(nl_rcvbufsize));
256 }
d62a17ae 257 if (ret < 0)
258 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF,
259 &nl_rcvbufsize, sizeof(nl_rcvbufsize));
260 if (ret < 0) {
450971aa 261 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
262 "Can't set %s receive buffer size: %s", nl->name,
263 safe_strerror(errno));
d62a17ae 264 return -1;
265 }
266
267 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &newsize, &newlen);
268 if (ret < 0) {
450971aa 269 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
270 "Can't get %s receive buffer size: %s", nl->name,
271 safe_strerror(errno));
d62a17ae 272 return -1;
273 }
274
275 zlog_info("Setting netlink socket receive buffer size: %u -> %u",
276 oldsize, newsize);
277 return 0;
1fdc9eae 278}
279
280/* Make socket for Linux netlink interface. */
d62a17ae 281static int netlink_socket(struct nlsock *nl, unsigned long groups,
282 ns_id_t ns_id)
1fdc9eae 283{
d62a17ae 284 int ret;
285 struct sockaddr_nl snl;
286 int sock;
287 int namelen;
d62a17ae 288
0cf6db21 289 frr_with_privs(&zserv_privs) {
6bb30c2c
DL
290 sock = ns_socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, ns_id);
291 if (sock < 0) {
292 zlog_err("Can't open %s socket: %s", nl->name,
293 safe_strerror(errno));
294 return -1;
295 }
d62a17ae 296
0d6f7fd6 297 memset(&snl, 0, sizeof(snl));
6bb30c2c
DL
298 snl.nl_family = AF_NETLINK;
299 snl.nl_groups = groups;
d62a17ae 300
6bb30c2c 301 /* Bind the socket to the netlink structure for anything. */
0d6f7fd6 302 ret = bind(sock, (struct sockaddr *)&snl, sizeof(snl));
6bb30c2c 303 }
d62a17ae 304
305 if (ret < 0) {
6bb30c2c
DL
306 zlog_err("Can't bind %s socket to group 0x%x: %s", nl->name,
307 snl.nl_groups, safe_strerror(errno));
d62a17ae 308 close(sock);
309 return -1;
310 }
311
312 /* multiple netlink sockets will have different nl_pid */
0d6f7fd6 313 namelen = sizeof(snl);
d62a17ae 314 ret = getsockname(sock, (struct sockaddr *)&snl, (socklen_t *)&namelen);
0d6f7fd6 315 if (ret < 0 || namelen != sizeof(snl)) {
450971aa 316 flog_err_sys(EC_LIB_SOCKET, "Can't get %s socket name: %s",
09c866e3 317 nl->name, safe_strerror(errno));
d62a17ae 318 close(sock);
319 return -1;
320 }
321
322 nl->snl = snl;
323 nl->sock = sock;
324 return ret;
1fdc9eae 325}
326
2414abd3 327static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
d62a17ae 328 int startup)
1fdc9eae 329{
3575d9e8
DS
330 /*
331 * When we handle new message types here
332 * because we are starting to install them
333 * then lets check the netlink_install_filter
334 * and see if we should add the corresponding
335 * allow through entry there.
336 * Probably not needed to do but please
337 * think about it.
338 */
d62a17ae 339 switch (h->nlmsg_type) {
340 case RTM_NEWROUTE:
2414abd3 341 return netlink_route_change(h, ns_id, startup);
d62a17ae 342 case RTM_DELROUTE:
2414abd3 343 return netlink_route_change(h, ns_id, startup);
d62a17ae 344 case RTM_NEWLINK:
2414abd3 345 return netlink_link_change(h, ns_id, startup);
d62a17ae 346 case RTM_DELLINK:
2414abd3 347 return netlink_link_change(h, ns_id, startup);
d62a17ae 348 case RTM_NEWADDR:
2414abd3 349 return netlink_interface_addr(h, ns_id, startup);
d62a17ae 350 case RTM_DELADDR:
2414abd3 351 return netlink_interface_addr(h, ns_id, startup);
d62a17ae 352 case RTM_NEWNEIGH:
2414abd3 353 return netlink_neigh_change(h, ns_id);
d62a17ae 354 case RTM_DELNEIGH:
2414abd3 355 return netlink_neigh_change(h, ns_id);
951f8bcb
DS
356 case RTM_GETNEIGH:
357 /*
358 * Kernel in some situations when it expects
359 * user space to resolve arp entries, we will
360 * receive this notification. As we don't
361 * need this notification and as that
362 * we don't want to spam the log file with
363 * below messages, just ignore.
364 */
365 if (IS_ZEBRA_DEBUG_KERNEL)
366 zlog_debug("Received RTM_GETNEIGH, ignoring");
367 break;
942bf97b 368 case RTM_NEWRULE:
2414abd3 369 return netlink_rule_change(h, ns_id, startup);
942bf97b 370 case RTM_DELRULE:
2414abd3 371 return netlink_rule_change(h, ns_id, startup);
79580b5a 372 case RTM_NEWNEXTHOP:
d9f5b2f5 373 return netlink_nexthop_change(h, ns_id, startup);
79580b5a 374 case RTM_DELNEXTHOP:
d9f5b2f5 375 return netlink_nexthop_change(h, ns_id, startup);
d62a17ae 376 default:
3575d9e8
DS
377 /*
378 * If we have received this message then
379 * we have made a mistake during development
380 * and we need to write some code to handle
381 * this message type or not ask for
382 * it to be sent up to us
383 */
e914ccbe 384 flog_err(EC_ZEBRA_UNKNOWN_NLMSG,
1d5453d6 385 "Unknown netlink nlmsg_type %s(%d) vrf %u",
1c50c1c0
QY
386 nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
387 ns_id);
d62a17ae 388 break;
389 }
390 return 0;
1fdc9eae 391}
392
d62a17ae 393static int kernel_read(struct thread *thread)
1fdc9eae 394{
d62a17ae 395 struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
85a75f1e
MS
396 struct zebra_dplane_info dp_info;
397
398 /* Capture key info from ns struct */
399 zebra_dplane_info_from_zns(&dp_info, zns, false);
400
401 netlink_parse_info(netlink_information_fetch, &zns->netlink, &dp_info,
402 5, 0);
d62a17ae 403 zns->t_netlink = NULL;
3801e764 404 thread_add_read(zrouter.master, kernel_read, zns, zns->netlink.sock,
d62a17ae 405 &zns->t_netlink);
1fdc9eae 406
d62a17ae 407 return 0;
1fdc9eae 408}
409
3575d9e8
DS
410/*
411 * Filter out messages from self that occur on listener socket,
62b8bb7a 412 * caused by our actions on the command socket(s)
3575d9e8
DS
413 *
414 * When we add new Netlink message types we probably
415 * do not need to add them here as that we are filtering
416 * on the routes we actually care to receive( which is rarer
417 * then the normal course of operations). We are intentionally
418 * allowing some messages from ourselves through
419 * ( I'm looking at you Interface based netlink messages )
420 * so that we only had to write one way to handle incoming
421 * address add/delete changes.
1fdc9eae 422 */
62b8bb7a 423static void netlink_install_filter(int sock, __u32 pid, __u32 dplane_pid)
1fdc9eae 424{
3575d9e8
DS
425 /*
426 * BPF_JUMP instructions and where you jump to are based upon
427 * 0 as being the next statement. So count from 0. Writing
428 * this down because every time I look at this I have to
429 * re-remember it.
430 */
d62a17ae 431 struct sock_filter filter[] = {
3575d9e8
DS
432 /*
433 * Logic:
62b8bb7a
MS
434 * if (nlmsg_pid == pid ||
435 * nlmsg_pid == dplane_pid) {
3575d9e8
DS
436 * if (the incoming nlmsg_type ==
437 * RTM_NEWADDR | RTM_DELADDR)
438 * keep this message
439 * else
440 * skip this message
441 * } else
442 * keep this netlink message
443 */
444 /*
445 * 0: Load the nlmsg_pid into the BPF register
446 */
d62a17ae 447 BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
448 offsetof(struct nlmsghdr, nlmsg_pid)),
3575d9e8
DS
449 /*
450 * 1: Compare to pid
451 */
62b8bb7a 452 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
3575d9e8 453 /*
62b8bb7a
MS
454 * 2: Compare to dplane pid
455 */
456 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
457 /*
458 * 3: Load the nlmsg_type into BPF register
3575d9e8
DS
459 */
460 BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
461 offsetof(struct nlmsghdr, nlmsg_type)),
462 /*
62b8bb7a 463 * 4: Compare to RTM_NEWADDR
3575d9e8
DS
464 */
465 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
466 /*
62b8bb7a 467 * 5: Compare to RTM_DELADDR
3575d9e8
DS
468 */
469 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
470 /*
62b8bb7a 471 * 6: This is the end state of we want to skip the
3575d9e8
DS
472 * message
473 */
d62a17ae 474 BPF_STMT(BPF_RET | BPF_K, 0),
62b8bb7a 475 /* 7: This is the end state of we want to keep
3575d9e8
DS
476 * the message
477 */
d62a17ae 478 BPF_STMT(BPF_RET | BPF_K, 0xffff),
479 };
480
481 struct sock_fprog prog = {
9d303b37 482 .len = array_size(filter), .filter = filter,
d62a17ae 483 };
484
485 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))
486 < 0)
1d5453d6 487 flog_err_sys(EC_LIB_SOCKET, "Can't install socket filter: %s",
9df414fe 488 safe_strerror(errno));
1fdc9eae 489}
490
4bcdb608
NA
/*
 * Index a run of route attributes by type, masking 'flags' out of each
 * attribute's type first.
 *
 * tb    -> output array with room for max + 1 pointers; cleared first
 * max   -> highest attribute type to record
 * rta   -> first attribute
 * len   -> number of bytes of attribute data
 * flags -> bits removed from rta_type before it is used as an index
 *
 * When a type occurs more than once, the FIRST occurrence is kept.
 */
void netlink_parse_rtattr_flags(struct rtattr **tb, int max,
				struct rtattr *rta, int len,
				unsigned short flags)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		unsigned short type = rta->rta_type & ~flags;

		/* Out of range, or slot already taken by an earlier one. */
		if (type > max || tb[type])
			continue;

		tb[type] = rta;
	}
}
504
/*
 * Index a run of route attributes by type.
 *
 * tb  -> output array with room for max + 1 pointers; cleared first
 * max -> highest attribute type to record
 * rta -> first attribute
 * len -> number of bytes of attribute data
 *
 * When a type occurs more than once, the LAST occurrence is kept.
 */
void netlink_parse_rtattr(struct rtattr **tb, int max, struct rtattr *rta,
			  int len)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		if (rta->rta_type > max)
			continue;

		tb[rta->rta_type] = rta;
	}
}
515
87da6a60
SW
/**
 * netlink_parse_rtattr_nested() - Parses a nested route attribute
 * @tb:  Pointer to array for storing rtattr in.
 * @max: Max number to store.
 * @rta: Pointer to rtattr to look for nested items in.
 *
 * The payload of @rta is handed to netlink_parse_rtattr() as the attribute
 * run to index.
 */
void netlink_parse_rtattr_nested(struct rtattr **tb, int max,
				 struct rtattr *rta)
{
	struct rtattr *inner = RTA_DATA(rta);
	int inner_len = RTA_PAYLOAD(rta);

	netlink_parse_rtattr(tb, max, inner, inner_len);
}
527
312a6bee
JU
/*
 * Append one attribute to the end of a netlink message.
 *
 * n      -> message being built; nlmsg_len is advanced on success
 * maxlen -> total number of bytes available for the message
 * type   -> attribute type
 * data   -> payload to copy, or NULL for an empty attribute (alen must
 *           then be 0; this is asserted)
 * alen   -> payload length in bytes
 *
 * Returns true when the attribute was added. Returns false, leaving the
 * message untouched, when it does not fit into maxlen or when its total
 * length cannot be represented in the 16-bit rta_len field.
 */
bool nl_attr_put(struct nlmsghdr *n, unsigned int maxlen, int type,
		 const void *data, unsigned int alen)
{
	int len;
	struct rtattr *rta;

	/*
	 * rta_len is an unsigned short. Reject payloads whose header +
	 * data length would be truncated when stored there.
	 */
	if (alen > UINT16_MAX - RTA_LENGTH(0))
		return false;

	len = RTA_LENGTH(alen);

	if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen)
		return false;

	rta = (struct rtattr *)(((char *)n) + NLMSG_ALIGN(n->nlmsg_len));
	rta->rta_type = type;
	rta->rta_len = len;

	if (data)
		memcpy(RTA_DATA(rta), data, alen);
	else
		assert(alen == 0);

	n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);

	return true;
}
552
312a6bee
JU
/*
 * Append a 16-bit attribute to message 'n'. See nl_attr_put() for the
 * meaning of maxlen/type and of the return value.
 */
bool nl_attr_put16(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint16_t data)
{
	const uint16_t value = data;

	return nl_attr_put(n, maxlen, type, &value, sizeof(value));
}
558
312a6bee
JU
/*
 * Append a 32-bit attribute to message 'n'. See nl_attr_put() for the
 * meaning of maxlen/type and of the return value.
 */
bool nl_attr_put32(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint32_t data)
{
	const uint32_t value = data;

	return nl_attr_put(n, maxlen, type, &value, sizeof(value));
}
564
/*
 * Open a nested attribute of the given type at the end of message 'n'.
 *
 * An empty attribute is appended and flagged NLA_F_NESTED. The caller
 * appends the nested attributes and then closes the nest with
 * nl_attr_nest_end().
 *
 * Returns the nest attribute, or NULL when it does not fit into maxlen.
 */
struct rtattr *nl_attr_nest(struct nlmsghdr *n, unsigned int maxlen, int type)
{
	/* The new attribute lands at the current tail of the message. */
	struct rtattr *nest = NLMSG_TAIL(n);
	bool added = nl_attr_put(n, maxlen, type, NULL, 0);

	if (!added)
		return NULL;

	nest->rta_type |= NLA_F_NESTED;
	return nest;
}
575
/*
 * Close a nested attribute: set the length of 'nest' so that it spans
 * everything from its own header up to the current (aligned) end of
 * message 'n'.
 *
 * Returns the current message length.
 */
int nl_attr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
{
	uint8_t *tail = (uint8_t *)n + NLMSG_ALIGN(n->nlmsg_len);

	nest->rta_len = tail - (uint8_t *)nest;
	return n->nlmsg_len;
}
581
/*
 * Append a zeroed struct rtnexthop to the end of message 'n'.
 *
 * n      -> message being built; nlmsg_len is advanced on success
 * maxlen -> total number of bytes available for the message
 *
 * The same RTNH_ALIGN'ed size is used for the bounds check and for the
 * length update (the update previously used RTA_ALIGN).
 *
 * Returns the new nexthop, or NULL (message untouched) when it does not
 * fit into maxlen.
 */
struct rtnexthop *nl_attr_rtnh(struct nlmsghdr *n, unsigned int maxlen)
{
	const size_t start = NLMSG_ALIGN(n->nlmsg_len);
	const size_t space = RTNH_ALIGN(sizeof(struct rtnexthop));
	struct rtnexthop *rtnh;

	if (start + space > maxlen)
		return NULL;

	rtnh = (struct rtnexthop *)((uint8_t *)n + start);
	memset(rtnh, 0, sizeof(*rtnh));
	n->nlmsg_len = start + space;

	return rtnh;
}
596
/*
 * Close a nexthop started with nl_attr_rtnh(): set rtnh_len so that it
 * spans from the nexthop header up to the current (aligned) end of
 * message 'n'.
 */
void nl_attr_rtnh_end(struct nlmsghdr *n, struct rtnexthop *rtnh)
{
	uint8_t *tail = (uint8_t *)n + NLMSG_ALIGN(n->nlmsg_len);

	rtnh->rtnh_len = tail - (uint8_t *)rtnh;
}
601
d62a17ae 602const char *nl_msg_type_to_str(uint16_t msg_type)
1fdc9eae 603{
d62a17ae 604 return lookup_msg(nlmsg_str, msg_type, "");
1fdc9eae 605}
606
d7c0a89a 607const char *nl_rtproto_to_str(uint8_t rtproto)
1fdc9eae 608{
d62a17ae 609 return lookup_msg(rtproto_str, rtproto, "");
1fdc9eae 610}
b339bde7 611
d7c0a89a 612const char *nl_family_to_str(uint8_t family)
b339bde7 613{
d62a17ae 614 return lookup_msg(family_str, family, "");
b339bde7
DS
615}
616
d7c0a89a 617const char *nl_rttype_to_str(uint8_t rttype)
b339bde7 618{
d62a17ae 619 return lookup_msg(rttype_str, rttype, "");
b339bde7
DS
620}
621
/*
 * Helpers for walking struct nlattr attributes, mirroring the RTA_*
 * macros that exist for struct rtattr.
 */

/* True when 'len' bytes at 'nla' hold one complete attribute. */
#define NLA_OK(nla, len)                                                       \
	((len) >= (int)sizeof(struct nlattr)                                   \
	 && (nla)->nla_len >= sizeof(struct nlattr)                            \
	 && (nla)->nla_len <= (len))

/* Step to the attribute after 'nla', reducing 'attrlen' accordingly. */
#define NLA_NEXT(nla, attrlen)                                                 \
	((attrlen) -= NLA_ALIGN((nla)->nla_len),                               \
	 (struct nlattr *)(((char *)(nla)) + NLA_ALIGN((nla)->nla_len)))

/* Total attribute length for a payload of 'len' bytes. */
#define NLA_LENGTH(len) (NLA_ALIGN(sizeof(struct nlattr)) + (len))

/* Start of the payload of 'nla' (callers cast to the payload type). */
#define NLA_DATA(nla) ((struct nlattr *)(((char *)(nla)) + NLA_LENGTH(0)))

/*
 * First extended-ack attribute of error message payload 'err', located
 * after the nlmsgerr structure and 'inner_len' bytes of the echoed
 * request payload.
 */
#define ERR_NLA(err, inner_len)                                                \
	((struct nlattr *)(((char *)(err))                                     \
			   + NLMSG_ALIGN(sizeof(struct nlmsgerr))              \
			   + NLMSG_ALIGN((inner_len))))
5d307d5d
DS
636
/*
 * Index a run of struct nlattr attributes by type.
 *
 * tb  -> output array with room for max + 1 pointers. It is NOT cleared
 *        here; the caller provides it zero-initialized.
 * max -> highest attribute type to record
 * nla -> first attribute
 * len -> number of bytes of attribute data
 *
 * When a type occurs more than once, the last occurrence is kept.
 */
static void netlink_parse_nlattr(struct nlattr **tb, int max,
				 struct nlattr *nla, int len)
{
	for (; NLA_OK(nla, len); nla = NLA_NEXT(nla, len)) {
		if (nla->nla_type > max)
			continue;

		tb[nla->nla_type] = nla;
	}
}
646
647static void netlink_parse_extended_ack(struct nlmsghdr *h)
648{
4cebb2b6
SW
649 struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
650 const struct nlmsgerr *err = (const struct nlmsgerr *)NLMSG_DATA(h);
5d307d5d 651 const struct nlmsghdr *err_nlh = NULL;
4cebb2b6
SW
652 /* Length not including nlmsghdr */
653 uint32_t len = 0;
654 /* Inner error netlink message length */
655 uint32_t inner_len = 0;
5d307d5d
DS
656 const char *msg = NULL;
657 uint32_t off = 0;
658
659 if (!(h->nlmsg_flags & NLM_F_CAPPED))
4cebb2b6
SW
660 inner_len = (uint32_t)NLMSG_PAYLOAD(&err->msg, 0);
661
662 len = (uint32_t)(NLMSG_PAYLOAD(h, sizeof(struct nlmsgerr)) - inner_len);
5d307d5d 663
4cebb2b6
SW
664 netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX, ERR_NLA(err, inner_len),
665 len);
5d307d5d
DS
666
667 if (tb[NLMSGERR_ATTR_MSG])
4cebb2b6 668 msg = (const char *)NLA_DATA(tb[NLMSGERR_ATTR_MSG]);
5d307d5d
DS
669
670 if (tb[NLMSGERR_ATTR_OFFS]) {
4cebb2b6 671 off = *(uint32_t *)NLA_DATA(tb[NLMSGERR_ATTR_OFFS]);
5d307d5d
DS
672
673 if (off > h->nlmsg_len) {
9165c5f5 674 zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS");
5d307d5d
DS
675 } else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
676 /*
677 * Header of failed message
678 * we are not doing anything currently with it
679 * but noticing it for later.
680 */
681 err_nlh = &err->msg;
15569c58 682 zlog_debug("%s: Received %s extended Ack", __func__,
87b5d1b0 683 nl_msg_type_to_str(err_nlh->nlmsg_type));
5d307d5d
DS
684 }
685 }
686
687 if (msg && *msg != '\0') {
688 bool is_err = !!err->error;
689
690 if (is_err)
691 zlog_err("Extended Error: %s", msg);
692 else
e914ccbe 693 flog_warn(EC_ZEBRA_NETLINK_EXTENDED_WARNING,
9df414fe 694 "Extended Warning: %s", msg);
5d307d5d
DS
695 }
696}
697
ae6138bf
JU
698/*
699 * netlink_send_msg - send a netlink message of a certain size.
700 *
701 * Returns -1 on error. Otherwise, it returns the number of bytes sent.
702 */
f8653393
JU
703static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
704 size_t buflen)
ae6138bf 705{
f8653393
JU
706 struct sockaddr_nl snl = {};
707 struct iovec iov = {};
708 struct msghdr msg = {};
709 ssize_t status;
710 int save_errno = 0;
ae6138bf
JU
711
712 iov.iov_base = buf;
713 iov.iov_len = buflen;
f8653393 714 msg.msg_name = &snl;
ae6138bf
JU
715 msg.msg_namelen = sizeof(snl);
716 msg.msg_iov = &iov;
717 msg.msg_iovlen = 1;
718
719 snl.nl_family = AF_NETLINK;
720
721 /* Send message to netlink interface. */
722 frr_with_privs(&zserv_privs) {
723 status = sendmsg(nl->sock, &msg, 0);
724 save_errno = errno;
725 }
726
727 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_SEND) {
728 zlog_debug("%s: >> netlink message dump [sent]", __func__);
eead0bc4
RZ
729#ifdef NETLINK_DEBUG
730 nl_dump(buf, buflen);
731#else
ae6138bf 732 zlog_hexdump(buf, buflen);
eead0bc4 733#endif /* NETLINK_DEBUG */
ae6138bf
JU
734 }
735
f8653393 736 if (status == -1) {
ae6138bf
JU
737 flog_err_sys(EC_LIB_SOCKET, "%s error: %s", __func__,
738 safe_strerror(save_errno));
739 return -1;
740 }
741
742 return status;
743}
744
745/*
746 * netlink_recv_msg - receive a netlink message.
747 *
748 * Returns -1 on error, 0 if read would block or the number of bytes received.
749 */
750static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
751 void *buf, size_t buflen)
752{
753 struct iovec iov;
754 int status;
755
756 iov.iov_base = buf;
757 iov.iov_len = buflen;
758 msg.msg_iov = &iov;
759 msg.msg_iovlen = 1;
760
761 do {
ae6138bf 762 status = recvmsg(nl->sock, &msg, 0);
f8653393 763 } while (status == -1 && errno == EINTR);
ae6138bf 764
f8653393 765 if (status == -1) {
ae6138bf
JU
766 if (errno == EWOULDBLOCK || errno == EAGAIN)
767 return 0;
768 flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s",
769 nl->name, safe_strerror(errno));
770 /*
771 * In this case we are screwed. There is no good way to recover
772 * zebra at this point.
773 */
774 exit(-1);
775 }
776
777 if (status == 0) {
778 flog_err_sys(EC_LIB_SOCKET, "%s EOF", nl->name);
779 return -1;
780 }
781
782 if (msg.msg_namelen != sizeof(struct sockaddr_nl)) {
783 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
784 "%s sender address length error: length %d", nl->name,
785 msg.msg_namelen);
786 return -1;
787 }
788
789 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
790 zlog_debug("%s: << netlink message dump [recv]", __func__);
eead0bc4
RZ
791#ifdef NETLINK_DEBUG
792 nl_dump(buf, status);
793#else
ae6138bf 794 zlog_hexdump(buf, status);
eead0bc4 795#endif /* NETLINK_DEBUG */
ae6138bf
JU
796 }
797
ae6138bf
JU
798 return status;
799}
800
801/*
802 * netlink_parse_error - parse a netlink error message
803 *
804 * Returns 1 if this message is acknowledgement, 0 if this error should be
805 * ignored, -1 otherwise.
806 */
807static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
808 const struct zebra_dplane_info *zns,
809 bool startup)
810{
811 struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(h);
812 int errnum = err->error;
813 int msg_type = err->msg.nlmsg_type;
814
815 if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
816 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
817 "%s error: message truncated", nl->name);
818 return -1;
819 }
820
821 /*
822 * Parse the extended information before we actually handle it. At this
823 * point in time we do not do anything other than report the issue.
824 */
825 if (h->nlmsg_flags & NLM_F_ACK_TLVS)
826 netlink_parse_extended_ack(h);
827
828 /* If the error field is zero, then this is an ACK. */
829 if (err->error == 0) {
830 if (IS_ZEBRA_DEBUG_KERNEL) {
831 zlog_debug("%s: %s ACK: type=%s(%u), seq=%u, pid=%u",
832 __func__, nl->name,
833 nl_msg_type_to_str(err->msg.nlmsg_type),
834 err->msg.nlmsg_type, err->msg.nlmsg_seq,
835 err->msg.nlmsg_pid);
836 }
837
838 return 1;
839 }
840
841 /* Deal with errors that occur because of races in link handling. */
842 if (zns->is_cmd
843 && ((msg_type == RTM_DELROUTE
844 && (-errnum == ENODEV || -errnum == ESRCH))
845 || (msg_type == RTM_NEWROUTE
846 && (-errnum == ENETDOWN || -errnum == EEXIST)))) {
847 if (IS_ZEBRA_DEBUG_KERNEL)
848 zlog_debug("%s: error: %s type=%s(%u), seq=%u, pid=%u",
849 nl->name, safe_strerror(-errnum),
850 nl_msg_type_to_str(msg_type), msg_type,
851 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
852 return 0;
853 }
854
855 /*
856 * We see RTM_DELNEIGH when shutting down an interface with an IPv4
857 * link-local. The kernel should have already deleted the neighbor so
858 * do not log these as an error.
859 */
860 if (msg_type == RTM_DELNEIGH
861 || (zns->is_cmd && msg_type == RTM_NEWROUTE
862 && (-errnum == ESRCH || -errnum == ENETUNREACH))) {
863 /*
864 * This is known to happen in some situations, don't log as
865 * error.
866 */
867 if (IS_ZEBRA_DEBUG_KERNEL)
868 zlog_debug("%s error: %s, type=%s(%u), seq=%u, pid=%u",
869 nl->name, safe_strerror(-errnum),
870 nl_msg_type_to_str(msg_type), msg_type,
871 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
872 } else {
873 if ((msg_type != RTM_GETNEXTHOP) || !startup)
874 flog_err(EC_ZEBRA_UNEXPECTED_MESSAGE,
875 "%s error: %s, type=%s(%u), seq=%u, pid=%u",
876 nl->name, safe_strerror(-errnum),
877 nl_msg_type_to_str(msg_type), msg_type,
878 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
879 }
880
881 return -1;
882}
883
936ebf0a
DS
884/*
885 * netlink_parse_info
886 *
887 * Receive message from netlink interface and pass those information
888 * to the given function.
889 *
890 * filter -> Function to call to read the results
891 * nl -> netlink socket information
892 * zns -> The zebra namespace data
893 * count -> How many we should read in, 0 means as much as possible
894 * startup -> Are we reading in under startup conditions? passed to
895 * the filter.
896 */
2414abd3 897int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
7cdb1a84
MS
898 const struct nlsock *nl,
899 const struct zebra_dplane_info *zns,
85a75f1e 900 int count, int startup)
1fdc9eae 901{
d62a17ae 902 int status;
903 int ret = 0;
904 int error;
905 int read_in = 0;
906
907 while (1) {
9ed7517b 908 char buf[NL_RCV_PKT_BUF_SIZE];
d62a17ae 909 struct sockaddr_nl snl;
910 struct msghdr msg = {.msg_name = (void *)&snl,
ae6138bf 911 .msg_namelen = sizeof(snl)};
d62a17ae 912 struct nlmsghdr *h;
913
914 if (count && read_in >= count)
915 return 0;
916
ae6138bf
JU
917 status = netlink_recv_msg(nl, msg, buf, sizeof(buf));
918 if (status == -1)
d62a17ae 919 return -1;
ae6138bf
JU
920 else if (status == 0)
921 break;
81a2f870 922
d62a17ae 923 read_in++;
924 for (h = (struct nlmsghdr *)buf;
e6a0e0d1 925 (status >= 0 && NLMSG_OK(h, (unsigned int)status));
d62a17ae 926 h = NLMSG_NEXT(h, status)) {
927 /* Finish of reading. */
928 if (h->nlmsg_type == NLMSG_DONE)
929 return ret;
930
931 /* Error handling. */
932 if (h->nlmsg_type == NLMSG_ERROR) {
ae6138bf
JU
933 int err = netlink_parse_error(nl, h, zns,
934 startup);
935 if (err == 1) {
d62a17ae 936 if (!(h->nlmsg_flags & NLM_F_MULTI))
937 return 0;
938 continue;
ae6138bf
JU
939 } else
940 return err;
d62a17ae 941 }
942
943 /* OK we got netlink message. */
944 if (IS_ZEBRA_DEBUG_KERNEL)
945 zlog_debug(
946 "netlink_parse_info: %s type %s(%u), len=%d, seq=%u, pid=%u",
947 nl->name,
948 nl_msg_type_to_str(h->nlmsg_type),
949 h->nlmsg_type, h->nlmsg_len,
950 h->nlmsg_seq, h->nlmsg_pid);
951
783827ae
DS
952
953 /*
954 * Ignore messages that maybe sent from
955 * other actors besides the kernel
956 */
957 if (snl.nl_pid != 0) {
43e52561
QY
958 zlog_debug("Ignoring message from pid %u",
959 snl.nl_pid);
d62a17ae 960 continue;
961 }
962
2414abd3 963 error = (*filter)(h, zns->ns_id, startup);
d62a17ae 964 if (error < 0) {
9df414fe
QY
965 zlog_debug("%s filter function error",
966 nl->name);
d62a17ae 967 ret = error;
968 }
969 }
970
971 /* After error care. */
972 if (msg.msg_flags & MSG_TRUNC) {
e914ccbe 973 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0 974 "%s error: message truncated", nl->name);
d62a17ae 975 continue;
976 }
977 if (status) {
e914ccbe 978 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0
QY
979 "%s error: data remnant size %d", nl->name,
980 status);
d62a17ae 981 return -1;
982 }
983 }
984 return ret;
1fdc9eae 985}
986
936ebf0a 987/*
7cdb1a84 988 * netlink_talk_info
936ebf0a
DS
989 *
990 * sendmsg() to netlink socket then recvmsg().
991 * Calls netlink_parse_info to parse returned data
992 *
993 * filter -> The filter to read final results from kernel
994 * nlmsghdr -> The data to send to the kernel
8b962e77 995 * dp_info -> The dataplane and netlink socket information
936ebf0a
DS
996 * startup -> Are we reading in under startup conditions
997 * This is passed through eventually to filter.
998 */
67e3369e
JU
999static int
1000netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1001 struct nlmsghdr *n, const struct zebra_dplane_info *dp_info,
1002 int startup)
1fdc9eae 1003{
7cdb1a84 1004 const struct nlsock *nl;
d62a17ae 1005
7cdb1a84
MS
1006 nl = &(dp_info->nls);
1007 n->nlmsg_seq = nl->seq;
d62a17ae 1008 n->nlmsg_pid = nl->snl.nl_pid;
1009
d62a17ae 1010 if (IS_ZEBRA_DEBUG_KERNEL)
1011 zlog_debug(
1012 "netlink_talk: %s type %s(%u), len=%d seq=%u flags 0x%x",
1013 nl->name, nl_msg_type_to_str(n->nlmsg_type),
1014 n->nlmsg_type, n->nlmsg_len, n->nlmsg_seq,
1015 n->nlmsg_flags);
1016
f8653393 1017 if (netlink_send_msg(nl, n, n->nlmsg_len) == -1)
d62a17ae 1018 return -1;
d62a17ae 1019
d62a17ae 1020 /*
1021 * Get reply from netlink socket.
1022 * The reply should either be an acknowlegement or an error.
1023 */
7cdb1a84
MS
1024 return netlink_parse_info(filter, nl, dp_info, 0, startup);
1025}
1026
1027/*
1028 * Synchronous version of netlink_talk_info. Converts args to suit the
1029 * common version, which is suitable for both sync and async use.
7cdb1a84
MS
1030 */
1031int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1032 struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
1033 int startup)
1034{
1035 struct zebra_dplane_info dp_info;
1036
1037 /* Increment sequence number before capturing snapshot of ns socket
1038 * info.
1039 */
1040 nl->seq++;
1041
1042 /* Capture info in intermediate info struct */
85a75f1e 1043 zebra_dplane_info_from_zns(&dp_info, zns, (nl == &(zns->netlink_cmd)));
7cdb1a84 1044
5709131c 1045 return netlink_talk_info(filter, n, &dp_info, startup);
1fdc9eae 1046}
1047
289602d7 1048/* Issue request message to kernel via netlink socket. GET messages
1049 * are issued through this interface.
1050 */
fd3f8e52 1051int netlink_request(struct nlsock *nl, void *req)
1fdc9eae 1052{
fd3f8e52 1053 struct nlmsghdr *n = (struct nlmsghdr *)req;
d62a17ae 1054
1055 /* Check netlink socket. */
1056 if (nl->sock < 0) {
450971aa 1057 flog_err_sys(EC_LIB_SOCKET, "%s socket isn't active.",
09c866e3 1058 nl->name);
d62a17ae 1059 return -1;
1060 }
1061
1062 /* Fill common fields for all requests. */
d62a17ae 1063 n->nlmsg_pid = nl->snl.nl_pid;
1064 n->nlmsg_seq = ++nl->seq;
1065
f8653393 1066 if (netlink_send_msg(nl, req, n->nlmsg_len) == -1)
d62a17ae 1067 return -1;
d62a17ae 1068
1069 return 0;
1fdc9eae 1070}
1071
e63c7622
JU
1072static int nl_batch_read_resp(struct nl_batch *bth)
1073{
1074 struct nlmsghdr *h;
1075 struct sockaddr_nl snl;
9d06e121 1076 struct msghdr msg = {};
f6feb48b 1077 int status, seq;
e63c7622 1078 const struct nlsock *nl;
f6feb48b
JU
1079 struct zebra_dplane_ctx *ctx;
1080 bool ignore_msg;
e63c7622
JU
1081
1082 nl = &(bth->zns->nls);
1083
1084 msg.msg_name = (void *)&snl;
1085 msg.msg_namelen = sizeof(snl);
1086
2f9dbd3a
JU
1087 /*
1088 * The responses are not batched, so we need to read and process one
1089 * message at a time.
1090 */
1091 while (true) {
1092 status = netlink_recv_msg(nl, msg, nl_batch_rx_buf,
1093 sizeof(nl_batch_rx_buf));
1094 if (status == -1 || status == 0)
1095 return status;
e63c7622 1096
2f9dbd3a 1097 h = (struct nlmsghdr *)nl_batch_rx_buf;
f6feb48b
JU
1098 ignore_msg = false;
1099 seq = h->nlmsg_seq;
e63c7622 1100 /*
f6feb48b
JU
1101 * Find the corresponding context object. Received responses are
1102 * in the same order as requests we sent, so we can simply
1103 * iterate over the context list and match responses with
1104 * requests at same time.
e63c7622 1105 */
f6feb48b
JU
1106 while (true) {
1107 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1108 if (ctx == NULL)
1109 break;
1110
1111 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1112
1113 /* We have found corresponding context object. */
1114 if (dplane_ctx_get_ns(ctx)->nls.seq == seq)
e63c7622 1115 break;
f6feb48b
JU
1116
1117 /*
1118 * 'update' context objects take two consecutive
1119 * sequence numbers.
1120 */
1121 if (dplane_ctx_is_update(ctx)
1122 && dplane_ctx_get_ns(ctx)->nls.seq + 1 == seq) {
1123 /*
1124 * This is the situation where we get a response
1125 * to a message that should be ignored.
1126 */
1127 ignore_msg = true;
1128 break;
1129 }
e63c7622
JU
1130 }
1131
f6feb48b
JU
1132 if (ignore_msg)
1133 continue;
1134
e63c7622
JU
1135 /*
1136 * We received a message with the sequence number that isn't
1137 * associated with any dplane context object.
1138 */
f6feb48b 1139 if (ctx == NULL) {
4c99d413
MS
1140 if (IS_ZEBRA_DEBUG_KERNEL)
1141 zlog_debug(
1142 "%s: skipping unassociated response, seq number %d NS %u",
1143 __func__, h->nlmsg_seq,
1144 bth->zns->ns_id);
e63c7622
JU
1145 continue;
1146 }
1147
1148 if (h->nlmsg_type == NLMSG_ERROR) {
1149 int err = netlink_parse_error(nl, h, bth->zns, 0);
1150
1151 if (err == -1)
f6feb48b
JU
1152 dplane_ctx_set_status(
1153 ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1154
4c99d413
MS
1155 if (IS_ZEBRA_DEBUG_KERNEL)
1156 zlog_debug("%s: netlink error message seq=%d ",
1157 __func__, h->nlmsg_seq);
e63c7622
JU
1158 continue;
1159 }
1160
1161 /*
1162 * If we get here then we did not receive neither the ack nor
1163 * the error and instead received some other message in an
1164 * unexpected way.
1165 */
4c99d413
MS
1166 if (IS_ZEBRA_DEBUG_KERNEL)
1167 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
1168 __func__, h->nlmsg_type,
1169 nl_msg_type_to_str(h->nlmsg_type),
1170 bth->zns->ns_id);
e63c7622
JU
1171 }
1172
1173 return 0;
1174}
1175
1176static void nl_batch_reset(struct nl_batch *bth)
1177{
e63c7622
JU
1178 bth->buf_head = bth->buf;
1179 bth->curlen = 0;
1180 bth->msgcnt = 0;
1181 bth->zns = NULL;
1182
f6feb48b 1183 TAILQ_INIT(&(bth->ctx_list));
e63c7622
JU
1184}
1185
f6feb48b 1186static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
e63c7622 1187{
531c92b8
JU
1188 /*
1189 * If the size of the buffer has changed, free and then allocate a new
1190 * one.
1191 */
1192 size_t bufsize =
1193 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
1194 if (bufsize != nl_batch_tx_bufsize) {
1195 if (nl_batch_tx_buf)
1196 XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
1197
1198 nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
1199 nl_batch_tx_bufsize = bufsize;
1200 }
1201
f6feb48b 1202 bth->buf = nl_batch_tx_buf;
531c92b8
JU
1203 bth->bufsiz = bufsize;
1204 bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
1205 memory_order_relaxed);
e63c7622 1206
f6feb48b 1207 bth->ctx_out_q = ctx_out_q;
e63c7622 1208
f6feb48b
JU
1209 nl_batch_reset(bth);
1210}
1211
1212static void nl_batch_send(struct nl_batch *bth)
1213{
1214 struct zebra_dplane_ctx *ctx;
1215 bool err = false;
e63c7622 1216
f6feb48b
JU
1217 if (bth->curlen != 0 && bth->zns != NULL) {
1218 if (IS_ZEBRA_DEBUG_KERNEL)
1219 zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
1220 __func__, bth->zns->nls.name, bth->curlen,
1221 bth->msgcnt);
e63c7622 1222
f6feb48b
JU
1223 if (netlink_send_msg(&(bth->zns->nls), bth->buf, bth->curlen)
1224 == -1)
e63c7622 1225 err = true;
e63c7622 1226
f6feb48b
JU
1227 if (!err) {
1228 if (nl_batch_read_resp(bth) == -1)
1229 err = true;
1230 }
1231 }
e63c7622 1232
f6feb48b
JU
1233 /* Move remaining contexts to the outbound queue. */
1234 while (true) {
1235 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1236 if (ctx == NULL)
1237 break;
e63c7622 1238
f6feb48b
JU
1239 if (err)
1240 dplane_ctx_set_status(ctx,
1241 ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1242
f6feb48b 1243 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
e63c7622
JU
1244 }
1245
1246 nl_batch_reset(bth);
1247}
1248
e63c7622
JU
1249enum netlink_msg_status netlink_batch_add_msg(
1250 struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
1251 ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
f6feb48b 1252 bool ignore_res)
e63c7622
JU
1253{
1254 int seq;
1255 ssize_t size;
1256 struct nlmsghdr *msgh;
1257
e63c7622
JU
1258 size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
1259
1260 /*
1261 * If there was an error while encoding the message (other than buffer
1262 * overflow) then return an error.
1263 */
1264 if (size < 0)
1265 return FRR_NETLINK_ERROR;
1266
1267 /*
1268 * If the message doesn't fit entirely in the buffer then send the batch
1269 * and retry.
1270 */
1271 if (size == 0) {
1272 nl_batch_send(bth);
1273 size = (*msg_encoder)(ctx, bth->buf_head,
1274 bth->bufsiz - bth->curlen);
1275 /*
1276 * If the message doesn't fit in the empty buffer then just
1277 * return an error.
1278 */
1279 if (size <= 0)
1280 return FRR_NETLINK_ERROR;
1281 }
1282
1283 seq = dplane_ctx_get_ns(ctx)->nls.seq;
f6feb48b 1284 if (ignore_res)
e63c7622
JU
1285 seq++;
1286
1287 msgh = (struct nlmsghdr *)bth->buf_head;
1288 msgh->nlmsg_seq = seq;
1289 msgh->nlmsg_pid = dplane_ctx_get_ns(ctx)->nls.snl.nl_pid;
1290
e63c7622
JU
1291 bth->zns = dplane_ctx_get_ns(ctx);
1292 bth->buf_head = ((char *)bth->buf_head) + size;
1293 bth->curlen += size;
1294 bth->msgcnt++;
1295
e63c7622
JU
1296 return FRR_NETLINK_QUEUED;
1297}
1298
67e3369e
JU
1299static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
1300 struct zebra_dplane_ctx *ctx)
1301{
1302 if (dplane_ctx_is_skip_kernel(ctx))
1303 return FRR_NETLINK_SUCCESS;
1304
1305 switch (dplane_ctx_get_op(ctx)) {
1306
1307 case DPLANE_OP_ROUTE_INSTALL:
1308 case DPLANE_OP_ROUTE_UPDATE:
1309 case DPLANE_OP_ROUTE_DELETE:
1310 return netlink_put_route_update_msg(bth, ctx);
1311
1312 case DPLANE_OP_NH_INSTALL:
1313 case DPLANE_OP_NH_UPDATE:
1314 case DPLANE_OP_NH_DELETE:
1315 return netlink_put_nexthop_update_msg(bth, ctx);
1316
1317 case DPLANE_OP_LSP_INSTALL:
1318 case DPLANE_OP_LSP_UPDATE:
1319 case DPLANE_OP_LSP_DELETE:
1320 return netlink_put_lsp_update_msg(bth, ctx);
1321
1322 case DPLANE_OP_PW_INSTALL:
1323 case DPLANE_OP_PW_UNINSTALL:
1324 return netlink_put_pw_update_msg(bth, ctx);
1325
1326 case DPLANE_OP_ADDR_INSTALL:
1327 case DPLANE_OP_ADDR_UNINSTALL:
1328 return netlink_put_address_update_msg(bth, ctx);
1329
1330 case DPLANE_OP_MAC_INSTALL:
1331 case DPLANE_OP_MAC_DELETE:
1332 return netlink_put_mac_update_msg(bth, ctx);
1333
1334 case DPLANE_OP_NEIGH_INSTALL:
1335 case DPLANE_OP_NEIGH_UPDATE:
1336 case DPLANE_OP_NEIGH_DELETE:
1337 case DPLANE_OP_VTEP_ADD:
1338 case DPLANE_OP_VTEP_DELETE:
d68e74b4 1339 case DPLANE_OP_NEIGH_DISCOVER:
0a27a2fe
PG
1340 case DPLANE_OP_NEIGH_IP_INSTALL:
1341 case DPLANE_OP_NEIGH_IP_DELETE:
e18747a9 1342 case DPLANE_OP_NEIGH_TABLE_UPDATE:
67e3369e
JU
1343 return netlink_put_neigh_update_msg(bth, ctx);
1344
1345 case DPLANE_OP_RULE_ADD:
1346 case DPLANE_OP_RULE_DELETE:
1347 case DPLANE_OP_RULE_UPDATE:
1348 return netlink_put_rule_update_msg(bth, ctx);
1349
1350 case DPLANE_OP_SYS_ROUTE_ADD:
1351 case DPLANE_OP_SYS_ROUTE_DELETE:
1352 case DPLANE_OP_ROUTE_NOTIFY:
1353 case DPLANE_OP_LSP_NOTIFY:
c60522f7 1354 case DPLANE_OP_BR_PORT_UPDATE:
67e3369e
JU
1355 return FRR_NETLINK_SUCCESS;
1356
5162e000
PG
1357 case DPLANE_OP_IPTABLE_ADD:
1358 case DPLANE_OP_IPTABLE_DELETE:
ef524230
PG
1359 case DPLANE_OP_IPSET_ADD:
1360 case DPLANE_OP_IPSET_DELETE:
1361 case DPLANE_OP_IPSET_ENTRY_ADD:
1362 case DPLANE_OP_IPSET_ENTRY_DELETE:
5162e000
PG
1363 return FRR_NETLINK_ERROR;
1364
62b4b7e4
PG
1365 case DPLANE_OP_GRE_SET:
1366 return netlink_put_gre_set_msg(bth, ctx);
1367
67e3369e
JU
1368 case DPLANE_OP_NONE:
1369 return FRR_NETLINK_ERROR;
1370 }
1371
1372 return FRR_NETLINK_ERROR;
1373}
1374
fef24b03
JU
1375void kernel_update_multi(struct dplane_ctx_q *ctx_list)
1376{
67e3369e
JU
1377 struct nl_batch batch;
1378 struct zebra_dplane_ctx *ctx;
1379 struct dplane_ctx_q handled_list;
1380 enum netlink_msg_status res;
1381
67e3369e 1382 TAILQ_INIT(&handled_list);
f6feb48b 1383 nl_batch_init(&batch, &handled_list);
67e3369e
JU
1384
1385 while (true) {
1386 ctx = dplane_ctx_dequeue(ctx_list);
1387 if (ctx == NULL)
1388 break;
1389
f6feb48b
JU
1390 if (batch.zns != NULL
1391 && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
1392 nl_batch_send(&batch);
67e3369e
JU
1393
1394 /*
f6feb48b
JU
1395 * Assume all messages will succeed and then mark only the ones
1396 * that failed.
67e3369e 1397 */
f6feb48b
JU
1398 dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
1399
1400 res = nl_put_msg(&batch, ctx);
1401
1402 dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
1403 if (res == FRR_NETLINK_ERROR)
67e3369e
JU
1404 dplane_ctx_set_status(ctx,
1405 ZEBRA_DPLANE_REQUEST_FAILURE);
1406
f6feb48b
JU
1407 if (batch.curlen > batch.limit)
1408 nl_batch_send(&batch);
67e3369e
JU
1409 }
1410
1411 nl_batch_send(&batch);
1412
1413 TAILQ_INIT(ctx_list);
1414 dplane_ctx_list_append(ctx_list, &handled_list);
fef24b03
JU
1415}
1416
1fdc9eae 1417/* Exported interface function. This function simply calls
1418 netlink_socket (). */
d62a17ae 1419void kernel_init(struct zebra_ns *zns)
1fdc9eae 1420{
67188ca2 1421 uint32_t groups;
5d307d5d
DS
1422#if defined SOL_NETLINK
1423 int one, ret;
1424#endif
d62a17ae 1425
026a316f
DS
1426 /*
1427 * Initialize netlink sockets
1428 *
1429 * If RTMGRP_XXX exists use that, but at some point
1430 * I think the kernel developers realized that
1431 * keeping track of all the different values would
1432 * lead to confusion, so we need to convert the
1433 * RTNLGRP_XXX to a bit position for ourself
1434 */
1435 groups = RTMGRP_LINK |
1436 RTMGRP_IPV4_ROUTE |
1437 RTMGRP_IPV4_IFADDR |
1438 RTMGRP_IPV6_ROUTE |
1439 RTMGRP_IPV6_IFADDR |
1440 RTMGRP_IPV4_MROUTE |
1441 RTMGRP_NEIGH |
67188ca2
QY
1442 ((uint32_t) 1 << (RTNLGRP_IPV4_RULE - 1)) |
1443 ((uint32_t) 1 << (RTNLGRP_IPV6_RULE - 1)) |
1444 ((uint32_t) 1 << (RTNLGRP_NEXTHOP - 1));
d62a17ae 1445
1446 snprintf(zns->netlink.name, sizeof(zns->netlink.name),
1447 "netlink-listen (NS %u)", zns->ns_id);
1448 zns->netlink.sock = -1;
19d5a4fe
DS
1449 if (netlink_socket(&zns->netlink, groups, zns->ns_id) < 0) {
1450 zlog_err("Failure to create %s socket",
1451 zns->netlink.name);
1452 exit(-1);
1453 }
d62a17ae 1454
1455 snprintf(zns->netlink_cmd.name, sizeof(zns->netlink_cmd.name),
1456 "netlink-cmd (NS %u)", zns->ns_id);
1457 zns->netlink_cmd.sock = -1;
19d5a4fe
DS
1458 if (netlink_socket(&zns->netlink_cmd, 0, zns->ns_id) < 0) {
1459 zlog_err("Failure to create %s socket",
1460 zns->netlink_cmd.name);
1461 exit(-1);
1462 }
d62a17ae 1463
62b8bb7a
MS
1464 snprintf(zns->netlink_dplane.name, sizeof(zns->netlink_dplane.name),
1465 "netlink-dp (NS %u)", zns->ns_id);
1466 zns->netlink_dplane.sock = -1;
1467 if (netlink_socket(&zns->netlink_dplane, 0, zns->ns_id) < 0) {
1468 zlog_err("Failure to create %s socket",
1469 zns->netlink_dplane.name);
1470 exit(-1);
1471 }
1472
5d307d5d
DS
1473 /*
1474 * SOL_NETLINK is not available on all platforms yet
1475 * apparently. It's in bits/socket.h which I am not
1476 * sure that we want to pull into our build system.
1477 */
1478#if defined SOL_NETLINK
1479 /*
1480 * Let's tell the kernel that we want to receive extended
62b8bb7a 1481 * ACKS over our command socket(s)
5d307d5d
DS
1482 */
1483 one = 1;
1484 ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1485 &one, sizeof(one));
1486
1487 if (ret < 0)
62b8bb7a
MS
1488 zlog_notice("Registration for extended cmd ACK failed : %d %s",
1489 errno, safe_strerror(errno));
1490
1491 one = 1;
1492 ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1493 &one, sizeof(one));
1494
1495 if (ret < 0)
1496 zlog_notice("Registration for extended dp ACK failed : %d %s",
5d307d5d 1497 errno, safe_strerror(errno));
97f85144
JU
1498
1499 /*
1500 * Trim off the payload of the original netlink message in the
1501 * acknowledgment. This option is available since Linux 4.2, so if
1502 * setsockopt fails, ignore the error.
1503 */
1504 one = 1;
1505 ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_CAP_ACK,
1506 &one, sizeof(one));
9781e6a0
DS
1507 if (ret < 0)
1508 zlog_notice(
1509 "Registration for reduced ACK packet size failed, probably running an early kernel");
5d307d5d
DS
1510#endif
1511
d62a17ae 1512 /* Register kernel socket. */
19d5a4fe 1513 if (fcntl(zns->netlink.sock, F_SETFL, O_NONBLOCK) < 0)
450971aa 1514 flog_err_sys(EC_LIB_SOCKET, "Can't set %s socket flags: %s",
09c866e3 1515 zns->netlink.name, safe_strerror(errno));
8c85e8ea
DS
1516
1517 if (fcntl(zns->netlink_cmd.sock, F_SETFL, O_NONBLOCK) < 0)
1518 zlog_err("Can't set %s socket error: %s(%d)",
1519 zns->netlink_cmd.name, safe_strerror(errno), errno);
19d5a4fe 1520
62b8bb7a
MS
1521 if (fcntl(zns->netlink_dplane.sock, F_SETFL, O_NONBLOCK) < 0)
1522 zlog_err("Can't set %s socket error: %s(%d)",
1523 zns->netlink_dplane.name, safe_strerror(errno), errno);
1524
19d5a4fe 1525 /* Set receive buffer size if it's set from command line */
97f85144 1526 if (nl_rcvbufsize) {
19d5a4fe 1527 netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
97f85144
JU
1528 netlink_recvbuf(&zns->netlink_cmd, nl_rcvbufsize);
1529 netlink_recvbuf(&zns->netlink_dplane, nl_rcvbufsize);
1530 }
19d5a4fe
DS
1531
1532 netlink_install_filter(zns->netlink.sock,
62b8bb7a
MS
1533 zns->netlink_cmd.snl.nl_pid,
1534 zns->netlink_dplane.snl.nl_pid);
1535
19d5a4fe
DS
1536 zns->t_netlink = NULL;
1537
3801e764 1538 thread_add_read(zrouter.master, kernel_read, zns,
19d5a4fe 1539 zns->netlink.sock, &zns->t_netlink);
d62a17ae 1540
1541 rt_netlink_init();
1fdc9eae 1542}
1543
62b8bb7a 1544void kernel_terminate(struct zebra_ns *zns, bool complete)
1fdc9eae 1545{
50478845 1546 thread_cancel(&zns->t_netlink);
d62a17ae 1547
1548 if (zns->netlink.sock >= 0) {
1549 close(zns->netlink.sock);
1550 zns->netlink.sock = -1;
1551 }
1552
1553 if (zns->netlink_cmd.sock >= 0) {
1554 close(zns->netlink_cmd.sock);
1555 zns->netlink_cmd.sock = -1;
1556 }
ddfeb486 1557
62b8bb7a
MS
1558 /* During zebra shutdown, we need to leave the dataplane socket
1559 * around until all work is done.
1560 */
1561 if (complete) {
1562 if (zns->netlink_dplane.sock >= 0) {
1563 close(zns->netlink_dplane.sock);
1564 zns->netlink_dplane.sock = -1;
1565 }
1566 }
1567}
ddfeb486 1568#endif /* HAVE_NETLINK */