]> git.proxmox.com Git - mirror_frr.git/blame - zebra/kernel_netlink.c
Merge pull request #10495 from anlancs/doc-ospf-range
[mirror_frr.git] / zebra / kernel_netlink.c
CommitLineData
718e3744 1/* Kernel communication using netlink interface.
2 * Copyright (C) 1999 Kunihiro Ishiguro
3 *
4 * This file is part of GNU Zebra.
5 *
6 * GNU Zebra is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * GNU Zebra is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
896014f4
DL
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
718e3744 19 */
1fdc9eae 20
21#include <zebra.h>
22
ddfeb486
DL
23#ifdef HAVE_NETLINK
24
1fdc9eae 25#include "linklist.h"
26#include "if.h"
27#include "log.h"
28#include "prefix.h"
29#include "connected.h"
30#include "table.h"
31#include "memory.h"
1fdc9eae 32#include "rib.h"
33#include "thread.h"
34#include "privs.h"
35#include "nexthop.h"
36#include "vrf.h"
37#include "mpls.h"
174482ef 38#include "lib_errors.h"
1fdc9eae 39
3801e764
DS
40//#include "zebra/zserv.h"
41#include "zebra/zebra_router.h"
1fdc9eae 42#include "zebra/zebra_ns.h"
43#include "zebra/zebra_vrf.h"
05f7f5db 44#include "zebra/rt.h"
1fdc9eae 45#include "zebra/debug.h"
46#include "zebra/kernel_netlink.h"
47#include "zebra/rt_netlink.h"
48#include "zebra/if_netlink.h"
942bf97b 49#include "zebra/rule_netlink.h"
43e52561 50#include "zebra/zebra_errors.h"
1fdc9eae 51
/*
 * Fallback definitions: each of the guarded macros below is supplied only
 * when the headers included above have not already defined it.
 */
#ifndef SO_RCVBUFFORCE
#define SO_RCVBUFFORCE (33)
#endif

/* Hack for GNU libc version 2. */
#ifndef MSG_TRUNC
#define MSG_TRUNC 0x20
#endif /* MSG_TRUNC */

/* First aligned position after the current end of a netlink message. */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg)                                                       \
	((struct rtattr *)(((uint8_t *)(nmsg))                                 \
			   + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif

/* First aligned position after the current end of a route attribute. */
#ifndef RTA_TAIL
#define RTA_TAIL(rta)                                                          \
	((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
#endif

#ifndef RTNL_FAMILY_IP6MR
#define RTNL_FAMILY_IP6MR 129
#endif

#ifndef RTPROT_MROUTED
#define RTPROT_MROUTED 17
#endif

/* Default size of the batch transmit buffer. */
#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)

/*
 * The batch is flushed at a limit that is smaller than the underlying
 * buffer. Otherwise the last message that does not fit would cross the upper
 * boundary and would have to be encoded a second time into a new buffer. As
 * long as the gap between the limit and the buffer length exceeds the
 * biggest Netlink message, that situation cannot occur.
 */
#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)

/* Size of the buffer used to receive replies to a batch. */
#define NL_BATCH_RX_BUFSIZE NL_RCV_PKT_BUF_SIZE
e63c7622 93
d62a17ae 94static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
95 {RTM_DELROUTE, "RTM_DELROUTE"},
96 {RTM_GETROUTE, "RTM_GETROUTE"},
97 {RTM_NEWLINK, "RTM_NEWLINK"},
98 {RTM_DELLINK, "RTM_DELLINK"},
99 {RTM_GETLINK, "RTM_GETLINK"},
100 {RTM_NEWADDR, "RTM_NEWADDR"},
101 {RTM_DELADDR, "RTM_DELADDR"},
102 {RTM_GETADDR, "RTM_GETADDR"},
103 {RTM_NEWNEIGH, "RTM_NEWNEIGH"},
104 {RTM_DELNEIGH, "RTM_DELNEIGH"},
105 {RTM_GETNEIGH, "RTM_GETNEIGH"},
942bf97b 106 {RTM_NEWRULE, "RTM_NEWRULE"},
107 {RTM_DELRULE, "RTM_DELRULE"},
108 {RTM_GETRULE, "RTM_GETRULE"},
79580b5a
SW
109 {RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP"},
110 {RTM_DELNEXTHOP, "RTM_DELNEXTHOP"},
111 {RTM_GETNEXTHOP, "RTM_GETNEXTHOP"},
d62a17ae 112 {0}};
1fdc9eae 113
114static const struct message rtproto_str[] = {
d62a17ae 115 {RTPROT_REDIRECT, "redirect"},
116 {RTPROT_KERNEL, "kernel"},
117 {RTPROT_BOOT, "boot"},
118 {RTPROT_STATIC, "static"},
119 {RTPROT_GATED, "GateD"},
120 {RTPROT_RA, "router advertisement"},
121 {RTPROT_MRT, "MRT"},
122 {RTPROT_ZEBRA, "Zebra"},
1fdc9eae 123#ifdef RTPROT_BIRD
d62a17ae 124 {RTPROT_BIRD, "BIRD"},
1fdc9eae 125#endif /* RTPROT_BIRD */
d62a17ae 126 {RTPROT_MROUTED, "mroute"},
127 {RTPROT_BGP, "BGP"},
128 {RTPROT_OSPF, "OSPF"},
129 {RTPROT_ISIS, "IS-IS"},
130 {RTPROT_RIP, "RIP"},
131 {RTPROT_RIPNG, "RIPNG"},
d4d71f11 132 {RTPROT_ZSTATIC, "static"},
d62a17ae 133 {0}};
134
135static const struct message family_str[] = {{AF_INET, "ipv4"},
136 {AF_INET6, "ipv6"},
137 {AF_BRIDGE, "bridge"},
138 {RTNL_FAMILY_IPMR, "ipv4MR"},
139 {RTNL_FAMILY_IP6MR, "ipv6MR"},
140 {0}};
141
8c8f250b
DS
142static const struct message rttype_str[] = {{RTN_UNSPEC, "none"},
143 {RTN_UNICAST, "unicast"},
144 {RTN_LOCAL, "local"},
145 {RTN_BROADCAST, "broadcast"},
146 {RTN_ANYCAST, "anycast"},
d62a17ae 147 {RTN_MULTICAST, "multicast"},
8c8f250b
DS
148 {RTN_BLACKHOLE, "blackhole"},
149 {RTN_UNREACHABLE, "unreachable"},
150 {RTN_PROHIBIT, "prohibited"},
151 {RTN_THROW, "throw"},
152 {RTN_NAT, "nat"},
153 {RTN_XRESOLVE, "resolver"},
d62a17ae 154 {0}};
b339bde7 155
1fdc9eae 156extern struct thread_master *master;
d7c0a89a 157extern uint32_t nl_rcvbufsize;
1fdc9eae 158
159extern struct zebra_privs_t zserv_privs;
160
bf8d3d6a 161DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers");
531c92b8
JU
162
163size_t nl_batch_tx_bufsize;
164char *nl_batch_tx_buf;
165
e63c7622
JU
166char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE];
167
531c92b8
JU
168_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
169_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
170
e63c7622
JU
171struct nl_batch {
172 void *buf;
173 size_t bufsiz;
174 size_t limit;
175
176 void *buf_head;
177 size_t curlen;
178 size_t msgcnt;
179
180 const struct zebra_dplane_info *zns;
e63c7622 181
f6feb48b 182 struct dplane_ctx_q ctx_list;
e63c7622 183
f6feb48b
JU
184 /*
185 * Pointer to the queue of completed contexts outbound back
186 * towards the dataplane module.
187 */
188 struct dplane_ctx_q *ctx_out_q;
e63c7622
JU
189};
190
531c92b8
JU
191int netlink_config_write_helper(struct vty *vty)
192{
193 uint32_t size =
194 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
195 uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
196 memory_order_relaxed);
197
198 if (size != NL_DEFAULT_BATCH_BUFSIZE
199 || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
200 vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
201 threshold);
202
203 return 0;
204}
205
206void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
207{
208 if (!set) {
209 size = NL_DEFAULT_BATCH_BUFSIZE;
210 threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
211 }
212
213 atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
214 atomic_store_explicit(&nl_batch_send_threshold, threshold,
215 memory_order_relaxed);
216}
217
2414abd3 218int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
1fdc9eae 219{
3575d9e8
DS
220 /*
221 * This is an error condition that must be handled during
222 * development.
223 *
224 * The netlink_talk_filter function is used for communication
225 * down the netlink_cmd pipe and we are expecting
226 * an ack being received. So if we get here
227 * then we did not receive the ack and instead
228 * received some other message in an unexpected
229 * way.
230 */
43e52561
QY
231 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", __func__,
232 h->nlmsg_type, nl_msg_type_to_str(h->nlmsg_type), ns_id);
d62a17ae 233 return 0;
1fdc9eae 234}
235
d62a17ae 236static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
1fdc9eae 237{
d7c0a89a 238 uint32_t oldsize;
d62a17ae 239 socklen_t newlen = sizeof(newsize);
240 socklen_t oldlen = sizeof(oldsize);
241 int ret;
242
243 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &oldsize, &oldlen);
244 if (ret < 0) {
450971aa 245 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
246 "Can't get %s receive buffer size: %s", nl->name,
247 safe_strerror(errno));
d62a17ae 248 return -1;
249 }
250
251 /* Try force option (linux >= 2.6.14) and fall back to normal set */
0cf6db21 252 frr_with_privs(&zserv_privs) {
01b9e3fd
DL
253 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUFFORCE,
254 &nl_rcvbufsize,
255 sizeof(nl_rcvbufsize));
256 }
d62a17ae 257 if (ret < 0)
258 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF,
259 &nl_rcvbufsize, sizeof(nl_rcvbufsize));
260 if (ret < 0) {
450971aa 261 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
262 "Can't set %s receive buffer size: %s", nl->name,
263 safe_strerror(errno));
d62a17ae 264 return -1;
265 }
266
267 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &newsize, &newlen);
268 if (ret < 0) {
450971aa 269 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
270 "Can't get %s receive buffer size: %s", nl->name,
271 safe_strerror(errno));
d62a17ae 272 return -1;
273 }
d62a17ae 274 return 0;
1fdc9eae 275}
276
277/* Make socket for Linux netlink interface. */
d62a17ae 278static int netlink_socket(struct nlsock *nl, unsigned long groups,
279 ns_id_t ns_id)
1fdc9eae 280{
d62a17ae 281 int ret;
282 struct sockaddr_nl snl;
283 int sock;
284 int namelen;
d62a17ae 285
0cf6db21 286 frr_with_privs(&zserv_privs) {
6bb30c2c
DL
287 sock = ns_socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, ns_id);
288 if (sock < 0) {
289 zlog_err("Can't open %s socket: %s", nl->name,
290 safe_strerror(errno));
291 return -1;
292 }
d62a17ae 293
0d6f7fd6 294 memset(&snl, 0, sizeof(snl));
6bb30c2c
DL
295 snl.nl_family = AF_NETLINK;
296 snl.nl_groups = groups;
d62a17ae 297
6bb30c2c 298 /* Bind the socket to the netlink structure for anything. */
0d6f7fd6 299 ret = bind(sock, (struct sockaddr *)&snl, sizeof(snl));
6bb30c2c 300 }
d62a17ae 301
302 if (ret < 0) {
6bb30c2c
DL
303 zlog_err("Can't bind %s socket to group 0x%x: %s", nl->name,
304 snl.nl_groups, safe_strerror(errno));
d62a17ae 305 close(sock);
306 return -1;
307 }
308
309 /* multiple netlink sockets will have different nl_pid */
0d6f7fd6 310 namelen = sizeof(snl);
d62a17ae 311 ret = getsockname(sock, (struct sockaddr *)&snl, (socklen_t *)&namelen);
0d6f7fd6 312 if (ret < 0 || namelen != sizeof(snl)) {
450971aa 313 flog_err_sys(EC_LIB_SOCKET, "Can't get %s socket name: %s",
09c866e3 314 nl->name, safe_strerror(errno));
d62a17ae 315 close(sock);
316 return -1;
317 }
318
319 nl->snl = snl;
320 nl->sock = sock;
321 return ret;
1fdc9eae 322}
323
d166308b
MS
324/*
325 * Dispatch an incoming netlink message; used by the zebra main pthread's
326 * netlink event reader.
327 */
2414abd3 328static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
d62a17ae 329 int startup)
1fdc9eae 330{
3575d9e8
DS
331 /*
332 * When we handle new message types here
333 * because we are starting to install them
334 * then lets check the netlink_install_filter
335 * and see if we should add the corresponding
336 * allow through entry there.
337 * Probably not needed to do but please
338 * think about it.
339 */
d62a17ae 340 switch (h->nlmsg_type) {
341 case RTM_NEWROUTE:
2414abd3 342 return netlink_route_change(h, ns_id, startup);
d62a17ae 343 case RTM_DELROUTE:
2414abd3 344 return netlink_route_change(h, ns_id, startup);
d62a17ae 345 case RTM_NEWLINK:
2414abd3 346 return netlink_link_change(h, ns_id, startup);
d62a17ae 347 case RTM_DELLINK:
2414abd3 348 return netlink_link_change(h, ns_id, startup);
d62a17ae 349 case RTM_NEWNEIGH:
d62a17ae 350 case RTM_DELNEIGH:
951f8bcb 351 case RTM_GETNEIGH:
7a52f27e 352 return netlink_neigh_change(h, ns_id);
942bf97b 353 case RTM_NEWRULE:
2414abd3 354 return netlink_rule_change(h, ns_id, startup);
942bf97b 355 case RTM_DELRULE:
2414abd3 356 return netlink_rule_change(h, ns_id, startup);
79580b5a 357 case RTM_NEWNEXTHOP:
d9f5b2f5 358 return netlink_nexthop_change(h, ns_id, startup);
79580b5a 359 case RTM_DELNEXTHOP:
d9f5b2f5 360 return netlink_nexthop_change(h, ns_id, startup);
d166308b
MS
361
362 /* Messages handled in the dplane thread */
363 case RTM_NEWADDR:
364 case RTM_DELADDR:
365 return 0;
366
d62a17ae 367 default:
3575d9e8
DS
368 /*
369 * If we have received this message then
370 * we have made a mistake during development
371 * and we need to write some code to handle
372 * this message type or not ask for
373 * it to be sent up to us
374 */
e914ccbe 375 flog_err(EC_ZEBRA_UNKNOWN_NLMSG,
1d5453d6 376 "Unknown netlink nlmsg_type %s(%d) vrf %u",
1c50c1c0
QY
377 nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
378 ns_id);
d62a17ae 379 break;
380 }
381 return 0;
1fdc9eae 382}
383
d166308b
MS
384/*
385 * Dispatch an incoming netlink message; used by the dataplane pthread's
386 * netlink event reader code.
387 */
388static int dplane_netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
389 int startup)
390{
391 /*
392 * Dispatch the incoming messages that the dplane pthread handles
393 */
394 switch (h->nlmsg_type) {
395 case RTM_NEWADDR:
396 case RTM_DELADDR:
397 return netlink_interface_addr_dplane(h, ns_id, startup);
398
399 /* TODO */
400 case RTM_NEWLINK:
401 case RTM_DELLINK:
402
403 default:
404 break;
405 }
406
407 return 0;
408}
409
d62a17ae 410static int kernel_read(struct thread *thread)
1fdc9eae 411{
d62a17ae 412 struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
85a75f1e
MS
413 struct zebra_dplane_info dp_info;
414
415 /* Capture key info from ns struct */
416 zebra_dplane_info_from_zns(&dp_info, zns, false);
417
418 netlink_parse_info(netlink_information_fetch, &zns->netlink, &dp_info,
9bfadae8 419 5, false);
d166308b 420
3801e764 421 thread_add_read(zrouter.master, kernel_read, zns, zns->netlink.sock,
d62a17ae 422 &zns->t_netlink);
1fdc9eae 423
d62a17ae 424 return 0;
1fdc9eae 425}
426
d166308b
MS
427/*
428 * Called by the dplane pthread to read incoming OS messages and dispatch them.
429 */
430int kernel_dplane_read(struct zebra_dplane_info *info)
431{
432 netlink_parse_info(dplane_netlink_information_fetch, &info->nls, info,
9bfadae8 433 5, false);
d166308b
MS
434
435 return 0;
436}
437
3575d9e8
DS
438/*
439 * Filter out messages from self that occur on listener socket,
62b8bb7a 440 * caused by our actions on the command socket(s)
3575d9e8
DS
441 *
442 * When we add new Netlink message types we probably
443 * do not need to add them here as that we are filtering
444 * on the routes we actually care to receive( which is rarer
445 * then the normal course of operations). We are intentionally
446 * allowing some messages from ourselves through
447 * ( I'm looking at you Interface based netlink messages )
448 * so that we only had to write one way to handle incoming
449 * address add/delete changes.
1fdc9eae 450 */
ff45112c 451static void netlink_install_filter(int sock, uint32_t pid, uint32_t dplane_pid)
1fdc9eae 452{
3575d9e8
DS
453 /*
454 * BPF_JUMP instructions and where you jump to are based upon
455 * 0 as being the next statement. So count from 0. Writing
456 * this down because every time I look at this I have to
457 * re-remember it.
458 */
d62a17ae 459 struct sock_filter filter[] = {
3575d9e8
DS
460 /*
461 * Logic:
62b8bb7a
MS
462 * if (nlmsg_pid == pid ||
463 * nlmsg_pid == dplane_pid) {
3575d9e8
DS
464 * if (the incoming nlmsg_type ==
465 * RTM_NEWADDR | RTM_DELADDR)
466 * keep this message
467 * else
468 * skip this message
469 * } else
470 * keep this netlink message
471 */
472 /*
473 * 0: Load the nlmsg_pid into the BPF register
474 */
d62a17ae 475 BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
476 offsetof(struct nlmsghdr, nlmsg_pid)),
3575d9e8
DS
477 /*
478 * 1: Compare to pid
479 */
62b8bb7a 480 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
3575d9e8 481 /*
62b8bb7a
MS
482 * 2: Compare to dplane pid
483 */
484 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
485 /*
486 * 3: Load the nlmsg_type into BPF register
3575d9e8
DS
487 */
488 BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
489 offsetof(struct nlmsghdr, nlmsg_type)),
490 /*
62b8bb7a 491 * 4: Compare to RTM_NEWADDR
3575d9e8
DS
492 */
493 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
494 /*
62b8bb7a 495 * 5: Compare to RTM_DELADDR
3575d9e8
DS
496 */
497 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
498 /*
62b8bb7a 499 * 6: This is the end state of we want to skip the
3575d9e8
DS
500 * message
501 */
d62a17ae 502 BPF_STMT(BPF_RET | BPF_K, 0),
62b8bb7a 503 /* 7: This is the end state of we want to keep
3575d9e8
DS
504 * the message
505 */
d62a17ae 506 BPF_STMT(BPF_RET | BPF_K, 0xffff),
507 };
508
509 struct sock_fprog prog = {
9d303b37 510 .len = array_size(filter), .filter = filter,
d62a17ae 511 };
512
513 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))
514 < 0)
1d5453d6 515 flog_err_sys(EC_LIB_SOCKET, "Can't install socket filter: %s",
9df414fe 516 safe_strerror(errno));
1fdc9eae 517}
518
d166308b
MS
519void netlink_parse_rtattr_flags(struct rtattr **tb, int max, struct rtattr *rta,
520 int len, unsigned short flags)
4bcdb608
NA
521{
522 unsigned short type;
523
269b69d7 524 memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
4bcdb608
NA
525 while (RTA_OK(rta, len)) {
526 type = rta->rta_type & ~flags;
527 if ((type <= max) && (!tb[type]))
528 tb[type] = rta;
529 rta = RTA_NEXT(rta, len);
530 }
531}
532
d62a17ae 533void netlink_parse_rtattr(struct rtattr **tb, int max, struct rtattr *rta,
534 int len)
1fdc9eae 535{
269b69d7 536 memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
d62a17ae 537 while (RTA_OK(rta, len)) {
538 if (rta->rta_type <= max)
539 tb[rta->rta_type] = rta;
540 rta = RTA_NEXT(rta, len);
541 }
1fdc9eae 542}
543
87da6a60
SW
/**
 * netlink_parse_rtattr_nested() - Parses a nested route attribute
 * @tb:         Pointer to array for storing rtattr in.
 * @max:        Max number to store.
 * @rta:        Pointer to rtattr to look for nested items in.
 *
 * The payload of @rta is handed to netlink_parse_rtattr() as the attribute
 * run to index.
 */
void netlink_parse_rtattr_nested(struct rtattr **tb, int max,
				 struct rtattr *rta)
{
	struct rtattr *inner = RTA_DATA(rta);
	int inner_len = RTA_PAYLOAD(rta);

	netlink_parse_rtattr(tb, max, inner, inner_len);
}
555
312a6bee
JU
556bool nl_attr_put(struct nlmsghdr *n, unsigned int maxlen, int type,
557 const void *data, unsigned int alen)
1fdc9eae 558{
d62a17ae 559 int len;
560 struct rtattr *rta;
1fdc9eae 561
d62a17ae 562 len = RTA_LENGTH(alen);
1fdc9eae 563
d62a17ae 564 if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen)
312a6bee 565 return false;
1fdc9eae 566
d62a17ae 567 rta = (struct rtattr *)(((char *)n) + NLMSG_ALIGN(n->nlmsg_len));
568 rta->rta_type = type;
569 rta->rta_len = len;
4b2792b5 570
d62a17ae 571 if (data)
572 memcpy(RTA_DATA(rta), data, alen);
573 else
574 assert(alen == 0);
4b2792b5 575
d62a17ae 576 n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
1fdc9eae 577
312a6bee 578 return true;
1fdc9eae 579}
580
94d70a65
DS
/*
 * Append an attribute carrying a single 8-bit value; see nl_attr_put() for
 * the meaning of the other arguments and of the return value.
 */
bool nl_attr_put8(struct nlmsghdr *n, unsigned int maxlen, int type,
		  uint8_t data)
{
	const uint8_t value = data;

	return nl_attr_put(n, maxlen, type, &value, sizeof(value));
}
586
312a6bee
JU
/*
 * Append an attribute carrying a single 16-bit value; see nl_attr_put() for
 * the meaning of the other arguments and of the return value.
 */
bool nl_attr_put16(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint16_t data)
{
	const uint16_t value = data;

	return nl_attr_put(n, maxlen, type, &value, sizeof(value));
}
592
312a6bee
JU
/*
 * Append an attribute carrying a single 32-bit value; see nl_attr_put() for
 * the meaning of the other arguments and of the return value.
 */
bool nl_attr_put32(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint32_t data)
{
	const uint32_t value = data;

	return nl_attr_put(n, maxlen, type, &value, sizeof(value));
}
598
312a6bee 599struct rtattr *nl_attr_nest(struct nlmsghdr *n, unsigned int maxlen, int type)
1fdc9eae 600{
d62a17ae 601 struct rtattr *nest = NLMSG_TAIL(n);
1fdc9eae 602
312a6bee
JU
603 if (!nl_attr_put(n, maxlen, type, NULL, 0))
604 return NULL;
605
40d86eba 606 nest->rta_type |= NLA_F_NESTED;
d62a17ae 607 return nest;
1fdc9eae 608}
609
312a6bee 610int nl_attr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
1fdc9eae 611{
d7c0a89a 612 nest->rta_len = (uint8_t *)NLMSG_TAIL(n) - (uint8_t *)nest;
d62a17ae 613 return n->nlmsg_len;
1fdc9eae 614}
615
312a6bee 616struct rtnexthop *nl_attr_rtnh(struct nlmsghdr *n, unsigned int maxlen)
1fdc9eae 617{
312a6bee 618 struct rtnexthop *rtnh = (struct rtnexthop *)NLMSG_TAIL(n);
1fdc9eae 619
312a6bee
JU
620 if (NLMSG_ALIGN(n->nlmsg_len) + RTNH_ALIGN(sizeof(struct rtnexthop))
621 > maxlen)
622 return NULL;
623
624 memset(rtnh, 0, sizeof(struct rtnexthop));
625 n->nlmsg_len =
626 NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(sizeof(struct rtnexthop));
627
628 return rtnh;
1fdc9eae 629}
630
312a6bee 631void nl_attr_rtnh_end(struct nlmsghdr *n, struct rtnexthop *rtnh)
1fdc9eae 632{
312a6bee 633 rtnh->rtnh_len = (uint8_t *)NLMSG_TAIL(n) - (uint8_t *)rtnh;
1fdc9eae 634}
635
d62a17ae 636const char *nl_msg_type_to_str(uint16_t msg_type)
1fdc9eae 637{
d62a17ae 638 return lookup_msg(nlmsg_str, msg_type, "");
1fdc9eae 639}
640
d7c0a89a 641const char *nl_rtproto_to_str(uint8_t rtproto)
1fdc9eae 642{
d62a17ae 643 return lookup_msg(rtproto_str, rtproto, "");
1fdc9eae 644}
b339bde7 645
d7c0a89a 646const char *nl_family_to_str(uint8_t family)
b339bde7 647{
d62a17ae 648 return lookup_msg(family_str, family, "");
b339bde7
DS
649}
650
d7c0a89a 651const char *nl_rttype_to_str(uint8_t rttype)
b339bde7 652{
d62a17ae 653 return lookup_msg(rttype_str, rttype, "");
b339bde7
DS
654}
655
/*
 * Helpers for walking struct nlattr, used below for the attributes that
 * follow an error message.
 */

/* True if a whole attribute header and its claimed length fit in 'len'. */
#define NLA_OK(nla, len)                                                       \
	((len) >= (int)sizeof(struct nlattr)                                   \
	 && (nla)->nla_len >= sizeof(struct nlattr)                            \
	 && (nla)->nla_len <= (len))

/* Step to the following attribute, reducing 'attrlen' accordingly. */
#define NLA_NEXT(nla, attrlen)                                                 \
	((attrlen) -= NLA_ALIGN((nla)->nla_len),                               \
	 (struct nlattr *)(((char *)(nla)) + NLA_ALIGN((nla)->nla_len)))

/* Total attribute length for a payload of 'len' bytes. */
#define NLA_LENGTH(len) (NLA_ALIGN(sizeof(struct nlattr)) + (len))

/*
 * Start of an attribute's payload.
 * NOTE(review): the result is typed struct nlattr *, so users cast it to
 * the payload type they expect.
 */
#define NLA_DATA(nla) ((struct nlattr *)(((char *)(nla)) + NLA_LENGTH(0)))

/*
 * First attribute after a struct nlmsgerr, skipping 'inner_len' bytes that
 * follow the (aligned) nlmsgerr itself.
 */
#define ERR_NLA(err, inner_len)                                                \
	((struct nlattr *)(((char *)(err))                                     \
			   + NLMSG_ALIGN(sizeof(struct nlmsgerr))              \
			   + NLMSG_ALIGN((inner_len))))
5d307d5d
DS
670
671static void netlink_parse_nlattr(struct nlattr **tb, int max,
672 struct nlattr *nla, int len)
673{
4cebb2b6 674 while (NLA_OK(nla, len)) {
5d307d5d
DS
675 if (nla->nla_type <= max)
676 tb[nla->nla_type] = nla;
4cebb2b6 677 nla = NLA_NEXT(nla, len);
5d307d5d
DS
678 }
679}
680
681static void netlink_parse_extended_ack(struct nlmsghdr *h)
682{
4cebb2b6
SW
683 struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
684 const struct nlmsgerr *err = (const struct nlmsgerr *)NLMSG_DATA(h);
5d307d5d 685 const struct nlmsghdr *err_nlh = NULL;
4cebb2b6
SW
686 /* Length not including nlmsghdr */
687 uint32_t len = 0;
688 /* Inner error netlink message length */
689 uint32_t inner_len = 0;
5d307d5d
DS
690 const char *msg = NULL;
691 uint32_t off = 0;
692
693 if (!(h->nlmsg_flags & NLM_F_CAPPED))
4cebb2b6
SW
694 inner_len = (uint32_t)NLMSG_PAYLOAD(&err->msg, 0);
695
696 len = (uint32_t)(NLMSG_PAYLOAD(h, sizeof(struct nlmsgerr)) - inner_len);
5d307d5d 697
4cebb2b6
SW
698 netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX, ERR_NLA(err, inner_len),
699 len);
5d307d5d
DS
700
701 if (tb[NLMSGERR_ATTR_MSG])
4cebb2b6 702 msg = (const char *)NLA_DATA(tb[NLMSGERR_ATTR_MSG]);
5d307d5d
DS
703
704 if (tb[NLMSGERR_ATTR_OFFS]) {
4cebb2b6 705 off = *(uint32_t *)NLA_DATA(tb[NLMSGERR_ATTR_OFFS]);
5d307d5d
DS
706
707 if (off > h->nlmsg_len) {
9165c5f5 708 zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS");
5d307d5d
DS
709 } else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
710 /*
711 * Header of failed message
712 * we are not doing anything currently with it
713 * but noticing it for later.
714 */
715 err_nlh = &err->msg;
15569c58 716 zlog_debug("%s: Received %s extended Ack", __func__,
87b5d1b0 717 nl_msg_type_to_str(err_nlh->nlmsg_type));
5d307d5d
DS
718 }
719 }
720
721 if (msg && *msg != '\0') {
722 bool is_err = !!err->error;
723
724 if (is_err)
725 zlog_err("Extended Error: %s", msg);
726 else
e914ccbe 727 flog_warn(EC_ZEBRA_NETLINK_EXTENDED_WARNING,
9df414fe 728 "Extended Warning: %s", msg);
5d307d5d
DS
729 }
730}
731
ae6138bf
JU
732/*
733 * netlink_send_msg - send a netlink message of a certain size.
734 *
735 * Returns -1 on error. Otherwise, it returns the number of bytes sent.
736 */
f8653393
JU
737static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
738 size_t buflen)
ae6138bf 739{
f8653393
JU
740 struct sockaddr_nl snl = {};
741 struct iovec iov = {};
742 struct msghdr msg = {};
743 ssize_t status;
744 int save_errno = 0;
ae6138bf
JU
745
746 iov.iov_base = buf;
747 iov.iov_len = buflen;
f8653393 748 msg.msg_name = &snl;
ae6138bf
JU
749 msg.msg_namelen = sizeof(snl);
750 msg.msg_iov = &iov;
751 msg.msg_iovlen = 1;
752
753 snl.nl_family = AF_NETLINK;
754
755 /* Send message to netlink interface. */
756 frr_with_privs(&zserv_privs) {
757 status = sendmsg(nl->sock, &msg, 0);
758 save_errno = errno;
759 }
760
761 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_SEND) {
762 zlog_debug("%s: >> netlink message dump [sent]", __func__);
eead0bc4
RZ
763#ifdef NETLINK_DEBUG
764 nl_dump(buf, buflen);
765#else
ae6138bf 766 zlog_hexdump(buf, buflen);
eead0bc4 767#endif /* NETLINK_DEBUG */
ae6138bf
JU
768 }
769
f8653393 770 if (status == -1) {
ae6138bf
JU
771 flog_err_sys(EC_LIB_SOCKET, "%s error: %s", __func__,
772 safe_strerror(save_errno));
773 return -1;
774 }
775
776 return status;
777}
778
779/*
780 * netlink_recv_msg - receive a netlink message.
781 *
782 * Returns -1 on error, 0 if read would block or the number of bytes received.
783 */
784static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
785 void *buf, size_t buflen)
786{
787 struct iovec iov;
788 int status;
789
790 iov.iov_base = buf;
791 iov.iov_len = buflen;
792 msg.msg_iov = &iov;
793 msg.msg_iovlen = 1;
794
795 do {
ae6138bf 796 status = recvmsg(nl->sock, &msg, 0);
f8653393 797 } while (status == -1 && errno == EINTR);
ae6138bf 798
f8653393 799 if (status == -1) {
ae6138bf
JU
800 if (errno == EWOULDBLOCK || errno == EAGAIN)
801 return 0;
802 flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s",
803 nl->name, safe_strerror(errno));
804 /*
805 * In this case we are screwed. There is no good way to recover
806 * zebra at this point.
807 */
808 exit(-1);
809 }
810
811 if (status == 0) {
812 flog_err_sys(EC_LIB_SOCKET, "%s EOF", nl->name);
813 return -1;
814 }
815
816 if (msg.msg_namelen != sizeof(struct sockaddr_nl)) {
817 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
818 "%s sender address length error: length %d", nl->name,
819 msg.msg_namelen);
820 return -1;
821 }
822
823 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
824 zlog_debug("%s: << netlink message dump [recv]", __func__);
eead0bc4
RZ
825#ifdef NETLINK_DEBUG
826 nl_dump(buf, status);
827#else
ae6138bf 828 zlog_hexdump(buf, status);
eead0bc4 829#endif /* NETLINK_DEBUG */
ae6138bf
JU
830 }
831
ae6138bf
JU
832 return status;
833}
834
835/*
836 * netlink_parse_error - parse a netlink error message
837 *
838 * Returns 1 if this message is acknowledgement, 0 if this error should be
839 * ignored, -1 otherwise.
840 */
841static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
d166308b 842 bool is_cmd, bool startup)
ae6138bf
JU
843{
844 struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(h);
845 int errnum = err->error;
846 int msg_type = err->msg.nlmsg_type;
847
848 if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
849 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
850 "%s error: message truncated", nl->name);
851 return -1;
852 }
853
854 /*
855 * Parse the extended information before we actually handle it. At this
856 * point in time we do not do anything other than report the issue.
857 */
858 if (h->nlmsg_flags & NLM_F_ACK_TLVS)
859 netlink_parse_extended_ack(h);
860
861 /* If the error field is zero, then this is an ACK. */
862 if (err->error == 0) {
863 if (IS_ZEBRA_DEBUG_KERNEL) {
864 zlog_debug("%s: %s ACK: type=%s(%u), seq=%u, pid=%u",
865 __func__, nl->name,
866 nl_msg_type_to_str(err->msg.nlmsg_type),
867 err->msg.nlmsg_type, err->msg.nlmsg_seq,
868 err->msg.nlmsg_pid);
869 }
870
871 return 1;
872 }
873
874 /* Deal with errors that occur because of races in link handling. */
d166308b 875 if (is_cmd
ae6138bf
JU
876 && ((msg_type == RTM_DELROUTE
877 && (-errnum == ENODEV || -errnum == ESRCH))
878 || (msg_type == RTM_NEWROUTE
879 && (-errnum == ENETDOWN || -errnum == EEXIST)))) {
880 if (IS_ZEBRA_DEBUG_KERNEL)
881 zlog_debug("%s: error: %s type=%s(%u), seq=%u, pid=%u",
882 nl->name, safe_strerror(-errnum),
883 nl_msg_type_to_str(msg_type), msg_type,
884 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
885 return 0;
886 }
887
888 /*
889 * We see RTM_DELNEIGH when shutting down an interface with an IPv4
890 * link-local. The kernel should have already deleted the neighbor so
891 * do not log these as an error.
892 */
893 if (msg_type == RTM_DELNEIGH
d166308b 894 || (is_cmd && msg_type == RTM_NEWROUTE
ae6138bf
JU
895 && (-errnum == ESRCH || -errnum == ENETUNREACH))) {
896 /*
897 * This is known to happen in some situations, don't log as
898 * error.
899 */
900 if (IS_ZEBRA_DEBUG_KERNEL)
901 zlog_debug("%s error: %s, type=%s(%u), seq=%u, pid=%u",
902 nl->name, safe_strerror(-errnum),
903 nl_msg_type_to_str(msg_type), msg_type,
904 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
905 } else {
906 if ((msg_type != RTM_GETNEXTHOP) || !startup)
907 flog_err(EC_ZEBRA_UNEXPECTED_MESSAGE,
908 "%s error: %s, type=%s(%u), seq=%u, pid=%u",
909 nl->name, safe_strerror(-errnum),
910 nl_msg_type_to_str(msg_type), msg_type,
911 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
912 }
913
914 return -1;
915}
916
936ebf0a
DS
917/*
918 * netlink_parse_info
919 *
920 * Receive message from netlink interface and pass those information
921 * to the given function.
922 *
923 * filter -> Function to call to read the results
924 * nl -> netlink socket information
925 * zns -> The zebra namespace data
926 * count -> How many we should read in, 0 means as much as possible
927 * startup -> Are we reading in under startup conditions? passed to
928 * the filter.
929 */
2414abd3 930int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
7cdb1a84
MS
931 const struct nlsock *nl,
932 const struct zebra_dplane_info *zns,
9bfadae8 933 int count, bool startup)
1fdc9eae 934{
d62a17ae 935 int status;
936 int ret = 0;
937 int error;
938 int read_in = 0;
939
940 while (1) {
9ed7517b 941 char buf[NL_RCV_PKT_BUF_SIZE];
d62a17ae 942 struct sockaddr_nl snl;
943 struct msghdr msg = {.msg_name = (void *)&snl,
ae6138bf 944 .msg_namelen = sizeof(snl)};
d62a17ae 945 struct nlmsghdr *h;
946
947 if (count && read_in >= count)
948 return 0;
949
ae6138bf
JU
950 status = netlink_recv_msg(nl, msg, buf, sizeof(buf));
951 if (status == -1)
d62a17ae 952 return -1;
ae6138bf
JU
953 else if (status == 0)
954 break;
81a2f870 955
d62a17ae 956 read_in++;
957 for (h = (struct nlmsghdr *)buf;
e6a0e0d1 958 (status >= 0 && NLMSG_OK(h, (unsigned int)status));
d62a17ae 959 h = NLMSG_NEXT(h, status)) {
960 /* Finish of reading. */
961 if (h->nlmsg_type == NLMSG_DONE)
962 return ret;
963
964 /* Error handling. */
965 if (h->nlmsg_type == NLMSG_ERROR) {
d166308b
MS
966 int err = netlink_parse_error(
967 nl, h, zns->is_cmd, startup);
968
ae6138bf 969 if (err == 1) {
d62a17ae 970 if (!(h->nlmsg_flags & NLM_F_MULTI))
971 return 0;
972 continue;
ae6138bf
JU
973 } else
974 return err;
d62a17ae 975 }
976
977 /* OK we got netlink message. */
978 if (IS_ZEBRA_DEBUG_KERNEL)
979 zlog_debug(
d166308b
MS
980 "%s: %s type %s(%u), len=%d, seq=%u, pid=%u",
981 __func__, nl->name,
d62a17ae 982 nl_msg_type_to_str(h->nlmsg_type),
983 h->nlmsg_type, h->nlmsg_len,
984 h->nlmsg_seq, h->nlmsg_pid);
985
783827ae
DS
986
987 /*
988 * Ignore messages that maybe sent from
989 * other actors besides the kernel
990 */
991 if (snl.nl_pid != 0) {
43e52561
QY
992 zlog_debug("Ignoring message from pid %u",
993 snl.nl_pid);
d62a17ae 994 continue;
995 }
996
2414abd3 997 error = (*filter)(h, zns->ns_id, startup);
d62a17ae 998 if (error < 0) {
9df414fe
QY
999 zlog_debug("%s filter function error",
1000 nl->name);
d62a17ae 1001 ret = error;
1002 }
1003 }
1004
1005 /* After error care. */
1006 if (msg.msg_flags & MSG_TRUNC) {
e914ccbe 1007 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0 1008 "%s error: message truncated", nl->name);
d62a17ae 1009 continue;
1010 }
1011 if (status) {
e914ccbe 1012 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0
QY
1013 "%s error: data remnant size %d", nl->name,
1014 status);
d62a17ae 1015 return -1;
1016 }
1017 }
1018 return ret;
1fdc9eae 1019}
1020
936ebf0a 1021/*
7cdb1a84 1022 * netlink_talk_info
936ebf0a
DS
1023 *
1024 * sendmsg() to netlink socket then recvmsg().
1025 * Calls netlink_parse_info to parse returned data
1026 *
1027 * filter -> The filter to read final results from kernel
1028 * nlmsghdr -> The data to send to the kernel
8b962e77 1029 * dp_info -> The dataplane and netlink socket information
936ebf0a
DS
1030 * startup -> Are we reading in under startup conditions
1031 * This is passed through eventually to filter.
1032 */
67e3369e
JU
1033static int
1034netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1035 struct nlmsghdr *n, const struct zebra_dplane_info *dp_info,
9bfadae8 1036 bool startup)
1fdc9eae 1037{
7cdb1a84 1038 const struct nlsock *nl;
d62a17ae 1039
7cdb1a84
MS
1040 nl = &(dp_info->nls);
1041 n->nlmsg_seq = nl->seq;
d62a17ae 1042 n->nlmsg_pid = nl->snl.nl_pid;
1043
d62a17ae 1044 if (IS_ZEBRA_DEBUG_KERNEL)
1045 zlog_debug(
1046 "netlink_talk: %s type %s(%u), len=%d seq=%u flags 0x%x",
1047 nl->name, nl_msg_type_to_str(n->nlmsg_type),
1048 n->nlmsg_type, n->nlmsg_len, n->nlmsg_seq,
1049 n->nlmsg_flags);
1050
f8653393 1051 if (netlink_send_msg(nl, n, n->nlmsg_len) == -1)
d62a17ae 1052 return -1;
d62a17ae 1053
d62a17ae 1054 /*
1055 * Get reply from netlink socket.
1056 * The reply should either be an acknowlegement or an error.
1057 */
7cdb1a84
MS
1058 return netlink_parse_info(filter, nl, dp_info, 0, startup);
1059}
1060
1061/*
1062 * Synchronous version of netlink_talk_info. Converts args to suit the
1063 * common version, which is suitable for both sync and async use.
7cdb1a84
MS
1064 */
1065int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1066 struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
9bfadae8 1067 bool startup)
7cdb1a84
MS
1068{
1069 struct zebra_dplane_info dp_info;
1070
1071 /* Increment sequence number before capturing snapshot of ns socket
1072 * info.
1073 */
1074 nl->seq++;
1075
1076 /* Capture info in intermediate info struct */
85a75f1e 1077 zebra_dplane_info_from_zns(&dp_info, zns, (nl == &(zns->netlink_cmd)));
7cdb1a84 1078
5709131c 1079 return netlink_talk_info(filter, n, &dp_info, startup);
1fdc9eae 1080}
1081
289602d7 1082/* Issue request message to kernel via netlink socket. GET messages
1083 * are issued through this interface.
1084 */
fd3f8e52 1085int netlink_request(struct nlsock *nl, void *req)
1fdc9eae 1086{
fd3f8e52 1087 struct nlmsghdr *n = (struct nlmsghdr *)req;
d62a17ae 1088
1089 /* Check netlink socket. */
1090 if (nl->sock < 0) {
450971aa 1091 flog_err_sys(EC_LIB_SOCKET, "%s socket isn't active.",
09c866e3 1092 nl->name);
d62a17ae 1093 return -1;
1094 }
1095
1096 /* Fill common fields for all requests. */
d62a17ae 1097 n->nlmsg_pid = nl->snl.nl_pid;
1098 n->nlmsg_seq = ++nl->seq;
1099
f8653393 1100 if (netlink_send_msg(nl, req, n->nlmsg_len) == -1)
d62a17ae 1101 return -1;
d62a17ae 1102
1103 return 0;
1fdc9eae 1104}
1105
e63c7622
JU
1106static int nl_batch_read_resp(struct nl_batch *bth)
1107{
1108 struct nlmsghdr *h;
1109 struct sockaddr_nl snl;
9d06e121 1110 struct msghdr msg = {};
f6feb48b 1111 int status, seq;
e63c7622 1112 const struct nlsock *nl;
f6feb48b
JU
1113 struct zebra_dplane_ctx *ctx;
1114 bool ignore_msg;
e63c7622
JU
1115
1116 nl = &(bth->zns->nls);
1117
1118 msg.msg_name = (void *)&snl;
1119 msg.msg_namelen = sizeof(snl);
1120
2f9dbd3a
JU
1121 /*
1122 * The responses are not batched, so we need to read and process one
1123 * message at a time.
1124 */
1125 while (true) {
1126 status = netlink_recv_msg(nl, msg, nl_batch_rx_buf,
1127 sizeof(nl_batch_rx_buf));
1128 if (status == -1 || status == 0)
1129 return status;
e63c7622 1130
2f9dbd3a 1131 h = (struct nlmsghdr *)nl_batch_rx_buf;
f6feb48b
JU
1132 ignore_msg = false;
1133 seq = h->nlmsg_seq;
e63c7622 1134 /*
f6feb48b
JU
1135 * Find the corresponding context object. Received responses are
1136 * in the same order as requests we sent, so we can simply
1137 * iterate over the context list and match responses with
1138 * requests at same time.
e63c7622 1139 */
f6feb48b
JU
1140 while (true) {
1141 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1142 if (ctx == NULL)
1143 break;
1144
1145 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1146
1147 /* We have found corresponding context object. */
1148 if (dplane_ctx_get_ns(ctx)->nls.seq == seq)
e63c7622 1149 break;
f6feb48b
JU
1150
1151 /*
1152 * 'update' context objects take two consecutive
1153 * sequence numbers.
1154 */
1155 if (dplane_ctx_is_update(ctx)
1156 && dplane_ctx_get_ns(ctx)->nls.seq + 1 == seq) {
1157 /*
1158 * This is the situation where we get a response
1159 * to a message that should be ignored.
1160 */
1161 ignore_msg = true;
1162 break;
1163 }
e63c7622
JU
1164 }
1165
f6feb48b
JU
1166 if (ignore_msg)
1167 continue;
1168
e63c7622
JU
1169 /*
1170 * We received a message with the sequence number that isn't
1171 * associated with any dplane context object.
1172 */
f6feb48b 1173 if (ctx == NULL) {
4c99d413
MS
1174 if (IS_ZEBRA_DEBUG_KERNEL)
1175 zlog_debug(
1176 "%s: skipping unassociated response, seq number %d NS %u",
1177 __func__, h->nlmsg_seq,
1178 bth->zns->ns_id);
e63c7622
JU
1179 continue;
1180 }
1181
1182 if (h->nlmsg_type == NLMSG_ERROR) {
d166308b
MS
1183 int err = netlink_parse_error(nl, h, bth->zns->is_cmd,
1184 false);
e63c7622
JU
1185
1186 if (err == -1)
f6feb48b
JU
1187 dplane_ctx_set_status(
1188 ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1189
4c99d413
MS
1190 if (IS_ZEBRA_DEBUG_KERNEL)
1191 zlog_debug("%s: netlink error message seq=%d ",
1192 __func__, h->nlmsg_seq);
e63c7622
JU
1193 continue;
1194 }
1195
1196 /*
1197 * If we get here then we did not receive neither the ack nor
1198 * the error and instead received some other message in an
1199 * unexpected way.
1200 */
4c99d413
MS
1201 if (IS_ZEBRA_DEBUG_KERNEL)
1202 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
1203 __func__, h->nlmsg_type,
1204 nl_msg_type_to_str(h->nlmsg_type),
1205 bth->zns->ns_id);
e63c7622
JU
1206 }
1207
1208 return 0;
1209}
1210
1211static void nl_batch_reset(struct nl_batch *bth)
1212{
e63c7622
JU
1213 bth->buf_head = bth->buf;
1214 bth->curlen = 0;
1215 bth->msgcnt = 0;
1216 bth->zns = NULL;
1217
f6feb48b 1218 TAILQ_INIT(&(bth->ctx_list));
e63c7622
JU
1219}
1220
f6feb48b 1221static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
e63c7622 1222{
531c92b8
JU
1223 /*
1224 * If the size of the buffer has changed, free and then allocate a new
1225 * one.
1226 */
1227 size_t bufsize =
1228 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
1229 if (bufsize != nl_batch_tx_bufsize) {
1230 if (nl_batch_tx_buf)
1231 XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
1232
1233 nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
1234 nl_batch_tx_bufsize = bufsize;
1235 }
1236
f6feb48b 1237 bth->buf = nl_batch_tx_buf;
531c92b8
JU
1238 bth->bufsiz = bufsize;
1239 bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
1240 memory_order_relaxed);
e63c7622 1241
f6feb48b 1242 bth->ctx_out_q = ctx_out_q;
e63c7622 1243
f6feb48b
JU
1244 nl_batch_reset(bth);
1245}
1246
1247static void nl_batch_send(struct nl_batch *bth)
1248{
1249 struct zebra_dplane_ctx *ctx;
1250 bool err = false;
e63c7622 1251
f6feb48b
JU
1252 if (bth->curlen != 0 && bth->zns != NULL) {
1253 if (IS_ZEBRA_DEBUG_KERNEL)
1254 zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
1255 __func__, bth->zns->nls.name, bth->curlen,
1256 bth->msgcnt);
e63c7622 1257
f6feb48b
JU
1258 if (netlink_send_msg(&(bth->zns->nls), bth->buf, bth->curlen)
1259 == -1)
e63c7622 1260 err = true;
e63c7622 1261
f6feb48b
JU
1262 if (!err) {
1263 if (nl_batch_read_resp(bth) == -1)
1264 err = true;
1265 }
1266 }
e63c7622 1267
f6feb48b
JU
1268 /* Move remaining contexts to the outbound queue. */
1269 while (true) {
1270 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1271 if (ctx == NULL)
1272 break;
e63c7622 1273
f6feb48b
JU
1274 if (err)
1275 dplane_ctx_set_status(ctx,
1276 ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1277
f6feb48b 1278 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
e63c7622
JU
1279 }
1280
1281 nl_batch_reset(bth);
1282}
1283
e63c7622
JU
1284enum netlink_msg_status netlink_batch_add_msg(
1285 struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
1286 ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
f6feb48b 1287 bool ignore_res)
e63c7622
JU
1288{
1289 int seq;
1290 ssize_t size;
1291 struct nlmsghdr *msgh;
1292
e63c7622
JU
1293 size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
1294
1295 /*
1296 * If there was an error while encoding the message (other than buffer
1297 * overflow) then return an error.
1298 */
1299 if (size < 0)
1300 return FRR_NETLINK_ERROR;
1301
1302 /*
1303 * If the message doesn't fit entirely in the buffer then send the batch
1304 * and retry.
1305 */
1306 if (size == 0) {
1307 nl_batch_send(bth);
1308 size = (*msg_encoder)(ctx, bth->buf_head,
1309 bth->bufsiz - bth->curlen);
1310 /*
1311 * If the message doesn't fit in the empty buffer then just
1312 * return an error.
1313 */
1314 if (size <= 0)
1315 return FRR_NETLINK_ERROR;
1316 }
1317
1318 seq = dplane_ctx_get_ns(ctx)->nls.seq;
f6feb48b 1319 if (ignore_res)
e63c7622
JU
1320 seq++;
1321
1322 msgh = (struct nlmsghdr *)bth->buf_head;
1323 msgh->nlmsg_seq = seq;
1324 msgh->nlmsg_pid = dplane_ctx_get_ns(ctx)->nls.snl.nl_pid;
1325
e63c7622
JU
1326 bth->zns = dplane_ctx_get_ns(ctx);
1327 bth->buf_head = ((char *)bth->buf_head) + size;
1328 bth->curlen += size;
1329 bth->msgcnt++;
1330
e63c7622
JU
1331 return FRR_NETLINK_QUEUED;
1332}
1333
67e3369e
JU
1334static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
1335 struct zebra_dplane_ctx *ctx)
1336{
1337 if (dplane_ctx_is_skip_kernel(ctx))
1338 return FRR_NETLINK_SUCCESS;
1339
1340 switch (dplane_ctx_get_op(ctx)) {
1341
1342 case DPLANE_OP_ROUTE_INSTALL:
1343 case DPLANE_OP_ROUTE_UPDATE:
1344 case DPLANE_OP_ROUTE_DELETE:
1345 return netlink_put_route_update_msg(bth, ctx);
1346
1347 case DPLANE_OP_NH_INSTALL:
1348 case DPLANE_OP_NH_UPDATE:
1349 case DPLANE_OP_NH_DELETE:
1350 return netlink_put_nexthop_update_msg(bth, ctx);
1351
1352 case DPLANE_OP_LSP_INSTALL:
1353 case DPLANE_OP_LSP_UPDATE:
1354 case DPLANE_OP_LSP_DELETE:
1355 return netlink_put_lsp_update_msg(bth, ctx);
1356
1357 case DPLANE_OP_PW_INSTALL:
1358 case DPLANE_OP_PW_UNINSTALL:
1359 return netlink_put_pw_update_msg(bth, ctx);
1360
1361 case DPLANE_OP_ADDR_INSTALL:
1362 case DPLANE_OP_ADDR_UNINSTALL:
1363 return netlink_put_address_update_msg(bth, ctx);
1364
1365 case DPLANE_OP_MAC_INSTALL:
1366 case DPLANE_OP_MAC_DELETE:
1367 return netlink_put_mac_update_msg(bth, ctx);
1368
1369 case DPLANE_OP_NEIGH_INSTALL:
1370 case DPLANE_OP_NEIGH_UPDATE:
1371 case DPLANE_OP_NEIGH_DELETE:
1372 case DPLANE_OP_VTEP_ADD:
1373 case DPLANE_OP_VTEP_DELETE:
d68e74b4 1374 case DPLANE_OP_NEIGH_DISCOVER:
0a27a2fe
PG
1375 case DPLANE_OP_NEIGH_IP_INSTALL:
1376 case DPLANE_OP_NEIGH_IP_DELETE:
e18747a9 1377 case DPLANE_OP_NEIGH_TABLE_UPDATE:
67e3369e
JU
1378 return netlink_put_neigh_update_msg(bth, ctx);
1379
1380 case DPLANE_OP_RULE_ADD:
1381 case DPLANE_OP_RULE_DELETE:
1382 case DPLANE_OP_RULE_UPDATE:
1383 return netlink_put_rule_update_msg(bth, ctx);
1384
1385 case DPLANE_OP_SYS_ROUTE_ADD:
1386 case DPLANE_OP_SYS_ROUTE_DELETE:
1387 case DPLANE_OP_ROUTE_NOTIFY:
1388 case DPLANE_OP_LSP_NOTIFY:
c60522f7 1389 case DPLANE_OP_BR_PORT_UPDATE:
67e3369e
JU
1390 return FRR_NETLINK_SUCCESS;
1391
5162e000
PG
1392 case DPLANE_OP_IPTABLE_ADD:
1393 case DPLANE_OP_IPTABLE_DELETE:
ef524230
PG
1394 case DPLANE_OP_IPSET_ADD:
1395 case DPLANE_OP_IPSET_DELETE:
1396 case DPLANE_OP_IPSET_ENTRY_ADD:
1397 case DPLANE_OP_IPSET_ENTRY_DELETE:
5162e000
PG
1398 return FRR_NETLINK_ERROR;
1399
62b4b7e4
PG
1400 case DPLANE_OP_GRE_SET:
1401 return netlink_put_gre_set_msg(bth, ctx);
1402
9d59df63
MS
1403 case DPLANE_OP_INTF_ADDR_ADD:
1404 case DPLANE_OP_INTF_ADDR_DEL:
67e3369e
JU
1405 case DPLANE_OP_NONE:
1406 return FRR_NETLINK_ERROR;
1407 }
1408
1409 return FRR_NETLINK_ERROR;
1410}
1411
fef24b03
JU
1412void kernel_update_multi(struct dplane_ctx_q *ctx_list)
1413{
67e3369e
JU
1414 struct nl_batch batch;
1415 struct zebra_dplane_ctx *ctx;
1416 struct dplane_ctx_q handled_list;
1417 enum netlink_msg_status res;
1418
67e3369e 1419 TAILQ_INIT(&handled_list);
f6feb48b 1420 nl_batch_init(&batch, &handled_list);
67e3369e
JU
1421
1422 while (true) {
1423 ctx = dplane_ctx_dequeue(ctx_list);
1424 if (ctx == NULL)
1425 break;
1426
f6feb48b
JU
1427 if (batch.zns != NULL
1428 && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
1429 nl_batch_send(&batch);
67e3369e
JU
1430
1431 /*
f6feb48b
JU
1432 * Assume all messages will succeed and then mark only the ones
1433 * that failed.
67e3369e 1434 */
f6feb48b
JU
1435 dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
1436
1437 res = nl_put_msg(&batch, ctx);
1438
1439 dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
1440 if (res == FRR_NETLINK_ERROR)
67e3369e
JU
1441 dplane_ctx_set_status(ctx,
1442 ZEBRA_DPLANE_REQUEST_FAILURE);
1443
f6feb48b
JU
1444 if (batch.curlen > batch.limit)
1445 nl_batch_send(&batch);
67e3369e
JU
1446 }
1447
1448 nl_batch_send(&batch);
1449
1450 TAILQ_INIT(ctx_list);
1451 dplane_ctx_list_append(ctx_list, &handled_list);
fef24b03
JU
1452}
1453
1fdc9eae 1454/* Exported interface function. This function simply calls
1455 netlink_socket (). */
d62a17ae 1456void kernel_init(struct zebra_ns *zns)
1fdc9eae 1457{
67188ca2 1458 uint32_t groups;
5d307d5d
DS
1459#if defined SOL_NETLINK
1460 int one, ret;
1461#endif
d62a17ae 1462
026a316f
DS
1463 /*
1464 * Initialize netlink sockets
1465 *
1466 * If RTMGRP_XXX exists use that, but at some point
1467 * I think the kernel developers realized that
1468 * keeping track of all the different values would
1469 * lead to confusion, so we need to convert the
1470 * RTNLGRP_XXX to a bit position for ourself
1471 */
1472 groups = RTMGRP_LINK |
1473 RTMGRP_IPV4_ROUTE |
1474 RTMGRP_IPV4_IFADDR |
1475 RTMGRP_IPV6_ROUTE |
1476 RTMGRP_IPV6_IFADDR |
1477 RTMGRP_IPV4_MROUTE |
1478 RTMGRP_NEIGH |
67188ca2
QY
1479 ((uint32_t) 1 << (RTNLGRP_IPV4_RULE - 1)) |
1480 ((uint32_t) 1 << (RTNLGRP_IPV6_RULE - 1)) |
1481 ((uint32_t) 1 << (RTNLGRP_NEXTHOP - 1));
d62a17ae 1482
1483 snprintf(zns->netlink.name, sizeof(zns->netlink.name),
1484 "netlink-listen (NS %u)", zns->ns_id);
1485 zns->netlink.sock = -1;
19d5a4fe
DS
1486 if (netlink_socket(&zns->netlink, groups, zns->ns_id) < 0) {
1487 zlog_err("Failure to create %s socket",
1488 zns->netlink.name);
1489 exit(-1);
1490 }
d62a17ae 1491
1492 snprintf(zns->netlink_cmd.name, sizeof(zns->netlink_cmd.name),
1493 "netlink-cmd (NS %u)", zns->ns_id);
1494 zns->netlink_cmd.sock = -1;
19d5a4fe
DS
1495 if (netlink_socket(&zns->netlink_cmd, 0, zns->ns_id) < 0) {
1496 zlog_err("Failure to create %s socket",
1497 zns->netlink_cmd.name);
1498 exit(-1);
1499 }
d62a17ae 1500
80dcc388
MS
1501 /* Outbound socket for dplane programming of the host OS. */
1502 snprintf(zns->netlink_dplane_out.name,
1503 sizeof(zns->netlink_dplane_out.name), "netlink-dp (NS %u)",
1504 zns->ns_id);
1505 zns->netlink_dplane_out.sock = -1;
1506 if (netlink_socket(&zns->netlink_dplane_out, 0, zns->ns_id) < 0) {
62b8bb7a 1507 zlog_err("Failure to create %s socket",
80dcc388
MS
1508 zns->netlink_dplane_out.name);
1509 exit(-1);
1510 }
1511
1512 /* Inbound socket for OS events coming to the dplane. */
1513 snprintf(zns->netlink_dplane_in.name,
1514 sizeof(zns->netlink_dplane_in.name), "netlink-dp-in (NS %u)",
1515 zns->ns_id);
1516 zns->netlink_dplane_in.sock = -1;
1517 if (netlink_socket(&zns->netlink_dplane_in, groups, zns->ns_id) < 0) {
1518 zlog_err("Failure to create %s socket",
1519 zns->netlink_dplane_in.name);
62b8bb7a
MS
1520 exit(-1);
1521 }
1522
5d307d5d
DS
1523 /*
1524 * SOL_NETLINK is not available on all platforms yet
1525 * apparently. It's in bits/socket.h which I am not
1526 * sure that we want to pull into our build system.
1527 */
1528#if defined SOL_NETLINK
1529 /*
1530 * Let's tell the kernel that we want to receive extended
62b8bb7a 1531 * ACKS over our command socket(s)
5d307d5d
DS
1532 */
1533 one = 1;
1534 ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1535 &one, sizeof(one));
1536
1537 if (ret < 0)
62b8bb7a
MS
1538 zlog_notice("Registration for extended cmd ACK failed : %d %s",
1539 errno, safe_strerror(errno));
1540
1541 one = 1;
80dcc388
MS
1542 ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1543 NETLINK_EXT_ACK, &one, sizeof(one));
62b8bb7a
MS
1544
1545 if (ret < 0)
1546 zlog_notice("Registration for extended dp ACK failed : %d %s",
5d307d5d 1547 errno, safe_strerror(errno));
97f85144
JU
1548
1549 /*
1550 * Trim off the payload of the original netlink message in the
1551 * acknowledgment. This option is available since Linux 4.2, so if
1552 * setsockopt fails, ignore the error.
1553 */
1554 one = 1;
80dcc388
MS
1555 ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1556 NETLINK_CAP_ACK, &one, sizeof(one));
9781e6a0
DS
1557 if (ret < 0)
1558 zlog_notice(
1559 "Registration for reduced ACK packet size failed, probably running an early kernel");
5d307d5d
DS
1560#endif
1561
d62a17ae 1562 /* Register kernel socket. */
19d5a4fe 1563 if (fcntl(zns->netlink.sock, F_SETFL, O_NONBLOCK) < 0)
450971aa 1564 flog_err_sys(EC_LIB_SOCKET, "Can't set %s socket flags: %s",
09c866e3 1565 zns->netlink.name, safe_strerror(errno));
8c85e8ea
DS
1566
1567 if (fcntl(zns->netlink_cmd.sock, F_SETFL, O_NONBLOCK) < 0)
1568 zlog_err("Can't set %s socket error: %s(%d)",
1569 zns->netlink_cmd.name, safe_strerror(errno), errno);
19d5a4fe 1570
80dcc388 1571 if (fcntl(zns->netlink_dplane_out.sock, F_SETFL, O_NONBLOCK) < 0)
62b8bb7a 1572 zlog_err("Can't set %s socket error: %s(%d)",
80dcc388
MS
1573 zns->netlink_dplane_out.name, safe_strerror(errno),
1574 errno);
1575
1576 if (fcntl(zns->netlink_dplane_in.sock, F_SETFL, O_NONBLOCK) < 0)
1577 zlog_err("Can't set %s socket error: %s(%d)",
1578 zns->netlink_dplane_in.name, safe_strerror(errno),
1579 errno);
62b8bb7a 1580
19d5a4fe 1581 /* Set receive buffer size if it's set from command line */
97f85144 1582 if (nl_rcvbufsize) {
19d5a4fe 1583 netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
97f85144 1584 netlink_recvbuf(&zns->netlink_cmd, nl_rcvbufsize);
80dcc388
MS
1585 netlink_recvbuf(&zns->netlink_dplane_out, nl_rcvbufsize);
1586 netlink_recvbuf(&zns->netlink_dplane_in, nl_rcvbufsize);
97f85144 1587 }
19d5a4fe 1588
80dcc388
MS
1589 /* Set filter for inbound sockets, to exclude events we've generated
1590 * ourselves.
1591 */
1592 netlink_install_filter(zns->netlink.sock, zns->netlink_cmd.snl.nl_pid,
1593 zns->netlink_dplane_out.snl.nl_pid);
1594
1595 netlink_install_filter(zns->netlink_dplane_in.sock,
62b8bb7a 1596 zns->netlink_cmd.snl.nl_pid,
80dcc388 1597 zns->netlink_dplane_out.snl.nl_pid);
62b8bb7a 1598
19d5a4fe
DS
1599 zns->t_netlink = NULL;
1600
3801e764 1601 thread_add_read(zrouter.master, kernel_read, zns,
19d5a4fe 1602 zns->netlink.sock, &zns->t_netlink);
d62a17ae 1603
1604 rt_netlink_init();
1fdc9eae 1605}
1606
62b8bb7a 1607void kernel_terminate(struct zebra_ns *zns, bool complete)
1fdc9eae 1608{
50478845 1609 thread_cancel(&zns->t_netlink);
d62a17ae 1610
1611 if (zns->netlink.sock >= 0) {
1612 close(zns->netlink.sock);
1613 zns->netlink.sock = -1;
1614 }
1615
1616 if (zns->netlink_cmd.sock >= 0) {
1617 close(zns->netlink_cmd.sock);
1618 zns->netlink_cmd.sock = -1;
1619 }
ddfeb486 1620
80dcc388
MS
1621 if (zns->netlink_dplane_in.sock >= 0) {
1622 close(zns->netlink_dplane_in.sock);
1623 zns->netlink_dplane_in.sock = -1;
1624 }
1625
62b8bb7a
MS
1626 /* During zebra shutdown, we need to leave the dataplane socket
1627 * around until all work is done.
1628 */
1629 if (complete) {
80dcc388
MS
1630 if (zns->netlink_dplane_out.sock >= 0) {
1631 close(zns->netlink_dplane_out.sock);
1632 zns->netlink_dplane_out.sock = -1;
62b8bb7a
MS
1633 }
1634 }
1635}
ddfeb486 1636#endif /* HAVE_NETLINK */