]> git.proxmox.com Git - mirror_frr.git/blame - zebra/kernel_netlink.c
Merge pull request #8036 from qlyoung/disable-mallinfo
[mirror_frr.git] / zebra / kernel_netlink.c
CommitLineData
718e3744 1/* Kernel communication using netlink interface.
2 * Copyright (C) 1999 Kunihiro Ishiguro
3 *
4 * This file is part of GNU Zebra.
5 *
6 * GNU Zebra is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * GNU Zebra is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
896014f4
DL
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
718e3744 19 */
1fdc9eae 20
21#include <zebra.h>
22
ddfeb486
DL
23#ifdef HAVE_NETLINK
24
1fdc9eae 25#include "linklist.h"
26#include "if.h"
27#include "log.h"
28#include "prefix.h"
29#include "connected.h"
30#include "table.h"
31#include "memory.h"
32#include "zebra_memory.h"
33#include "rib.h"
34#include "thread.h"
35#include "privs.h"
36#include "nexthop.h"
37#include "vrf.h"
38#include "mpls.h"
174482ef 39#include "lib_errors.h"
1fdc9eae 40
3801e764
DS
41//#include "zebra/zserv.h"
42#include "zebra/zebra_router.h"
1fdc9eae 43#include "zebra/zebra_ns.h"
44#include "zebra/zebra_vrf.h"
05f7f5db 45#include "zebra/rt.h"
1fdc9eae 46#include "zebra/debug.h"
47#include "zebra/kernel_netlink.h"
48#include "zebra/rt_netlink.h"
49#include "zebra/if_netlink.h"
942bf97b 50#include "zebra/rule_netlink.h"
43e52561 51#include "zebra/zebra_errors.h"
1fdc9eae 52
53#ifndef SO_RCVBUFFORCE
54#define SO_RCVBUFFORCE (33)
55#endif
56
57/* Hack for GNU libc version 2. */
58#ifndef MSG_TRUNC
59#define MSG_TRUNC 0x20
60#endif /* MSG_TRUNC */
61
62#ifndef NLMSG_TAIL
d62a17ae 63#define NLMSG_TAIL(nmsg) \
d7c0a89a
QY
64 ((struct rtattr *)(((uint8_t *)(nmsg)) \
65 + NLMSG_ALIGN((nmsg)->nlmsg_len)))
1fdc9eae 66#endif
67
68#ifndef RTA_TAIL
d62a17ae 69#define RTA_TAIL(rta) \
d7c0a89a 70 ((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
1fdc9eae 71#endif
72
f909c673
DS
73#ifndef RTNL_FAMILY_IP6MR
74#define RTNL_FAMILY_IP6MR 129
75#endif
76
77#ifndef RTPROT_MROUTED
78#define RTPROT_MROUTED 17
79#endif
80
531c92b8 81#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)
e63c7622
JU
82
83/*
84 * We limit the batch's size to a number smaller than the length of the
85 * underlying buffer since the last message that wouldn't fit the batch would go
86 * over the upper boundary and then it would have to be encoded again into a new
87 * buffer. If the difference between the limit and the length of the buffer is
88 * big enough (bigger than the biggest Netlink message) then this situation
89 * won't occur.
90 */
531c92b8
JU
91#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
92
2f9dbd3a 93#define NL_BATCH_RX_BUFSIZE NL_RCV_PKT_BUF_SIZE
e63c7622 94
d62a17ae 95static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
96 {RTM_DELROUTE, "RTM_DELROUTE"},
97 {RTM_GETROUTE, "RTM_GETROUTE"},
98 {RTM_NEWLINK, "RTM_NEWLINK"},
99 {RTM_DELLINK, "RTM_DELLINK"},
100 {RTM_GETLINK, "RTM_GETLINK"},
101 {RTM_NEWADDR, "RTM_NEWADDR"},
102 {RTM_DELADDR, "RTM_DELADDR"},
103 {RTM_GETADDR, "RTM_GETADDR"},
104 {RTM_NEWNEIGH, "RTM_NEWNEIGH"},
105 {RTM_DELNEIGH, "RTM_DELNEIGH"},
106 {RTM_GETNEIGH, "RTM_GETNEIGH"},
942bf97b 107 {RTM_NEWRULE, "RTM_NEWRULE"},
108 {RTM_DELRULE, "RTM_DELRULE"},
109 {RTM_GETRULE, "RTM_GETRULE"},
79580b5a
SW
110 {RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP"},
111 {RTM_DELNEXTHOP, "RTM_DELNEXTHOP"},
112 {RTM_GETNEXTHOP, "RTM_GETNEXTHOP"},
d62a17ae 113 {0}};
1fdc9eae 114
115static const struct message rtproto_str[] = {
d62a17ae 116 {RTPROT_REDIRECT, "redirect"},
117 {RTPROT_KERNEL, "kernel"},
118 {RTPROT_BOOT, "boot"},
119 {RTPROT_STATIC, "static"},
120 {RTPROT_GATED, "GateD"},
121 {RTPROT_RA, "router advertisement"},
122 {RTPROT_MRT, "MRT"},
123 {RTPROT_ZEBRA, "Zebra"},
1fdc9eae 124#ifdef RTPROT_BIRD
d62a17ae 125 {RTPROT_BIRD, "BIRD"},
1fdc9eae 126#endif /* RTPROT_BIRD */
d62a17ae 127 {RTPROT_MROUTED, "mroute"},
128 {RTPROT_BGP, "BGP"},
129 {RTPROT_OSPF, "OSPF"},
130 {RTPROT_ISIS, "IS-IS"},
131 {RTPROT_RIP, "RIP"},
132 {RTPROT_RIPNG, "RIPNG"},
d4d71f11 133 {RTPROT_ZSTATIC, "static"},
d62a17ae 134 {0}};
135
136static const struct message family_str[] = {{AF_INET, "ipv4"},
137 {AF_INET6, "ipv6"},
138 {AF_BRIDGE, "bridge"},
139 {RTNL_FAMILY_IPMR, "ipv4MR"},
140 {RTNL_FAMILY_IP6MR, "ipv6MR"},
141 {0}};
142
8c8f250b
DS
143static const struct message rttype_str[] = {{RTN_UNSPEC, "none"},
144 {RTN_UNICAST, "unicast"},
145 {RTN_LOCAL, "local"},
146 {RTN_BROADCAST, "broadcast"},
147 {RTN_ANYCAST, "anycast"},
d62a17ae 148 {RTN_MULTICAST, "multicast"},
8c8f250b
DS
149 {RTN_BLACKHOLE, "blackhole"},
150 {RTN_UNREACHABLE, "unreachable"},
151 {RTN_PROHIBIT, "prohibited"},
152 {RTN_THROW, "throw"},
153 {RTN_NAT, "nat"},
154 {RTN_XRESOLVE, "resolver"},
d62a17ae 155 {0}};
b339bde7 156
1fdc9eae 157extern struct thread_master *master;
d7c0a89a 158extern uint32_t nl_rcvbufsize;
1fdc9eae 159
160extern struct zebra_privs_t zserv_privs;
161
531c92b8
JU
162DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers")
163
164size_t nl_batch_tx_bufsize;
165char *nl_batch_tx_buf;
166
e63c7622
JU
167char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE];
168
531c92b8
JU
169_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
170_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
171
e63c7622
JU
172struct nl_batch {
173 void *buf;
174 size_t bufsiz;
175 size_t limit;
176
177 void *buf_head;
178 size_t curlen;
179 size_t msgcnt;
180
181 const struct zebra_dplane_info *zns;
e63c7622 182
f6feb48b 183 struct dplane_ctx_q ctx_list;
e63c7622 184
f6feb48b
JU
185 /*
186 * Pointer to the queue of completed contexts outbound back
187 * towards the dataplane module.
188 */
189 struct dplane_ctx_q *ctx_out_q;
e63c7622
JU
190};
191
531c92b8
JU
192int netlink_config_write_helper(struct vty *vty)
193{
194 uint32_t size =
195 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
196 uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
197 memory_order_relaxed);
198
199 if (size != NL_DEFAULT_BATCH_BUFSIZE
200 || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
201 vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
202 threshold);
203
204 return 0;
205}
206
207void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
208{
209 if (!set) {
210 size = NL_DEFAULT_BATCH_BUFSIZE;
211 threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
212 }
213
214 atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
215 atomic_store_explicit(&nl_batch_send_threshold, threshold,
216 memory_order_relaxed);
217}
218
2414abd3 219int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
1fdc9eae 220{
3575d9e8
DS
221 /*
222 * This is an error condition that must be handled during
223 * development.
224 *
225 * The netlink_talk_filter function is used for communication
226 * down the netlink_cmd pipe and we are expecting
227 * an ack being received. So if we get here
228 * then we did not receive the ack and instead
229 * received some other message in an unexpected
230 * way.
231 */
43e52561
QY
232 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", __func__,
233 h->nlmsg_type, nl_msg_type_to_str(h->nlmsg_type), ns_id);
d62a17ae 234 return 0;
1fdc9eae 235}
236
d62a17ae 237static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
1fdc9eae 238{
d7c0a89a 239 uint32_t oldsize;
d62a17ae 240 socklen_t newlen = sizeof(newsize);
241 socklen_t oldlen = sizeof(oldsize);
242 int ret;
243
244 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &oldsize, &oldlen);
245 if (ret < 0) {
450971aa 246 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
247 "Can't get %s receive buffer size: %s", nl->name,
248 safe_strerror(errno));
d62a17ae 249 return -1;
250 }
251
252 /* Try force option (linux >= 2.6.14) and fall back to normal set */
0cf6db21 253 frr_with_privs(&zserv_privs) {
01b9e3fd
DL
254 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUFFORCE,
255 &nl_rcvbufsize,
256 sizeof(nl_rcvbufsize));
257 }
d62a17ae 258 if (ret < 0)
259 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF,
260 &nl_rcvbufsize, sizeof(nl_rcvbufsize));
261 if (ret < 0) {
450971aa 262 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
263 "Can't set %s receive buffer size: %s", nl->name,
264 safe_strerror(errno));
d62a17ae 265 return -1;
266 }
267
268 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &newsize, &newlen);
269 if (ret < 0) {
450971aa 270 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
271 "Can't get %s receive buffer size: %s", nl->name,
272 safe_strerror(errno));
d62a17ae 273 return -1;
274 }
275
276 zlog_info("Setting netlink socket receive buffer size: %u -> %u",
277 oldsize, newsize);
278 return 0;
1fdc9eae 279}
280
281/* Make socket for Linux netlink interface. */
d62a17ae 282static int netlink_socket(struct nlsock *nl, unsigned long groups,
283 ns_id_t ns_id)
1fdc9eae 284{
d62a17ae 285 int ret;
286 struct sockaddr_nl snl;
287 int sock;
288 int namelen;
d62a17ae 289
0cf6db21 290 frr_with_privs(&zserv_privs) {
6bb30c2c
DL
291 sock = ns_socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, ns_id);
292 if (sock < 0) {
293 zlog_err("Can't open %s socket: %s", nl->name,
294 safe_strerror(errno));
295 return -1;
296 }
d62a17ae 297
0d6f7fd6 298 memset(&snl, 0, sizeof(snl));
6bb30c2c
DL
299 snl.nl_family = AF_NETLINK;
300 snl.nl_groups = groups;
d62a17ae 301
6bb30c2c 302 /* Bind the socket to the netlink structure for anything. */
0d6f7fd6 303 ret = bind(sock, (struct sockaddr *)&snl, sizeof(snl));
6bb30c2c 304 }
d62a17ae 305
306 if (ret < 0) {
6bb30c2c
DL
307 zlog_err("Can't bind %s socket to group 0x%x: %s", nl->name,
308 snl.nl_groups, safe_strerror(errno));
d62a17ae 309 close(sock);
310 return -1;
311 }
312
313 /* multiple netlink sockets will have different nl_pid */
0d6f7fd6 314 namelen = sizeof(snl);
d62a17ae 315 ret = getsockname(sock, (struct sockaddr *)&snl, (socklen_t *)&namelen);
0d6f7fd6 316 if (ret < 0 || namelen != sizeof(snl)) {
450971aa 317 flog_err_sys(EC_LIB_SOCKET, "Can't get %s socket name: %s",
09c866e3 318 nl->name, safe_strerror(errno));
d62a17ae 319 close(sock);
320 return -1;
321 }
322
323 nl->snl = snl;
324 nl->sock = sock;
325 return ret;
1fdc9eae 326}
327
2414abd3 328static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
d62a17ae 329 int startup)
1fdc9eae 330{
3575d9e8
DS
331 /*
332 * When we handle new message types here
333 * because we are starting to install them
334 * then lets check the netlink_install_filter
335 * and see if we should add the corresponding
336 * allow through entry there.
337 * Probably not needed to do but please
338 * think about it.
339 */
d62a17ae 340 switch (h->nlmsg_type) {
341 case RTM_NEWROUTE:
2414abd3 342 return netlink_route_change(h, ns_id, startup);
d62a17ae 343 case RTM_DELROUTE:
2414abd3 344 return netlink_route_change(h, ns_id, startup);
d62a17ae 345 case RTM_NEWLINK:
2414abd3 346 return netlink_link_change(h, ns_id, startup);
d62a17ae 347 case RTM_DELLINK:
2414abd3 348 return netlink_link_change(h, ns_id, startup);
d62a17ae 349 case RTM_NEWADDR:
2414abd3 350 return netlink_interface_addr(h, ns_id, startup);
d62a17ae 351 case RTM_DELADDR:
2414abd3 352 return netlink_interface_addr(h, ns_id, startup);
d62a17ae 353 case RTM_NEWNEIGH:
2414abd3 354 return netlink_neigh_change(h, ns_id);
d62a17ae 355 case RTM_DELNEIGH:
2414abd3 356 return netlink_neigh_change(h, ns_id);
951f8bcb
DS
357 case RTM_GETNEIGH:
358 /*
359 * Kernel in some situations when it expects
360 * user space to resolve arp entries, we will
361 * receive this notification. As we don't
362 * need this notification and as that
363 * we don't want to spam the log file with
364 * below messages, just ignore.
365 */
366 if (IS_ZEBRA_DEBUG_KERNEL)
367 zlog_debug("Received RTM_GETNEIGH, ignoring");
368 break;
942bf97b 369 case RTM_NEWRULE:
2414abd3 370 return netlink_rule_change(h, ns_id, startup);
942bf97b 371 case RTM_DELRULE:
2414abd3 372 return netlink_rule_change(h, ns_id, startup);
79580b5a 373 case RTM_NEWNEXTHOP:
d9f5b2f5 374 return netlink_nexthop_change(h, ns_id, startup);
79580b5a 375 case RTM_DELNEXTHOP:
d9f5b2f5 376 return netlink_nexthop_change(h, ns_id, startup);
d62a17ae 377 default:
3575d9e8
DS
378 /*
379 * If we have received this message then
380 * we have made a mistake during development
381 * and we need to write some code to handle
382 * this message type or not ask for
383 * it to be sent up to us
384 */
e914ccbe 385 flog_err(EC_ZEBRA_UNKNOWN_NLMSG,
1c50c1c0
QY
386 "Unknown netlink nlmsg_type %s(%d) vrf %u\n",
387 nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
388 ns_id);
d62a17ae 389 break;
390 }
391 return 0;
1fdc9eae 392}
393
d62a17ae 394static int kernel_read(struct thread *thread)
1fdc9eae 395{
d62a17ae 396 struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
85a75f1e
MS
397 struct zebra_dplane_info dp_info;
398
399 /* Capture key info from ns struct */
400 zebra_dplane_info_from_zns(&dp_info, zns, false);
401
402 netlink_parse_info(netlink_information_fetch, &zns->netlink, &dp_info,
403 5, 0);
d62a17ae 404 zns->t_netlink = NULL;
3801e764 405 thread_add_read(zrouter.master, kernel_read, zns, zns->netlink.sock,
d62a17ae 406 &zns->t_netlink);
1fdc9eae 407
d62a17ae 408 return 0;
1fdc9eae 409}
410
3575d9e8
DS
411/*
412 * Filter out messages from self that occur on listener socket,
62b8bb7a 413 * caused by our actions on the command socket(s)
3575d9e8
DS
414 *
415 * When we add new Netlink message types we probably
416 * do not need to add them here as that we are filtering
417 * on the routes we actually care to receive( which is rarer
418 * then the normal course of operations). We are intentionally
419 * allowing some messages from ourselves through
420 * ( I'm looking at you Interface based netlink messages )
421 * so that we only had to write one way to handle incoming
422 * address add/delete changes.
1fdc9eae 423 */
62b8bb7a 424static void netlink_install_filter(int sock, __u32 pid, __u32 dplane_pid)
1fdc9eae 425{
3575d9e8
DS
426 /*
427 * BPF_JUMP instructions and where you jump to are based upon
428 * 0 as being the next statement. So count from 0. Writing
429 * this down because every time I look at this I have to
430 * re-remember it.
431 */
d62a17ae 432 struct sock_filter filter[] = {
3575d9e8
DS
433 /*
434 * Logic:
62b8bb7a
MS
435 * if (nlmsg_pid == pid ||
436 * nlmsg_pid == dplane_pid) {
3575d9e8
DS
437 * if (the incoming nlmsg_type ==
438 * RTM_NEWADDR | RTM_DELADDR)
439 * keep this message
440 * else
441 * skip this message
442 * } else
443 * keep this netlink message
444 */
445 /*
446 * 0: Load the nlmsg_pid into the BPF register
447 */
d62a17ae 448 BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
449 offsetof(struct nlmsghdr, nlmsg_pid)),
3575d9e8
DS
450 /*
451 * 1: Compare to pid
452 */
62b8bb7a 453 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
3575d9e8 454 /*
62b8bb7a
MS
455 * 2: Compare to dplane pid
456 */
457 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
458 /*
459 * 3: Load the nlmsg_type into BPF register
3575d9e8
DS
460 */
461 BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
462 offsetof(struct nlmsghdr, nlmsg_type)),
463 /*
62b8bb7a 464 * 4: Compare to RTM_NEWADDR
3575d9e8
DS
465 */
466 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
467 /*
62b8bb7a 468 * 5: Compare to RTM_DELADDR
3575d9e8
DS
469 */
470 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
471 /*
62b8bb7a 472 * 6: This is the end state of we want to skip the
3575d9e8
DS
473 * message
474 */
d62a17ae 475 BPF_STMT(BPF_RET | BPF_K, 0),
62b8bb7a 476 /* 7: This is the end state of we want to keep
3575d9e8
DS
477 * the message
478 */
d62a17ae 479 BPF_STMT(BPF_RET | BPF_K, 0xffff),
480 };
481
482 struct sock_fprog prog = {
9d303b37 483 .len = array_size(filter), .filter = filter,
d62a17ae 484 };
485
486 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))
487 < 0)
1c50c1c0 488 flog_err_sys(EC_LIB_SOCKET, "Can't install socket filter: %s\n",
9df414fe 489 safe_strerror(errno));
1fdc9eae 490}
491
4bcdb608
NA
/*
 * Index a run of route attributes by type, ignoring flag bits.
 *
 * tb    -> table indexed by attribute type; must hold max + 1 entries
 * max   -> largest attribute type to record
 * rta   -> first attribute
 * len   -> number of bytes of attribute data
 * flags -> bits cleared from rta_type before it is used as an index
 *
 * Slots that are already non-NULL are left alone, so the first
 * attribute of each type wins.
 */
void netlink_parse_rtattr_flags(struct rtattr **tb, int max,
				struct rtattr *rta, int len,
				unsigned short flags)
{
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		unsigned short type = rta->rta_type & ~flags;

		if (type > max || tb[type])
			continue;

		tb[type] = rta;
	}
}
504
/*
 * Index a run of route attributes by type.
 *
 * tb  -> table indexed by attribute type; must hold max + 1 entries
 * max -> largest attribute type to record
 * rta -> first attribute
 * len -> number of bytes of attribute data
 *
 * Attributes with a type above 'max' are skipped. When a type occurs
 * more than once the last occurrence wins.
 */
void netlink_parse_rtattr(struct rtattr **tb, int max, struct rtattr *rta,
			  int len)
{
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		if (rta->rta_type > max)
			continue;

		tb[rta->rta_type] = rta;
	}
}
514
87da6a60
SW
/**
 * netlink_parse_rtattr_nested() - Parses a nested route attribute
 * @tb:         Pointer to array for storing rtattr in.
 * @max:        Max number to store.
 * @rta:        Pointer to rtattr to look for nested items in.
 *
 * The payload of @rta is handed to netlink_parse_rtattr() as a run of
 * attributes.
 */
void netlink_parse_rtattr_nested(struct rtattr **tb, int max,
				 struct rtattr *rta)
{
	struct rtattr *inner = RTA_DATA(rta);
	int inner_len = RTA_PAYLOAD(rta);

	netlink_parse_rtattr(tb, max, inner, inner_len);
}
526
312a6bee
JU
/*
 * Append one attribute to the end of a netlink message.
 *
 * n      -> message under construction; nlmsg_len is advanced on success
 * maxlen -> total size of the buffer that holds 'n'
 * type   -> attribute type
 * data   -> payload to copy, or NULL (then 'alen' must be 0)
 * alen   -> payload length in bytes
 *
 * Returns true when the attribute was added. Returns false, leaving
 * the message untouched, when the attribute does not fit in the buffer
 * or when its length cannot be represented in the 16-bit rta_len field.
 */
bool nl_attr_put(struct nlmsghdr *n, unsigned int maxlen, int type,
		 const void *data, unsigned int alen)
{
	size_t offset = NLMSG_ALIGN(n->nlmsg_len);
	size_t len;
	size_t padded;
	struct rtattr *rta;

	/*
	 * rta_len is an unsigned short. Without this check a large payload
	 * could pass the buffer test below (batch buffers are bigger than
	 * 64 KiB) and be written with a truncated length.
	 */
	if (alen > UINT16_MAX - RTA_LENGTH(0))
		return false;

	len = RTA_LENGTH(alen);
	padded = RTA_ALIGN(len);

	/* Done in size_t so the sum cannot wrap around 32 bits. */
	if (offset + padded > maxlen)
		return false;

	rta = (struct rtattr *)(((char *)n) + offset);
	rta->rta_type = type;
	rta->rta_len = len;

	if (data)
		memcpy(RTA_DATA(rta), data, alen);
	else
		assert(alen == 0);

	/* Do not leave stale buffer bytes in the alignment padding. */
	if (padded > len)
		memset(((char *)rta) + len, 0, padded - len);

	n->nlmsg_len = offset + padded;

	return true;
}
551
312a6bee
JU
/*
 * Append an attribute whose payload is one 16-bit value, copied as
 * stored in memory (no byte-order conversion). Returns what
 * nl_attr_put() returns.
 */
bool nl_attr_put16(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint16_t data)
{
	const void *payload = &data;

	return nl_attr_put(n, maxlen, type, payload, sizeof(data));
}
557
312a6bee
JU
/*
 * Append an attribute whose payload is one 32-bit value, copied as
 * stored in memory (no byte-order conversion). Returns what
 * nl_attr_put() returns.
 */
bool nl_attr_put32(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint32_t data)
{
	const void *payload = &data;

	return nl_attr_put(n, maxlen, type, payload, sizeof(data));
}
563
/*
 * Open a nested attribute at the end of the message.
 *
 * An empty attribute of the given type is appended and marked with
 * NLA_F_NESTED. The returned pointer is what nl_attr_nest_end() needs
 * to fix up the length once the nested attributes have been added.
 *
 * Returns NULL when the attribute header does not fit in 'maxlen'.
 */
struct rtattr *nl_attr_nest(struct nlmsghdr *n, unsigned int maxlen, int type)
{
	/* The new attribute lands at the current end of the message. */
	struct rtattr *container = NLMSG_TAIL(n);
	bool added = nl_attr_put(n, maxlen, type, NULL, 0);

	if (added) {
		container->rta_type |= NLA_F_NESTED;
		return container;
	}

	return NULL;
}
574
/*
 * Close a nested attribute: set its length to span everything from
 * 'nest' up to the aligned end of the message. Returns the message's
 * current nlmsg_len.
 */
int nl_attr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
{
	const uint8_t *msg_end =
		((const uint8_t *)n) + NLMSG_ALIGN(n->nlmsg_len);

	nest->rta_len = msg_end - (const uint8_t *)nest;

	return n->nlmsg_len;
}
580
/*
 * Append a zeroed struct rtnexthop at the end of the message.
 *
 * n      -> message under construction; nlmsg_len is advanced on success
 * maxlen -> total size of the buffer that holds 'n'
 *
 * Returns a pointer to the new rtnexthop, or NULL (message untouched)
 * when it does not fit. The caller fills in the fields and finishes
 * with nl_attr_rtnh_end().
 */
struct rtnexthop *nl_attr_rtnh(struct nlmsghdr *n, unsigned int maxlen)
{
	const size_t offset = NLMSG_ALIGN(n->nlmsg_len);
	/*
	 * One aligned size is used for both the bounds check and the
	 * length update, so the two can never disagree.
	 */
	const size_t rtnh_space = RTNH_ALIGN(sizeof(struct rtnexthop));
	struct rtnexthop *rtnh;

	if (offset + rtnh_space > maxlen)
		return NULL;

	rtnh = (struct rtnexthop *)(((uint8_t *)n) + offset);
	memset(rtnh, 0, sizeof(struct rtnexthop));
	n->nlmsg_len = offset + rtnh_space;

	return rtnh;
}
595
/*
 * Finish an rtnexthop started with nl_attr_rtnh(): set rtnh_len to
 * span everything from 'rtnh' up to the aligned end of the message.
 */
void nl_attr_rtnh_end(struct nlmsghdr *n, struct rtnexthop *rtnh)
{
	const uint8_t *msg_end =
		((const uint8_t *)n) + NLMSG_ALIGN(n->nlmsg_len);

	rtnh->rtnh_len = msg_end - (const uint8_t *)rtnh;
}
600
d62a17ae 601const char *nl_msg_type_to_str(uint16_t msg_type)
1fdc9eae 602{
d62a17ae 603 return lookup_msg(nlmsg_str, msg_type, "");
1fdc9eae 604}
605
d7c0a89a 606const char *nl_rtproto_to_str(uint8_t rtproto)
1fdc9eae 607{
d62a17ae 608 return lookup_msg(rtproto_str, rtproto, "");
1fdc9eae 609}
b339bde7 610
d7c0a89a 611const char *nl_family_to_str(uint8_t family)
b339bde7 612{
d62a17ae 613 return lookup_msg(family_str, family, "");
b339bde7
DS
614}
615
d7c0a89a 616const char *nl_rttype_to_str(uint8_t rttype)
b339bde7 617{
d62a17ae 618 return lookup_msg(rttype_str, rttype, "");
b339bde7
DS
619}
620
/*
 * Helpers for walking struct nlattr attributes, mirroring the RTA_*
 * macros that exist for struct rtattr.
 */

/* True when 'len' bytes at 'nla' hold a complete attribute. */
#define NLA_OK(nla, len)                                                       \
	((len) >= (int)sizeof(struct nlattr)                                   \
	 && (nla)->nla_len >= sizeof(struct nlattr)                            \
	 && (nla)->nla_len <= (len))

/* Next attribute after 'nla'; reduces 'attrlen' by the aligned size. */
#define NLA_NEXT(nla, attrlen)                                                 \
	((attrlen) -= NLA_ALIGN((nla)->nla_len),                               \
	 (struct nlattr *)(((char *)(nla)) + NLA_ALIGN((nla)->nla_len)))

/* Attribute length (header plus payload) for a payload of 'len' bytes. */
#define NLA_LENGTH(len) (NLA_ALIGN(sizeof(struct nlattr)) + (len))

/* Start of the payload; it is raw data, so the result is a void pointer. */
#define NLA_DATA(nla) ((void *)(((char *)(nla)) + NLA_LENGTH(0)))

/*
 * First extended-ack attribute of an error message: it follows the
 * struct nlmsgerr and 'inner_len' bytes of the echoed request payload.
 */
#define ERR_NLA(err, inner_len)                                                \
	((struct nlattr *)(((char *)(err))                                     \
			   + NLMSG_ALIGN(sizeof(struct nlmsgerr))              \
			   + NLMSG_ALIGN((inner_len))))
5d307d5d
DS
635
/*
 * Index a run of struct nlattr attributes by type.
 *
 * tb  -> table indexed by attribute type; must hold max + 1 entries
 * max -> largest attribute type to record
 * nla -> first attribute
 * len -> number of bytes of attribute data
 *
 * Types above 'max' are skipped; the last attribute of a type wins.
 */
static void netlink_parse_nlattr(struct nlattr **tb, int max,
				 struct nlattr *nla, int len)
{
	for (; NLA_OK(nla, len); nla = NLA_NEXT(nla, len)) {
		if (nla->nla_type > max)
			continue;

		tb[nla->nla_type] = nla;
	}
}
645
646static void netlink_parse_extended_ack(struct nlmsghdr *h)
647{
4cebb2b6
SW
648 struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
649 const struct nlmsgerr *err = (const struct nlmsgerr *)NLMSG_DATA(h);
5d307d5d 650 const struct nlmsghdr *err_nlh = NULL;
4cebb2b6
SW
651 /* Length not including nlmsghdr */
652 uint32_t len = 0;
653 /* Inner error netlink message length */
654 uint32_t inner_len = 0;
5d307d5d
DS
655 const char *msg = NULL;
656 uint32_t off = 0;
657
658 if (!(h->nlmsg_flags & NLM_F_CAPPED))
4cebb2b6
SW
659 inner_len = (uint32_t)NLMSG_PAYLOAD(&err->msg, 0);
660
661 len = (uint32_t)(NLMSG_PAYLOAD(h, sizeof(struct nlmsgerr)) - inner_len);
5d307d5d 662
4cebb2b6
SW
663 netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX, ERR_NLA(err, inner_len),
664 len);
5d307d5d
DS
665
666 if (tb[NLMSGERR_ATTR_MSG])
4cebb2b6 667 msg = (const char *)NLA_DATA(tb[NLMSGERR_ATTR_MSG]);
5d307d5d
DS
668
669 if (tb[NLMSGERR_ATTR_OFFS]) {
4cebb2b6 670 off = *(uint32_t *)NLA_DATA(tb[NLMSGERR_ATTR_OFFS]);
5d307d5d
DS
671
672 if (off > h->nlmsg_len) {
9165c5f5 673 zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS");
5d307d5d
DS
674 } else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
675 /*
676 * Header of failed message
677 * we are not doing anything currently with it
678 * but noticing it for later.
679 */
680 err_nlh = &err->msg;
15569c58 681 zlog_debug("%s: Received %s extended Ack", __func__,
87b5d1b0 682 nl_msg_type_to_str(err_nlh->nlmsg_type));
5d307d5d
DS
683 }
684 }
685
686 if (msg && *msg != '\0') {
687 bool is_err = !!err->error;
688
689 if (is_err)
690 zlog_err("Extended Error: %s", msg);
691 else
e914ccbe 692 flog_warn(EC_ZEBRA_NETLINK_EXTENDED_WARNING,
9df414fe 693 "Extended Warning: %s", msg);
5d307d5d
DS
694 }
695}
696
ae6138bf
JU
697/*
698 * netlink_send_msg - send a netlink message of a certain size.
699 *
700 * Returns -1 on error. Otherwise, it returns the number of bytes sent.
701 */
f8653393
JU
702static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
703 size_t buflen)
ae6138bf 704{
f8653393
JU
705 struct sockaddr_nl snl = {};
706 struct iovec iov = {};
707 struct msghdr msg = {};
708 ssize_t status;
709 int save_errno = 0;
ae6138bf
JU
710
711 iov.iov_base = buf;
712 iov.iov_len = buflen;
f8653393 713 msg.msg_name = &snl;
ae6138bf
JU
714 msg.msg_namelen = sizeof(snl);
715 msg.msg_iov = &iov;
716 msg.msg_iovlen = 1;
717
718 snl.nl_family = AF_NETLINK;
719
720 /* Send message to netlink interface. */
721 frr_with_privs(&zserv_privs) {
722 status = sendmsg(nl->sock, &msg, 0);
723 save_errno = errno;
724 }
725
726 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_SEND) {
727 zlog_debug("%s: >> netlink message dump [sent]", __func__);
eead0bc4
RZ
728#ifdef NETLINK_DEBUG
729 nl_dump(buf, buflen);
730#else
ae6138bf 731 zlog_hexdump(buf, buflen);
eead0bc4 732#endif /* NETLINK_DEBUG */
ae6138bf
JU
733 }
734
f8653393 735 if (status == -1) {
ae6138bf
JU
736 flog_err_sys(EC_LIB_SOCKET, "%s error: %s", __func__,
737 safe_strerror(save_errno));
738 return -1;
739 }
740
741 return status;
742}
743
744/*
745 * netlink_recv_msg - receive a netlink message.
746 *
747 * Returns -1 on error, 0 if read would block or the number of bytes received.
748 */
749static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
750 void *buf, size_t buflen)
751{
752 struct iovec iov;
753 int status;
754
755 iov.iov_base = buf;
756 iov.iov_len = buflen;
757 msg.msg_iov = &iov;
758 msg.msg_iovlen = 1;
759
760 do {
ae6138bf 761 status = recvmsg(nl->sock, &msg, 0);
f8653393 762 } while (status == -1 && errno == EINTR);
ae6138bf 763
f8653393 764 if (status == -1) {
ae6138bf
JU
765 if (errno == EWOULDBLOCK || errno == EAGAIN)
766 return 0;
767 flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s",
768 nl->name, safe_strerror(errno));
769 /*
770 * In this case we are screwed. There is no good way to recover
771 * zebra at this point.
772 */
773 exit(-1);
774 }
775
776 if (status == 0) {
777 flog_err_sys(EC_LIB_SOCKET, "%s EOF", nl->name);
778 return -1;
779 }
780
781 if (msg.msg_namelen != sizeof(struct sockaddr_nl)) {
782 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
783 "%s sender address length error: length %d", nl->name,
784 msg.msg_namelen);
785 return -1;
786 }
787
788 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
789 zlog_debug("%s: << netlink message dump [recv]", __func__);
eead0bc4
RZ
790#ifdef NETLINK_DEBUG
791 nl_dump(buf, status);
792#else
ae6138bf 793 zlog_hexdump(buf, status);
eead0bc4 794#endif /* NETLINK_DEBUG */
ae6138bf
JU
795 }
796
ae6138bf
JU
797 return status;
798}
799
800/*
801 * netlink_parse_error - parse a netlink error message
802 *
803 * Returns 1 if this message is acknowledgement, 0 if this error should be
804 * ignored, -1 otherwise.
805 */
806static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
807 const struct zebra_dplane_info *zns,
808 bool startup)
809{
810 struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(h);
811 int errnum = err->error;
812 int msg_type = err->msg.nlmsg_type;
813
814 if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
815 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
816 "%s error: message truncated", nl->name);
817 return -1;
818 }
819
820 /*
821 * Parse the extended information before we actually handle it. At this
822 * point in time we do not do anything other than report the issue.
823 */
824 if (h->nlmsg_flags & NLM_F_ACK_TLVS)
825 netlink_parse_extended_ack(h);
826
827 /* If the error field is zero, then this is an ACK. */
828 if (err->error == 0) {
829 if (IS_ZEBRA_DEBUG_KERNEL) {
830 zlog_debug("%s: %s ACK: type=%s(%u), seq=%u, pid=%u",
831 __func__, nl->name,
832 nl_msg_type_to_str(err->msg.nlmsg_type),
833 err->msg.nlmsg_type, err->msg.nlmsg_seq,
834 err->msg.nlmsg_pid);
835 }
836
837 return 1;
838 }
839
840 /* Deal with errors that occur because of races in link handling. */
841 if (zns->is_cmd
842 && ((msg_type == RTM_DELROUTE
843 && (-errnum == ENODEV || -errnum == ESRCH))
844 || (msg_type == RTM_NEWROUTE
845 && (-errnum == ENETDOWN || -errnum == EEXIST)))) {
846 if (IS_ZEBRA_DEBUG_KERNEL)
847 zlog_debug("%s: error: %s type=%s(%u), seq=%u, pid=%u",
848 nl->name, safe_strerror(-errnum),
849 nl_msg_type_to_str(msg_type), msg_type,
850 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
851 return 0;
852 }
853
854 /*
855 * We see RTM_DELNEIGH when shutting down an interface with an IPv4
856 * link-local. The kernel should have already deleted the neighbor so
857 * do not log these as an error.
858 */
859 if (msg_type == RTM_DELNEIGH
860 || (zns->is_cmd && msg_type == RTM_NEWROUTE
861 && (-errnum == ESRCH || -errnum == ENETUNREACH))) {
862 /*
863 * This is known to happen in some situations, don't log as
864 * error.
865 */
866 if (IS_ZEBRA_DEBUG_KERNEL)
867 zlog_debug("%s error: %s, type=%s(%u), seq=%u, pid=%u",
868 nl->name, safe_strerror(-errnum),
869 nl_msg_type_to_str(msg_type), msg_type,
870 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
871 } else {
872 if ((msg_type != RTM_GETNEXTHOP) || !startup)
873 flog_err(EC_ZEBRA_UNEXPECTED_MESSAGE,
874 "%s error: %s, type=%s(%u), seq=%u, pid=%u",
875 nl->name, safe_strerror(-errnum),
876 nl_msg_type_to_str(msg_type), msg_type,
877 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
878 }
879
880 return -1;
881}
882
936ebf0a
DS
883/*
884 * netlink_parse_info
885 *
886 * Receive message from netlink interface and pass those information
887 * to the given function.
888 *
889 * filter -> Function to call to read the results
890 * nl -> netlink socket information
891 * zns -> The zebra namespace data
892 * count -> How many we should read in, 0 means as much as possible
893 * startup -> Are we reading in under startup conditions? passed to
894 * the filter.
895 */
2414abd3 896int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
7cdb1a84
MS
897 const struct nlsock *nl,
898 const struct zebra_dplane_info *zns,
85a75f1e 899 int count, int startup)
1fdc9eae 900{
d62a17ae 901 int status;
902 int ret = 0;
903 int error;
904 int read_in = 0;
905
906 while (1) {
9ed7517b 907 char buf[NL_RCV_PKT_BUF_SIZE];
d62a17ae 908 struct sockaddr_nl snl;
909 struct msghdr msg = {.msg_name = (void *)&snl,
ae6138bf 910 .msg_namelen = sizeof(snl)};
d62a17ae 911 struct nlmsghdr *h;
912
913 if (count && read_in >= count)
914 return 0;
915
ae6138bf
JU
916 status = netlink_recv_msg(nl, msg, buf, sizeof(buf));
917 if (status == -1)
d62a17ae 918 return -1;
ae6138bf
JU
919 else if (status == 0)
920 break;
81a2f870 921
d62a17ae 922 read_in++;
923 for (h = (struct nlmsghdr *)buf;
e6a0e0d1 924 (status >= 0 && NLMSG_OK(h, (unsigned int)status));
d62a17ae 925 h = NLMSG_NEXT(h, status)) {
926 /* Finish of reading. */
927 if (h->nlmsg_type == NLMSG_DONE)
928 return ret;
929
930 /* Error handling. */
931 if (h->nlmsg_type == NLMSG_ERROR) {
ae6138bf
JU
932 int err = netlink_parse_error(nl, h, zns,
933 startup);
934 if (err == 1) {
d62a17ae 935 if (!(h->nlmsg_flags & NLM_F_MULTI))
936 return 0;
937 continue;
ae6138bf
JU
938 } else
939 return err;
d62a17ae 940 }
941
942 /* OK we got netlink message. */
943 if (IS_ZEBRA_DEBUG_KERNEL)
944 zlog_debug(
945 "netlink_parse_info: %s type %s(%u), len=%d, seq=%u, pid=%u",
946 nl->name,
947 nl_msg_type_to_str(h->nlmsg_type),
948 h->nlmsg_type, h->nlmsg_len,
949 h->nlmsg_seq, h->nlmsg_pid);
950
783827ae
DS
951
952 /*
953 * Ignore messages that maybe sent from
954 * other actors besides the kernel
955 */
956 if (snl.nl_pid != 0) {
43e52561
QY
957 zlog_debug("Ignoring message from pid %u",
958 snl.nl_pid);
d62a17ae 959 continue;
960 }
961
2414abd3 962 error = (*filter)(h, zns->ns_id, startup);
d62a17ae 963 if (error < 0) {
9df414fe
QY
964 zlog_debug("%s filter function error",
965 nl->name);
d62a17ae 966 ret = error;
967 }
968 }
969
970 /* After error care. */
971 if (msg.msg_flags & MSG_TRUNC) {
e914ccbe 972 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0 973 "%s error: message truncated", nl->name);
d62a17ae 974 continue;
975 }
976 if (status) {
e914ccbe 977 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0
QY
978 "%s error: data remnant size %d", nl->name,
979 status);
d62a17ae 980 return -1;
981 }
982 }
983 return ret;
1fdc9eae 984}
985
936ebf0a 986/*
7cdb1a84 987 * netlink_talk_info
936ebf0a
DS
988 *
989 * sendmsg() to netlink socket then recvmsg().
990 * Calls netlink_parse_info to parse returned data
991 *
992 * filter -> The filter to read final results from kernel
993 * nlmsghdr -> The data to send to the kernel
8b962e77 994 * dp_info -> The dataplane and netlink socket information
936ebf0a
DS
995 * startup -> Are we reading in under startup conditions
996 * This is passed through eventually to filter.
997 */
67e3369e
JU
998static int
999netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1000 struct nlmsghdr *n, const struct zebra_dplane_info *dp_info,
1001 int startup)
1fdc9eae 1002{
7cdb1a84 1003 const struct nlsock *nl;
d62a17ae 1004
7cdb1a84
MS
1005 nl = &(dp_info->nls);
1006 n->nlmsg_seq = nl->seq;
d62a17ae 1007 n->nlmsg_pid = nl->snl.nl_pid;
1008
d62a17ae 1009 if (IS_ZEBRA_DEBUG_KERNEL)
1010 zlog_debug(
1011 "netlink_talk: %s type %s(%u), len=%d seq=%u flags 0x%x",
1012 nl->name, nl_msg_type_to_str(n->nlmsg_type),
1013 n->nlmsg_type, n->nlmsg_len, n->nlmsg_seq,
1014 n->nlmsg_flags);
1015
f8653393 1016 if (netlink_send_msg(nl, n, n->nlmsg_len) == -1)
d62a17ae 1017 return -1;
d62a17ae 1018
d62a17ae 1019 /*
1020 * Get reply from netlink socket.
1021 * The reply should either be an acknowlegement or an error.
1022 */
7cdb1a84
MS
1023 return netlink_parse_info(filter, nl, dp_info, 0, startup);
1024}
1025
1026/*
1027 * Synchronous version of netlink_talk_info. Converts args to suit the
1028 * common version, which is suitable for both sync and async use.
7cdb1a84
MS
1029 */
1030int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1031 struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
1032 int startup)
1033{
1034 struct zebra_dplane_info dp_info;
1035
1036 /* Increment sequence number before capturing snapshot of ns socket
1037 * info.
1038 */
1039 nl->seq++;
1040
1041 /* Capture info in intermediate info struct */
85a75f1e 1042 zebra_dplane_info_from_zns(&dp_info, zns, (nl == &(zns->netlink_cmd)));
7cdb1a84 1043
5709131c 1044 return netlink_talk_info(filter, n, &dp_info, startup);
1fdc9eae 1045}
1046
289602d7 1047/* Issue request message to kernel via netlink socket. GET messages
1048 * are issued through this interface.
1049 */
fd3f8e52 1050int netlink_request(struct nlsock *nl, void *req)
1fdc9eae 1051{
fd3f8e52 1052 struct nlmsghdr *n = (struct nlmsghdr *)req;
d62a17ae 1053
1054 /* Check netlink socket. */
1055 if (nl->sock < 0) {
450971aa 1056 flog_err_sys(EC_LIB_SOCKET, "%s socket isn't active.",
09c866e3 1057 nl->name);
d62a17ae 1058 return -1;
1059 }
1060
1061 /* Fill common fields for all requests. */
d62a17ae 1062 n->nlmsg_pid = nl->snl.nl_pid;
1063 n->nlmsg_seq = ++nl->seq;
1064
f8653393 1065 if (netlink_send_msg(nl, req, n->nlmsg_len) == -1)
d62a17ae 1066 return -1;
d62a17ae 1067
1068 return 0;
1fdc9eae 1069}
1070
e63c7622
JU
1071static int nl_batch_read_resp(struct nl_batch *bth)
1072{
1073 struct nlmsghdr *h;
1074 struct sockaddr_nl snl;
9d06e121 1075 struct msghdr msg = {};
f6feb48b 1076 int status, seq;
e63c7622 1077 const struct nlsock *nl;
f6feb48b
JU
1078 struct zebra_dplane_ctx *ctx;
1079 bool ignore_msg;
e63c7622
JU
1080
1081 nl = &(bth->zns->nls);
1082
1083 msg.msg_name = (void *)&snl;
1084 msg.msg_namelen = sizeof(snl);
1085
2f9dbd3a
JU
1086 /*
1087 * The responses are not batched, so we need to read and process one
1088 * message at a time.
1089 */
1090 while (true) {
1091 status = netlink_recv_msg(nl, msg, nl_batch_rx_buf,
1092 sizeof(nl_batch_rx_buf));
1093 if (status == -1 || status == 0)
1094 return status;
e63c7622 1095
2f9dbd3a 1096 h = (struct nlmsghdr *)nl_batch_rx_buf;
f6feb48b
JU
1097 ignore_msg = false;
1098 seq = h->nlmsg_seq;
e63c7622 1099 /*
f6feb48b
JU
1100 * Find the corresponding context object. Received responses are
1101 * in the same order as requests we sent, so we can simply
1102 * iterate over the context list and match responses with
1103 * requests at same time.
e63c7622 1104 */
f6feb48b
JU
1105 while (true) {
1106 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1107 if (ctx == NULL)
1108 break;
1109
1110 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1111
1112 /* We have found corresponding context object. */
1113 if (dplane_ctx_get_ns(ctx)->nls.seq == seq)
e63c7622 1114 break;
f6feb48b
JU
1115
1116 /*
1117 * 'update' context objects take two consecutive
1118 * sequence numbers.
1119 */
1120 if (dplane_ctx_is_update(ctx)
1121 && dplane_ctx_get_ns(ctx)->nls.seq + 1 == seq) {
1122 /*
1123 * This is the situation where we get a response
1124 * to a message that should be ignored.
1125 */
1126 ignore_msg = true;
1127 break;
1128 }
e63c7622
JU
1129 }
1130
f6feb48b
JU
1131 if (ignore_msg)
1132 continue;
1133
e63c7622
JU
1134 /*
1135 * We received a message with the sequence number that isn't
1136 * associated with any dplane context object.
1137 */
f6feb48b 1138 if (ctx == NULL) {
4c99d413
MS
1139 if (IS_ZEBRA_DEBUG_KERNEL)
1140 zlog_debug(
1141 "%s: skipping unassociated response, seq number %d NS %u",
1142 __func__, h->nlmsg_seq,
1143 bth->zns->ns_id);
e63c7622
JU
1144 continue;
1145 }
1146
1147 if (h->nlmsg_type == NLMSG_ERROR) {
1148 int err = netlink_parse_error(nl, h, bth->zns, 0);
1149
1150 if (err == -1)
f6feb48b
JU
1151 dplane_ctx_set_status(
1152 ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1153
4c99d413
MS
1154 if (IS_ZEBRA_DEBUG_KERNEL)
1155 zlog_debug("%s: netlink error message seq=%d ",
1156 __func__, h->nlmsg_seq);
e63c7622
JU
1157 continue;
1158 }
1159
1160 /*
1161 * If we get here then we did not receive neither the ack nor
1162 * the error and instead received some other message in an
1163 * unexpected way.
1164 */
4c99d413
MS
1165 if (IS_ZEBRA_DEBUG_KERNEL)
1166 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
1167 __func__, h->nlmsg_type,
1168 nl_msg_type_to_str(h->nlmsg_type),
1169 bth->zns->ns_id);
e63c7622
JU
1170 }
1171
1172 return 0;
1173}
1174
1175static void nl_batch_reset(struct nl_batch *bth)
1176{
e63c7622
JU
1177 bth->buf_head = bth->buf;
1178 bth->curlen = 0;
1179 bth->msgcnt = 0;
1180 bth->zns = NULL;
1181
f6feb48b 1182 TAILQ_INIT(&(bth->ctx_list));
e63c7622
JU
1183}
1184
f6feb48b 1185static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
e63c7622 1186{
531c92b8
JU
1187 /*
1188 * If the size of the buffer has changed, free and then allocate a new
1189 * one.
1190 */
1191 size_t bufsize =
1192 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
1193 if (bufsize != nl_batch_tx_bufsize) {
1194 if (nl_batch_tx_buf)
1195 XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
1196
1197 nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
1198 nl_batch_tx_bufsize = bufsize;
1199 }
1200
f6feb48b 1201 bth->buf = nl_batch_tx_buf;
531c92b8
JU
1202 bth->bufsiz = bufsize;
1203 bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
1204 memory_order_relaxed);
e63c7622 1205
f6feb48b 1206 bth->ctx_out_q = ctx_out_q;
e63c7622 1207
f6feb48b
JU
1208 nl_batch_reset(bth);
1209}
1210
1211static void nl_batch_send(struct nl_batch *bth)
1212{
1213 struct zebra_dplane_ctx *ctx;
1214 bool err = false;
e63c7622 1215
f6feb48b
JU
1216 if (bth->curlen != 0 && bth->zns != NULL) {
1217 if (IS_ZEBRA_DEBUG_KERNEL)
1218 zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
1219 __func__, bth->zns->nls.name, bth->curlen,
1220 bth->msgcnt);
e63c7622 1221
f6feb48b
JU
1222 if (netlink_send_msg(&(bth->zns->nls), bth->buf, bth->curlen)
1223 == -1)
e63c7622 1224 err = true;
e63c7622 1225
f6feb48b
JU
1226 if (!err) {
1227 if (nl_batch_read_resp(bth) == -1)
1228 err = true;
1229 }
1230 }
e63c7622 1231
f6feb48b
JU
1232 /* Move remaining contexts to the outbound queue. */
1233 while (true) {
1234 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1235 if (ctx == NULL)
1236 break;
e63c7622 1237
f6feb48b
JU
1238 if (err)
1239 dplane_ctx_set_status(ctx,
1240 ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1241
f6feb48b 1242 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
e63c7622
JU
1243 }
1244
1245 nl_batch_reset(bth);
1246}
1247
e63c7622
JU
1248enum netlink_msg_status netlink_batch_add_msg(
1249 struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
1250 ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
f6feb48b 1251 bool ignore_res)
e63c7622
JU
1252{
1253 int seq;
1254 ssize_t size;
1255 struct nlmsghdr *msgh;
1256
e63c7622
JU
1257 size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
1258
1259 /*
1260 * If there was an error while encoding the message (other than buffer
1261 * overflow) then return an error.
1262 */
1263 if (size < 0)
1264 return FRR_NETLINK_ERROR;
1265
1266 /*
1267 * If the message doesn't fit entirely in the buffer then send the batch
1268 * and retry.
1269 */
1270 if (size == 0) {
1271 nl_batch_send(bth);
1272 size = (*msg_encoder)(ctx, bth->buf_head,
1273 bth->bufsiz - bth->curlen);
1274 /*
1275 * If the message doesn't fit in the empty buffer then just
1276 * return an error.
1277 */
1278 if (size <= 0)
1279 return FRR_NETLINK_ERROR;
1280 }
1281
1282 seq = dplane_ctx_get_ns(ctx)->nls.seq;
f6feb48b 1283 if (ignore_res)
e63c7622
JU
1284 seq++;
1285
1286 msgh = (struct nlmsghdr *)bth->buf_head;
1287 msgh->nlmsg_seq = seq;
1288 msgh->nlmsg_pid = dplane_ctx_get_ns(ctx)->nls.snl.nl_pid;
1289
e63c7622
JU
1290 bth->zns = dplane_ctx_get_ns(ctx);
1291 bth->buf_head = ((char *)bth->buf_head) + size;
1292 bth->curlen += size;
1293 bth->msgcnt++;
1294
e63c7622
JU
1295 return FRR_NETLINK_QUEUED;
1296}
1297
67e3369e
JU
1298static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
1299 struct zebra_dplane_ctx *ctx)
1300{
1301 if (dplane_ctx_is_skip_kernel(ctx))
1302 return FRR_NETLINK_SUCCESS;
1303
1304 switch (dplane_ctx_get_op(ctx)) {
1305
1306 case DPLANE_OP_ROUTE_INSTALL:
1307 case DPLANE_OP_ROUTE_UPDATE:
1308 case DPLANE_OP_ROUTE_DELETE:
1309 return netlink_put_route_update_msg(bth, ctx);
1310
1311 case DPLANE_OP_NH_INSTALL:
1312 case DPLANE_OP_NH_UPDATE:
1313 case DPLANE_OP_NH_DELETE:
1314 return netlink_put_nexthop_update_msg(bth, ctx);
1315
1316 case DPLANE_OP_LSP_INSTALL:
1317 case DPLANE_OP_LSP_UPDATE:
1318 case DPLANE_OP_LSP_DELETE:
1319 return netlink_put_lsp_update_msg(bth, ctx);
1320
1321 case DPLANE_OP_PW_INSTALL:
1322 case DPLANE_OP_PW_UNINSTALL:
1323 return netlink_put_pw_update_msg(bth, ctx);
1324
1325 case DPLANE_OP_ADDR_INSTALL:
1326 case DPLANE_OP_ADDR_UNINSTALL:
1327 return netlink_put_address_update_msg(bth, ctx);
1328
1329 case DPLANE_OP_MAC_INSTALL:
1330 case DPLANE_OP_MAC_DELETE:
1331 return netlink_put_mac_update_msg(bth, ctx);
1332
1333 case DPLANE_OP_NEIGH_INSTALL:
1334 case DPLANE_OP_NEIGH_UPDATE:
1335 case DPLANE_OP_NEIGH_DELETE:
1336 case DPLANE_OP_VTEP_ADD:
1337 case DPLANE_OP_VTEP_DELETE:
d68e74b4 1338 case DPLANE_OP_NEIGH_DISCOVER:
67e3369e
JU
1339 return netlink_put_neigh_update_msg(bth, ctx);
1340
1341 case DPLANE_OP_RULE_ADD:
1342 case DPLANE_OP_RULE_DELETE:
1343 case DPLANE_OP_RULE_UPDATE:
1344 return netlink_put_rule_update_msg(bth, ctx);
1345
1346 case DPLANE_OP_SYS_ROUTE_ADD:
1347 case DPLANE_OP_SYS_ROUTE_DELETE:
1348 case DPLANE_OP_ROUTE_NOTIFY:
1349 case DPLANE_OP_LSP_NOTIFY:
c60522f7 1350 case DPLANE_OP_BR_PORT_UPDATE:
67e3369e
JU
1351 return FRR_NETLINK_SUCCESS;
1352
1353 case DPLANE_OP_NONE:
1354 return FRR_NETLINK_ERROR;
1355 }
1356
1357 return FRR_NETLINK_ERROR;
1358}
1359
fef24b03
JU
1360void kernel_update_multi(struct dplane_ctx_q *ctx_list)
1361{
67e3369e
JU
1362 struct nl_batch batch;
1363 struct zebra_dplane_ctx *ctx;
1364 struct dplane_ctx_q handled_list;
1365 enum netlink_msg_status res;
1366
67e3369e 1367 TAILQ_INIT(&handled_list);
f6feb48b 1368 nl_batch_init(&batch, &handled_list);
67e3369e
JU
1369
1370 while (true) {
1371 ctx = dplane_ctx_dequeue(ctx_list);
1372 if (ctx == NULL)
1373 break;
1374
f6feb48b
JU
1375 if (batch.zns != NULL
1376 && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
1377 nl_batch_send(&batch);
67e3369e
JU
1378
1379 /*
f6feb48b
JU
1380 * Assume all messages will succeed and then mark only the ones
1381 * that failed.
67e3369e 1382 */
f6feb48b
JU
1383 dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
1384
1385 res = nl_put_msg(&batch, ctx);
1386
1387 dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
1388 if (res == FRR_NETLINK_ERROR)
67e3369e
JU
1389 dplane_ctx_set_status(ctx,
1390 ZEBRA_DPLANE_REQUEST_FAILURE);
1391
f6feb48b
JU
1392 if (batch.curlen > batch.limit)
1393 nl_batch_send(&batch);
67e3369e
JU
1394 }
1395
1396 nl_batch_send(&batch);
1397
1398 TAILQ_INIT(ctx_list);
1399 dplane_ctx_list_append(ctx_list, &handled_list);
fef24b03
JU
1400}
1401
1fdc9eae 1402/* Exported interface function. This function simply calls
1403 netlink_socket (). */
d62a17ae 1404void kernel_init(struct zebra_ns *zns)
1fdc9eae 1405{
67188ca2 1406 uint32_t groups;
5d307d5d
DS
1407#if defined SOL_NETLINK
1408 int one, ret;
1409#endif
d62a17ae 1410
026a316f
DS
1411 /*
1412 * Initialize netlink sockets
1413 *
1414 * If RTMGRP_XXX exists use that, but at some point
1415 * I think the kernel developers realized that
1416 * keeping track of all the different values would
1417 * lead to confusion, so we need to convert the
1418 * RTNLGRP_XXX to a bit position for ourself
1419 */
1420 groups = RTMGRP_LINK |
1421 RTMGRP_IPV4_ROUTE |
1422 RTMGRP_IPV4_IFADDR |
1423 RTMGRP_IPV6_ROUTE |
1424 RTMGRP_IPV6_IFADDR |
1425 RTMGRP_IPV4_MROUTE |
1426 RTMGRP_NEIGH |
67188ca2
QY
1427 ((uint32_t) 1 << (RTNLGRP_IPV4_RULE - 1)) |
1428 ((uint32_t) 1 << (RTNLGRP_IPV6_RULE - 1)) |
1429 ((uint32_t) 1 << (RTNLGRP_NEXTHOP - 1));
d62a17ae 1430
1431 snprintf(zns->netlink.name, sizeof(zns->netlink.name),
1432 "netlink-listen (NS %u)", zns->ns_id);
1433 zns->netlink.sock = -1;
19d5a4fe
DS
1434 if (netlink_socket(&zns->netlink, groups, zns->ns_id) < 0) {
1435 zlog_err("Failure to create %s socket",
1436 zns->netlink.name);
1437 exit(-1);
1438 }
d62a17ae 1439
1440 snprintf(zns->netlink_cmd.name, sizeof(zns->netlink_cmd.name),
1441 "netlink-cmd (NS %u)", zns->ns_id);
1442 zns->netlink_cmd.sock = -1;
19d5a4fe
DS
1443 if (netlink_socket(&zns->netlink_cmd, 0, zns->ns_id) < 0) {
1444 zlog_err("Failure to create %s socket",
1445 zns->netlink_cmd.name);
1446 exit(-1);
1447 }
d62a17ae 1448
62b8bb7a
MS
1449 snprintf(zns->netlink_dplane.name, sizeof(zns->netlink_dplane.name),
1450 "netlink-dp (NS %u)", zns->ns_id);
1451 zns->netlink_dplane.sock = -1;
1452 if (netlink_socket(&zns->netlink_dplane, 0, zns->ns_id) < 0) {
1453 zlog_err("Failure to create %s socket",
1454 zns->netlink_dplane.name);
1455 exit(-1);
1456 }
1457
5d307d5d
DS
1458 /*
1459 * SOL_NETLINK is not available on all platforms yet
1460 * apparently. It's in bits/socket.h which I am not
1461 * sure that we want to pull into our build system.
1462 */
1463#if defined SOL_NETLINK
1464 /*
1465 * Let's tell the kernel that we want to receive extended
62b8bb7a 1466 * ACKS over our command socket(s)
5d307d5d
DS
1467 */
1468 one = 1;
1469 ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1470 &one, sizeof(one));
1471
1472 if (ret < 0)
62b8bb7a
MS
1473 zlog_notice("Registration for extended cmd ACK failed : %d %s",
1474 errno, safe_strerror(errno));
1475
1476 one = 1;
1477 ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1478 &one, sizeof(one));
1479
1480 if (ret < 0)
1481 zlog_notice("Registration for extended dp ACK failed : %d %s",
5d307d5d 1482 errno, safe_strerror(errno));
97f85144
JU
1483
1484 /*
1485 * Trim off the payload of the original netlink message in the
1486 * acknowledgment. This option is available since Linux 4.2, so if
1487 * setsockopt fails, ignore the error.
1488 */
1489 one = 1;
1490 ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_CAP_ACK,
1491 &one, sizeof(one));
9781e6a0
DS
1492 if (ret < 0)
1493 zlog_notice(
1494 "Registration for reduced ACK packet size failed, probably running an early kernel");
5d307d5d
DS
1495#endif
1496
d62a17ae 1497 /* Register kernel socket. */
19d5a4fe 1498 if (fcntl(zns->netlink.sock, F_SETFL, O_NONBLOCK) < 0)
450971aa 1499 flog_err_sys(EC_LIB_SOCKET, "Can't set %s socket flags: %s",
09c866e3 1500 zns->netlink.name, safe_strerror(errno));
8c85e8ea
DS
1501
1502 if (fcntl(zns->netlink_cmd.sock, F_SETFL, O_NONBLOCK) < 0)
1503 zlog_err("Can't set %s socket error: %s(%d)",
1504 zns->netlink_cmd.name, safe_strerror(errno), errno);
19d5a4fe 1505
62b8bb7a
MS
1506 if (fcntl(zns->netlink_dplane.sock, F_SETFL, O_NONBLOCK) < 0)
1507 zlog_err("Can't set %s socket error: %s(%d)",
1508 zns->netlink_dplane.name, safe_strerror(errno), errno);
1509
19d5a4fe 1510 /* Set receive buffer size if it's set from command line */
97f85144 1511 if (nl_rcvbufsize) {
19d5a4fe 1512 netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
97f85144
JU
1513 netlink_recvbuf(&zns->netlink_cmd, nl_rcvbufsize);
1514 netlink_recvbuf(&zns->netlink_dplane, nl_rcvbufsize);
1515 }
19d5a4fe
DS
1516
1517 netlink_install_filter(zns->netlink.sock,
62b8bb7a
MS
1518 zns->netlink_cmd.snl.nl_pid,
1519 zns->netlink_dplane.snl.nl_pid);
1520
19d5a4fe
DS
1521 zns->t_netlink = NULL;
1522
3801e764 1523 thread_add_read(zrouter.master, kernel_read, zns,
19d5a4fe 1524 zns->netlink.sock, &zns->t_netlink);
d62a17ae 1525
1526 rt_netlink_init();
1fdc9eae 1527}
1528
62b8bb7a 1529void kernel_terminate(struct zebra_ns *zns, bool complete)
1fdc9eae 1530{
50478845 1531 thread_cancel(&zns->t_netlink);
d62a17ae 1532
1533 if (zns->netlink.sock >= 0) {
1534 close(zns->netlink.sock);
1535 zns->netlink.sock = -1;
1536 }
1537
1538 if (zns->netlink_cmd.sock >= 0) {
1539 close(zns->netlink_cmd.sock);
1540 zns->netlink_cmd.sock = -1;
1541 }
ddfeb486 1542
62b8bb7a
MS
1543 /* During zebra shutdown, we need to leave the dataplane socket
1544 * around until all work is done.
1545 */
1546 if (complete) {
1547 if (zns->netlink_dplane.sock >= 0) {
1548 close(zns->netlink_dplane.sock);
1549 zns->netlink_dplane.sock = -1;
1550 }
1551 }
1552}
ddfeb486 1553#endif /* HAVE_NETLINK */