]> git.proxmox.com Git - mirror_frr.git/blame - zebra/kernel_netlink.c
zebra: add netlink message batching infrastructure
[mirror_frr.git] / zebra / kernel_netlink.c
CommitLineData
718e3744 1/* Kernel communication using netlink interface.
2 * Copyright (C) 1999 Kunihiro Ishiguro
3 *
4 * This file is part of GNU Zebra.
5 *
6 * GNU Zebra is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * GNU Zebra is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
896014f4
DL
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
718e3744 19 */
1fdc9eae 20
21#include <zebra.h>
22
acfa8927 23#if defined(HANDLE_NETLINK_FUZZING)
81a2f870
SW
24#include <stdio.h>
25#include <string.h>
96b43ab3 26#include "libfrr.h"
acfa8927 27#endif /* HANDLE_NETLINK_FUZZING */
81a2f870 28
ddfeb486
DL
29#ifdef HAVE_NETLINK
30
1fdc9eae 31#include "linklist.h"
32#include "if.h"
33#include "log.h"
34#include "prefix.h"
35#include "connected.h"
36#include "table.h"
37#include "memory.h"
38#include "zebra_memory.h"
39#include "rib.h"
40#include "thread.h"
41#include "privs.h"
42#include "nexthop.h"
43#include "vrf.h"
44#include "mpls.h"
174482ef 45#include "lib_errors.h"
1fdc9eae 46
3801e764
DS
47//#include "zebra/zserv.h"
48#include "zebra/zebra_router.h"
1fdc9eae 49#include "zebra/zebra_ns.h"
50#include "zebra/zebra_vrf.h"
05f7f5db 51#include "zebra/rt.h"
1fdc9eae 52#include "zebra/debug.h"
53#include "zebra/kernel_netlink.h"
54#include "zebra/rt_netlink.h"
55#include "zebra/if_netlink.h"
942bf97b 56#include "zebra/rule_netlink.h"
43e52561 57#include "zebra/zebra_errors.h"
1fdc9eae 58
59#ifndef SO_RCVBUFFORCE
60#define SO_RCVBUFFORCE (33)
61#endif
62
63/* Hack for GNU libc version 2. */
64#ifndef MSG_TRUNC
65#define MSG_TRUNC 0x20
66#endif /* MSG_TRUNC */
67
68#ifndef NLMSG_TAIL
d62a17ae 69#define NLMSG_TAIL(nmsg) \
d7c0a89a
QY
70 ((struct rtattr *)(((uint8_t *)(nmsg)) \
71 + NLMSG_ALIGN((nmsg)->nlmsg_len)))
1fdc9eae 72#endif
73
74#ifndef RTA_TAIL
d62a17ae 75#define RTA_TAIL(rta) \
d7c0a89a 76 ((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
1fdc9eae 77#endif
78
f909c673
DS
79#ifndef RTNL_FAMILY_IP6MR
80#define RTNL_FAMILY_IP6MR 129
81#endif
82
83#ifndef RTPROT_MROUTED
84#define RTPROT_MROUTED 17
85#endif
86
e63c7622
JU
87#define NL_BATCH_TX_BUFSIZE (16 * NL_PKT_BUF_SIZE)
88
89/*
90 * For every request sent to the kernel that has failed we get an error message,
91 * which contains a standard netlink message header and the payload consisting
92 * of an error code and the original netlink message. So the receiving buffer
93 * must be at least as big as the transmitting buffer increased by some space
94 * for headers.
95 */
96#define NL_BATCH_RX_BUFSIZE (NL_BATCH_TX_BUFSIZE + NL_PKT_BUF_SIZE)
97
98/*
99 * We limit the batch's size to a number smaller than the length of the
100 * underlying buffer since the last message that wouldn't fit the batch would go
101 * over the upper boundary and then it would have to be encoded again into a new
102 * buffer. If the difference between the limit and the length of the buffer is
103 * big enough (bigger than the biggest Netlink message) then this situation
104 * won't occur.
105 */
106#define NL_BATCH_SEND_THRESHOLD (NL_BATCH_TX_BUFSIZE - NL_PKT_BUF_SIZE)
107
d62a17ae 108static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
109 {RTM_DELROUTE, "RTM_DELROUTE"},
110 {RTM_GETROUTE, "RTM_GETROUTE"},
111 {RTM_NEWLINK, "RTM_NEWLINK"},
112 {RTM_DELLINK, "RTM_DELLINK"},
113 {RTM_GETLINK, "RTM_GETLINK"},
114 {RTM_NEWADDR, "RTM_NEWADDR"},
115 {RTM_DELADDR, "RTM_DELADDR"},
116 {RTM_GETADDR, "RTM_GETADDR"},
117 {RTM_NEWNEIGH, "RTM_NEWNEIGH"},
118 {RTM_DELNEIGH, "RTM_DELNEIGH"},
119 {RTM_GETNEIGH, "RTM_GETNEIGH"},
942bf97b 120 {RTM_NEWRULE, "RTM_NEWRULE"},
121 {RTM_DELRULE, "RTM_DELRULE"},
122 {RTM_GETRULE, "RTM_GETRULE"},
79580b5a
SW
123 {RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP"},
124 {RTM_DELNEXTHOP, "RTM_DELNEXTHOP"},
125 {RTM_GETNEXTHOP, "RTM_GETNEXTHOP"},
d62a17ae 126 {0}};
1fdc9eae 127
128static const struct message rtproto_str[] = {
d62a17ae 129 {RTPROT_REDIRECT, "redirect"},
130 {RTPROT_KERNEL, "kernel"},
131 {RTPROT_BOOT, "boot"},
132 {RTPROT_STATIC, "static"},
133 {RTPROT_GATED, "GateD"},
134 {RTPROT_RA, "router advertisement"},
135 {RTPROT_MRT, "MRT"},
136 {RTPROT_ZEBRA, "Zebra"},
1fdc9eae 137#ifdef RTPROT_BIRD
d62a17ae 138 {RTPROT_BIRD, "BIRD"},
1fdc9eae 139#endif /* RTPROT_BIRD */
d62a17ae 140 {RTPROT_MROUTED, "mroute"},
141 {RTPROT_BGP, "BGP"},
142 {RTPROT_OSPF, "OSPF"},
143 {RTPROT_ISIS, "IS-IS"},
144 {RTPROT_RIP, "RIP"},
145 {RTPROT_RIPNG, "RIPNG"},
d4d71f11 146 {RTPROT_ZSTATIC, "static"},
d62a17ae 147 {0}};
148
149static const struct message family_str[] = {{AF_INET, "ipv4"},
150 {AF_INET6, "ipv6"},
151 {AF_BRIDGE, "bridge"},
152 {RTNL_FAMILY_IPMR, "ipv4MR"},
153 {RTNL_FAMILY_IP6MR, "ipv6MR"},
154 {0}};
155
8c8f250b
DS
156static const struct message rttype_str[] = {{RTN_UNSPEC, "none"},
157 {RTN_UNICAST, "unicast"},
158 {RTN_LOCAL, "local"},
159 {RTN_BROADCAST, "broadcast"},
160 {RTN_ANYCAST, "anycast"},
d62a17ae 161 {RTN_MULTICAST, "multicast"},
8c8f250b
DS
162 {RTN_BLACKHOLE, "blackhole"},
163 {RTN_UNREACHABLE, "unreachable"},
164 {RTN_PROHIBIT, "prohibited"},
165 {RTN_THROW, "throw"},
166 {RTN_NAT, "nat"},
167 {RTN_XRESOLVE, "resolver"},
d62a17ae 168 {0}};
b339bde7 169
1fdc9eae 170extern struct thread_master *master;
d7c0a89a 171extern uint32_t nl_rcvbufsize;
1fdc9eae 172
173extern struct zebra_privs_t zserv_privs;
174
e63c7622
JU
175char nl_batch_tx_buf[NL_BATCH_TX_BUFSIZE];
176char nl_batch_rx_buf[NL_BATCH_RX_BUFSIZE];
177
178DEFINE_MTYPE_STATIC(ZEBRA, NL_BATCH_ITEM, "Netlink batch items")
179
180PREDECL_LIST(nl_batch_list)
181
182struct nl_batch {
183 void *buf;
184 size_t bufsiz;
185 size_t limit;
186
187 void *buf_head;
188 size_t curlen;
189 size_t msgcnt;
190
191 const struct zebra_dplane_info *zns;
192 struct nl_batch_list_head items;
193};
194
195struct nl_msg_batch_item {
196 int seq;
197 struct zebra_dplane_ctx *ctx;
198 bool ignore_res;
199 bool failure;
200
201 struct nl_batch_list_item item;
202};
203
204DECLARE_LIST(nl_batch_list, struct nl_msg_batch_item, item)
7cdb1a84 205
2414abd3 206int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
1fdc9eae 207{
3575d9e8
DS
208 /*
209 * This is an error condition that must be handled during
210 * development.
211 *
212 * The netlink_talk_filter function is used for communication
213 * down the netlink_cmd pipe and we are expecting
214 * an ack being received. So if we get here
215 * then we did not receive the ack and instead
216 * received some other message in an unexpected
217 * way.
218 */
43e52561
QY
219 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", __func__,
220 h->nlmsg_type, nl_msg_type_to_str(h->nlmsg_type), ns_id);
d62a17ae 221 return 0;
1fdc9eae 222}
223
d62a17ae 224static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
1fdc9eae 225{
d7c0a89a 226 uint32_t oldsize;
d62a17ae 227 socklen_t newlen = sizeof(newsize);
228 socklen_t oldlen = sizeof(oldsize);
229 int ret;
230
231 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &oldsize, &oldlen);
232 if (ret < 0) {
450971aa 233 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
234 "Can't get %s receive buffer size: %s", nl->name,
235 safe_strerror(errno));
d62a17ae 236 return -1;
237 }
238
239 /* Try force option (linux >= 2.6.14) and fall back to normal set */
0cf6db21 240 frr_with_privs(&zserv_privs) {
01b9e3fd
DL
241 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUFFORCE,
242 &nl_rcvbufsize,
243 sizeof(nl_rcvbufsize));
244 }
d62a17ae 245 if (ret < 0)
246 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF,
247 &nl_rcvbufsize, sizeof(nl_rcvbufsize));
248 if (ret < 0) {
450971aa 249 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
250 "Can't set %s receive buffer size: %s", nl->name,
251 safe_strerror(errno));
d62a17ae 252 return -1;
253 }
254
255 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &newsize, &newlen);
256 if (ret < 0) {
450971aa 257 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
258 "Can't get %s receive buffer size: %s", nl->name,
259 safe_strerror(errno));
d62a17ae 260 return -1;
261 }
262
263 zlog_info("Setting netlink socket receive buffer size: %u -> %u",
264 oldsize, newsize);
265 return 0;
1fdc9eae 266}
267
268/* Make socket for Linux netlink interface. */
d62a17ae 269static int netlink_socket(struct nlsock *nl, unsigned long groups,
270 ns_id_t ns_id)
1fdc9eae 271{
d62a17ae 272 int ret;
273 struct sockaddr_nl snl;
274 int sock;
275 int namelen;
d62a17ae 276
0cf6db21 277 frr_with_privs(&zserv_privs) {
6bb30c2c
DL
278 sock = ns_socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, ns_id);
279 if (sock < 0) {
280 zlog_err("Can't open %s socket: %s", nl->name,
281 safe_strerror(errno));
282 return -1;
283 }
d62a17ae 284
0d6f7fd6 285 memset(&snl, 0, sizeof(snl));
6bb30c2c
DL
286 snl.nl_family = AF_NETLINK;
287 snl.nl_groups = groups;
d62a17ae 288
6bb30c2c 289 /* Bind the socket to the netlink structure for anything. */
0d6f7fd6 290 ret = bind(sock, (struct sockaddr *)&snl, sizeof(snl));
6bb30c2c 291 }
d62a17ae 292
293 if (ret < 0) {
6bb30c2c
DL
294 zlog_err("Can't bind %s socket to group 0x%x: %s", nl->name,
295 snl.nl_groups, safe_strerror(errno));
d62a17ae 296 close(sock);
297 return -1;
298 }
299
300 /* multiple netlink sockets will have different nl_pid */
0d6f7fd6 301 namelen = sizeof(snl);
d62a17ae 302 ret = getsockname(sock, (struct sockaddr *)&snl, (socklen_t *)&namelen);
0d6f7fd6 303 if (ret < 0 || namelen != sizeof(snl)) {
450971aa 304 flog_err_sys(EC_LIB_SOCKET, "Can't get %s socket name: %s",
09c866e3 305 nl->name, safe_strerror(errno));
d62a17ae 306 close(sock);
307 return -1;
308 }
309
310 nl->snl = snl;
311 nl->sock = sock;
312 return ret;
1fdc9eae 313}
314
2414abd3 315static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
d62a17ae 316 int startup)
1fdc9eae 317{
3575d9e8
DS
318 /*
319 * When we handle new message types here
320 * because we are starting to install them
321 * then lets check the netlink_install_filter
322 * and see if we should add the corresponding
323 * allow through entry there.
324 * Probably not needed to do but please
325 * think about it.
326 */
d62a17ae 327 switch (h->nlmsg_type) {
328 case RTM_NEWROUTE:
2414abd3 329 return netlink_route_change(h, ns_id, startup);
d62a17ae 330 case RTM_DELROUTE:
2414abd3 331 return netlink_route_change(h, ns_id, startup);
d62a17ae 332 case RTM_NEWLINK:
2414abd3 333 return netlink_link_change(h, ns_id, startup);
d62a17ae 334 case RTM_DELLINK:
2414abd3 335 return netlink_link_change(h, ns_id, startup);
d62a17ae 336 case RTM_NEWADDR:
2414abd3 337 return netlink_interface_addr(h, ns_id, startup);
d62a17ae 338 case RTM_DELADDR:
2414abd3 339 return netlink_interface_addr(h, ns_id, startup);
d62a17ae 340 case RTM_NEWNEIGH:
2414abd3 341 return netlink_neigh_change(h, ns_id);
d62a17ae 342 case RTM_DELNEIGH:
2414abd3 343 return netlink_neigh_change(h, ns_id);
951f8bcb
DS
344 case RTM_GETNEIGH:
345 /*
346 * Kernel in some situations when it expects
347 * user space to resolve arp entries, we will
348 * receive this notification. As we don't
349 * need this notification and as that
350 * we don't want to spam the log file with
351 * below messages, just ignore.
352 */
353 if (IS_ZEBRA_DEBUG_KERNEL)
354 zlog_debug("Received RTM_GETNEIGH, ignoring");
355 break;
942bf97b 356 case RTM_NEWRULE:
2414abd3 357 return netlink_rule_change(h, ns_id, startup);
942bf97b 358 case RTM_DELRULE:
2414abd3 359 return netlink_rule_change(h, ns_id, startup);
79580b5a 360 case RTM_NEWNEXTHOP:
d9f5b2f5 361 return netlink_nexthop_change(h, ns_id, startup);
79580b5a 362 case RTM_DELNEXTHOP:
d9f5b2f5 363 return netlink_nexthop_change(h, ns_id, startup);
d62a17ae 364 default:
3575d9e8
DS
365 /*
366 * If we have received this message then
367 * we have made a mistake during development
368 * and we need to write some code to handle
369 * this message type or not ask for
370 * it to be sent up to us
371 */
e914ccbe 372 flog_err(EC_ZEBRA_UNKNOWN_NLMSG,
1c50c1c0
QY
373 "Unknown netlink nlmsg_type %s(%d) vrf %u\n",
374 nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
375 ns_id);
d62a17ae 376 break;
377 }
378 return 0;
1fdc9eae 379}
380
acfa8927 381#if defined(HANDLE_NETLINK_FUZZING)
81a2f870
SW
382/* Using globals here to avoid adding function parameters */
383
384/* Keep distinct filenames for netlink fuzzy collection */
385static unsigned int netlink_file_counter = 1;
386
387/* File name to read fuzzed netlink from */
388static char netlink_fuzz_file[MAXPATHLEN] = "";
389
390/* Flag for whether to read from file or not */
29bf7b0b 391bool netlink_read;
81a2f870
SW
392
393/**
394 * netlink_read_init() - Starts the message parser
395 * @fname: Filename to read.
396 */
397void netlink_read_init(const char *fname)
398{
1bcea841
MS
399 struct zebra_dplane_info dp_info;
400
81a2f870
SW
401 snprintf(netlink_fuzz_file, MAXPATHLEN, "%s", fname);
402 /* Creating this fake socket for testing purposes */
ef593eff
SW
403 struct zebra_ns *zns = zebra_ns_lookup(NS_DEFAULT);
404
1bcea841
MS
405 /* Capture key info from zns struct */
406 zebra_dplane_info_from_zns(&dp_info, zns, false);
407
408 netlink_parse_info(netlink_information_fetch, &zns->netlink,
409 &dp_info, 1, 0);
81a2f870
SW
410}
411
412/**
413 * netlink_write_incoming() - Writes all data received from netlink to a file
414 * @buf: Data from netlink.
415 * @size: Size of data.
416 * @counter: Counter for keeping filenames distinct.
417 */
418static void netlink_write_incoming(const char *buf, const unsigned int size,
419 unsigned int counter)
420{
421 char fname[MAXPATHLEN];
422 FILE *f;
423
3c649c71 424 snprintf(fname, MAXPATHLEN, "%s/%s_%u", frr_vtydir, "netlink", counter);
0cf6db21 425 frr_with_privs(&zserv_privs) {
6bb30c2c
DL
426 f = fopen(fname, "w");
427 }
81a2f870
SW
428 if (f) {
429 fwrite(buf, 1, size, f);
430 fclose(f);
431 }
81a2f870
SW
432}
433
434/**
435 * netlink_read_file() - Reads netlink data from file
436 * @buf: Netlink buffer being overwritten.
437 * @fname: File name to read from.
438 *
439 * Return: Size of file.
440 */
441static long netlink_read_file(char *buf, const char *fname)
442{
443 FILE *f;
444 long file_bytes = -1;
bd7891fd 445
0cf6db21 446 frr_with_privs(&zserv_privs) {
6bb30c2c
DL
447 f = fopen(fname, "r");
448 }
81a2f870
SW
449 if (f) {
450 fseek(f, 0, SEEK_END);
451 file_bytes = ftell(f);
452 rewind(f);
ef593eff 453 fread(buf, NL_RCV_PKT_BUF_SIZE, 1, f);
81a2f870
SW
454 fclose(f);
455 }
81a2f870
SW
456 return file_bytes;
457}
458
acfa8927 459#endif /* HANDLE_NETLINK_FUZZING */
81a2f870 460
d62a17ae 461static int kernel_read(struct thread *thread)
1fdc9eae 462{
d62a17ae 463 struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
85a75f1e
MS
464 struct zebra_dplane_info dp_info;
465
466 /* Capture key info from ns struct */
467 zebra_dplane_info_from_zns(&dp_info, zns, false);
468
469 netlink_parse_info(netlink_information_fetch, &zns->netlink, &dp_info,
470 5, 0);
d62a17ae 471 zns->t_netlink = NULL;
3801e764 472 thread_add_read(zrouter.master, kernel_read, zns, zns->netlink.sock,
d62a17ae 473 &zns->t_netlink);
1fdc9eae 474
d62a17ae 475 return 0;
1fdc9eae 476}
477
3575d9e8
DS
478/*
479 * Filter out messages from self that occur on listener socket,
62b8bb7a 480 * caused by our actions on the command socket(s)
3575d9e8
DS
481 *
482 * When we add new Netlink message types we probably
483 * do not need to add them here as that we are filtering
484 * on the routes we actually care to receive( which is rarer
485 * then the normal course of operations). We are intentionally
486 * allowing some messages from ourselves through
487 * ( I'm looking at you Interface based netlink messages )
488 * so that we only had to write one way to handle incoming
489 * address add/delete changes.
1fdc9eae 490 */
62b8bb7a 491static void netlink_install_filter(int sock, __u32 pid, __u32 dplane_pid)
1fdc9eae 492{
3575d9e8
DS
493 /*
494 * BPF_JUMP instructions and where you jump to are based upon
495 * 0 as being the next statement. So count from 0. Writing
496 * this down because every time I look at this I have to
497 * re-remember it.
498 */
d62a17ae 499 struct sock_filter filter[] = {
3575d9e8
DS
500 /*
501 * Logic:
62b8bb7a
MS
502 * if (nlmsg_pid == pid ||
503 * nlmsg_pid == dplane_pid) {
3575d9e8
DS
504 * if (the incoming nlmsg_type ==
505 * RTM_NEWADDR | RTM_DELADDR)
506 * keep this message
507 * else
508 * skip this message
509 * } else
510 * keep this netlink message
511 */
512 /*
513 * 0: Load the nlmsg_pid into the BPF register
514 */
d62a17ae 515 BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
516 offsetof(struct nlmsghdr, nlmsg_pid)),
3575d9e8
DS
517 /*
518 * 1: Compare to pid
519 */
62b8bb7a 520 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
3575d9e8 521 /*
62b8bb7a
MS
522 * 2: Compare to dplane pid
523 */
524 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
525 /*
526 * 3: Load the nlmsg_type into BPF register
3575d9e8
DS
527 */
528 BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
529 offsetof(struct nlmsghdr, nlmsg_type)),
530 /*
62b8bb7a 531 * 4: Compare to RTM_NEWADDR
3575d9e8
DS
532 */
533 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
534 /*
62b8bb7a 535 * 5: Compare to RTM_DELADDR
3575d9e8
DS
536 */
537 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
538 /*
62b8bb7a 539 * 6: This is the end state of we want to skip the
3575d9e8
DS
540 * message
541 */
d62a17ae 542 BPF_STMT(BPF_RET | BPF_K, 0),
62b8bb7a 543 /* 7: This is the end state of we want to keep
3575d9e8
DS
544 * the message
545 */
d62a17ae 546 BPF_STMT(BPF_RET | BPF_K, 0xffff),
547 };
548
549 struct sock_fprog prog = {
9d303b37 550 .len = array_size(filter), .filter = filter,
d62a17ae 551 };
552
553 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))
554 < 0)
1c50c1c0 555 flog_err_sys(EC_LIB_SOCKET, "Can't install socket filter: %s\n",
9df414fe 556 safe_strerror(errno));
1fdc9eae 557}
558
/*
 * Walk a run of route attributes and index them by type.
 *
 * tb  -> output table; tb[type] is set to the attribute of that type.
 *        Entries for types that do not occur are left untouched, so the
 *        caller decides how the table is initialised.
 * max -> highest attribute type stored; attributes with a larger type are
 *        skipped
 * rta -> first attribute
 * len -> number of bytes of attribute data starting at rta
 *
 * If a type occurs more than once, the last occurrence is the one stored.
 */
void netlink_parse_rtattr(struct rtattr **tb, int max, struct rtattr *rta,
			  int len)
{
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		if (rta->rta_type > max)
			continue;

		tb[rta->rta_type] = rta;
	}
}
568
87da6a60
SW
/**
 * netlink_parse_rtattr_nested() - Parses a nested route attribute
 * @tb:  Pointer to array for storing rtattr in.
 * @max: Max number to store.
 * @rta: Pointer to rtattr to look for nested items in.
 *
 * The payload of @rta is treated as a run of attributes and handed to
 * netlink_parse_rtattr().
 */
void netlink_parse_rtattr_nested(struct rtattr **tb, int max,
				 struct rtattr *rta)
{
	struct rtattr *inner = RTA_DATA(rta);
	int inner_len = RTA_PAYLOAD(rta);

	netlink_parse_rtattr(tb, max, inner, inner_len);
}
580
312a6bee
JU
/*
 * Append one attribute to the end of a netlink message.
 *
 * n      -> message being built; nlmsg_len is advanced on success
 * maxlen -> total size of the buffer holding the message
 * type   -> attribute type
 * data   -> attribute payload, or NULL for an empty attribute (alen must
 *           then be 0)
 * alen   -> payload length in bytes
 *
 * Returns true if the attribute was added, false if it would not fit in
 * maxlen bytes (the message is left unchanged in that case).
 */
bool nl_attr_put(struct nlmsghdr *n, unsigned int maxlen, int type,
		 const void *data, unsigned int alen)
{
	int len = RTA_LENGTH(alen);
	unsigned int end = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
	struct rtattr *rta;

	if (end > maxlen)
		return false;

	/* The new attribute starts at the aligned end of the message. */
	rta = (struct rtattr *)((char *)n + NLMSG_ALIGN(n->nlmsg_len));
	rta->rta_type = type;
	rta->rta_len = len;

	if (!data)
		assert(alen == 0);
	else
		memcpy(RTA_DATA(rta), data, alen);

	n->nlmsg_len = end;

	return true;
}
605
312a6bee
JU
/*
 * Append a 16-bit attribute to a netlink message.
 *
 * The value is copied as stored in memory (no byte-order conversion is
 * done here).  Returns the result of nl_attr_put().
 */
bool nl_attr_put16(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint16_t data)
{
	return nl_attr_put(n, maxlen, type, &data, sizeof(data));
}
611
312a6bee
JU
/*
 * Append a 32-bit attribute to a netlink message.
 *
 * The value is copied as stored in memory (no byte-order conversion is
 * done here).  Returns the result of nl_attr_put().
 */
bool nl_attr_put32(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint32_t data)
{
	return nl_attr_put(n, maxlen, type, &data, sizeof(data));
}
617
/*
 * Open a nested attribute at the end of a netlink message.
 *
 * An empty attribute of the given type is appended and flagged with
 * NLA_F_NESTED.  The returned pointer is meant to be handed to
 * nl_attr_nest_end() once the inner attributes have been added.
 *
 * Returns the nest header, or NULL if it does not fit in maxlen bytes.
 */
struct rtattr *nl_attr_nest(struct nlmsghdr *n, unsigned int maxlen, int type)
{
	/* Position where the header will land, taken before appending. */
	struct rtattr *nest = NLMSG_TAIL(n);
	bool added = nl_attr_put(n, maxlen, type, NULL, 0);

	if (!added)
		return NULL;

	nest->rta_type |= NLA_F_NESTED;

	return nest;
}
628
/*
 * Close a nested attribute: set its length so that it spans everything
 * from the nest header up to the current (aligned) end of the message.
 *
 * Returns the current message length.
 */
int nl_attr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
{
	uint8_t *msg_end = (uint8_t *)n + NLMSG_ALIGN(n->nlmsg_len);

	nest->rta_len = msg_end - (uint8_t *)nest;

	return n->nlmsg_len;
}
634
/*
 * Append a zeroed struct rtnexthop at the end of a netlink message.
 *
 * n      -> message being built; nlmsg_len is advanced on success
 * maxlen -> total size of the buffer holding the message
 *
 * Returns a pointer to the new rtnexthop, or NULL if it does not fit in
 * maxlen bytes (the message is left unchanged in that case).
 */
struct rtnexthop *nl_attr_rtnh(struct nlmsghdr *n, unsigned int maxlen)
{
	const size_t aligned_len = NLMSG_ALIGN(n->nlmsg_len);
	const size_t rtnh_space = RTNH_ALIGN(sizeof(struct rtnexthop));
	struct rtnexthop *rtnh;

	if (aligned_len + rtnh_space > maxlen)
		return NULL;

	rtnh = (struct rtnexthop *)((uint8_t *)n + aligned_len);
	memset(rtnh, 0, sizeof(struct rtnexthop));
	n->nlmsg_len = aligned_len + rtnh_space;

	return rtnh;
}
649
/*
 * Finish an rtnexthop entry: set rtnh_len so that it spans everything
 * from the rtnexthop header up to the current (aligned) end of the
 * message.
 */
void nl_attr_rtnh_end(struct nlmsghdr *n, struct rtnexthop *rtnh)
{
	uint8_t *msg_end = (uint8_t *)n + NLMSG_ALIGN(n->nlmsg_len);

	rtnh->rtnh_len = msg_end - (uint8_t *)rtnh;
}
654
d62a17ae 655const char *nl_msg_type_to_str(uint16_t msg_type)
1fdc9eae 656{
d62a17ae 657 return lookup_msg(nlmsg_str, msg_type, "");
1fdc9eae 658}
659
d7c0a89a 660const char *nl_rtproto_to_str(uint8_t rtproto)
1fdc9eae 661{
d62a17ae 662 return lookup_msg(rtproto_str, rtproto, "");
1fdc9eae 663}
b339bde7 664
d7c0a89a 665const char *nl_family_to_str(uint8_t family)
b339bde7 666{
d62a17ae 667 return lookup_msg(family_str, family, "");
b339bde7
DS
668}
669
d7c0a89a 670const char *nl_rttype_to_str(uint8_t rttype)
b339bde7 671{
d62a17ae 672 return lookup_msg(rttype_str, rttype, "");
b339bde7
DS
673}
674
4cebb2b6 675#define NLA_OK(nla, len) \
5d307d5d
DS
676 ((len) >= (int)sizeof(struct nlattr) \
677 && (nla)->nla_len >= sizeof(struct nlattr) \
678 && (nla)->nla_len <= (len))
4cebb2b6
SW
679#define NLA_NEXT(nla, attrlen) \
680 ((attrlen) -= NLA_ALIGN((nla)->nla_len), \
681 (struct nlattr *)(((char *)(nla)) + NLA_ALIGN((nla)->nla_len)))
682#define NLA_LENGTH(len) (NLA_ALIGN(sizeof(struct nlattr)) + (len))
683#define NLA_DATA(nla) ((struct nlattr *)(((char *)(nla)) + NLA_LENGTH(0)))
684
685#define ERR_NLA(err, inner_len) \
686 ((struct nlattr *)(((char *)(err)) \
687 + NLMSG_ALIGN(sizeof(struct nlmsgerr)) \
688 + NLMSG_ALIGN((inner_len))))
5d307d5d
DS
689
/*
 * Walk a run of struct nlattr attributes and index them by type.
 *
 * tb  -> output table; tb[type] is set to the attribute of that type
 * max -> highest attribute type stored; larger types are skipped
 * nla -> first attribute
 * len -> number of bytes of attribute data starting at nla
 */
static void netlink_parse_nlattr(struct nlattr **tb, int max,
				 struct nlattr *nla, int len)
{
	for (; NLA_OK(nla, len); nla = NLA_NEXT(nla, len)) {
		if (nla->nla_type > max)
			continue;

		tb[nla->nla_type] = nla;
	}
}
699
700static void netlink_parse_extended_ack(struct nlmsghdr *h)
701{
4cebb2b6
SW
702 struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
703 const struct nlmsgerr *err = (const struct nlmsgerr *)NLMSG_DATA(h);
5d307d5d 704 const struct nlmsghdr *err_nlh = NULL;
4cebb2b6
SW
705 /* Length not including nlmsghdr */
706 uint32_t len = 0;
707 /* Inner error netlink message length */
708 uint32_t inner_len = 0;
5d307d5d
DS
709 const char *msg = NULL;
710 uint32_t off = 0;
711
712 if (!(h->nlmsg_flags & NLM_F_CAPPED))
4cebb2b6
SW
713 inner_len = (uint32_t)NLMSG_PAYLOAD(&err->msg, 0);
714
715 len = (uint32_t)(NLMSG_PAYLOAD(h, sizeof(struct nlmsgerr)) - inner_len);
5d307d5d 716
4cebb2b6
SW
717 netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX, ERR_NLA(err, inner_len),
718 len);
5d307d5d
DS
719
720 if (tb[NLMSGERR_ATTR_MSG])
4cebb2b6 721 msg = (const char *)NLA_DATA(tb[NLMSGERR_ATTR_MSG]);
5d307d5d
DS
722
723 if (tb[NLMSGERR_ATTR_OFFS]) {
4cebb2b6 724 off = *(uint32_t *)NLA_DATA(tb[NLMSGERR_ATTR_OFFS]);
5d307d5d
DS
725
726 if (off > h->nlmsg_len) {
9165c5f5 727 zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS");
5d307d5d
DS
728 } else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
729 /*
730 * Header of failed message
731 * we are not doing anything currently with it
732 * but noticing it for later.
733 */
734 err_nlh = &err->msg;
15569c58 735 zlog_debug("%s: Received %s extended Ack", __func__,
87b5d1b0 736 nl_msg_type_to_str(err_nlh->nlmsg_type));
5d307d5d
DS
737 }
738 }
739
740 if (msg && *msg != '\0') {
741 bool is_err = !!err->error;
742
743 if (is_err)
744 zlog_err("Extended Error: %s", msg);
745 else
e914ccbe 746 flog_warn(EC_ZEBRA_NETLINK_EXTENDED_WARNING,
9df414fe 747 "Extended Warning: %s", msg);
5d307d5d
DS
748 }
749}
750
ae6138bf
JU
751/*
752 * netlink_send_msg - send a netlink message of a certain size.
753 *
754 * Returns -1 on error. Otherwise, it returns the number of bytes sent.
755 */
f8653393
JU
756static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
757 size_t buflen)
ae6138bf 758{
f8653393
JU
759 struct sockaddr_nl snl = {};
760 struct iovec iov = {};
761 struct msghdr msg = {};
762 ssize_t status;
763 int save_errno = 0;
ae6138bf
JU
764
765 iov.iov_base = buf;
766 iov.iov_len = buflen;
f8653393 767 msg.msg_name = &snl;
ae6138bf
JU
768 msg.msg_namelen = sizeof(snl);
769 msg.msg_iov = &iov;
770 msg.msg_iovlen = 1;
771
772 snl.nl_family = AF_NETLINK;
773
774 /* Send message to netlink interface. */
775 frr_with_privs(&zserv_privs) {
776 status = sendmsg(nl->sock, &msg, 0);
777 save_errno = errno;
778 }
779
780 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_SEND) {
781 zlog_debug("%s: >> netlink message dump [sent]", __func__);
782 zlog_hexdump(buf, buflen);
783 }
784
f8653393 785 if (status == -1) {
ae6138bf
JU
786 flog_err_sys(EC_LIB_SOCKET, "%s error: %s", __func__,
787 safe_strerror(save_errno));
788 return -1;
789 }
790
791 return status;
792}
793
794/*
795 * netlink_recv_msg - receive a netlink message.
796 *
797 * Returns -1 on error, 0 if read would block or the number of bytes received.
798 */
799static int netlink_recv_msg(const struct nlsock *nl, struct msghdr msg,
800 void *buf, size_t buflen)
801{
802 struct iovec iov;
803 int status;
804
805 iov.iov_base = buf;
806 iov.iov_len = buflen;
807 msg.msg_iov = &iov;
808 msg.msg_iovlen = 1;
809
810 do {
811#if defined(HANDLE_NETLINK_FUZZING)
812 /* Check if reading and filename is set */
813 if (netlink_read && '\0' != netlink_fuzz_file[0]) {
814 zlog_debug("Reading netlink fuzz file");
815 status = netlink_read_file(buf, netlink_fuzz_file);
816 ((struct sockaddr_nl *)msg.msg_name)->nl_pid = 0;
817 } else {
818 status = recvmsg(nl->sock, &msg, 0);
819 }
820#else
821 status = recvmsg(nl->sock, &msg, 0);
822#endif /* HANDLE_NETLINK_FUZZING */
f8653393 823 } while (status == -1 && errno == EINTR);
ae6138bf 824
f8653393 825 if (status == -1) {
ae6138bf
JU
826 if (errno == EWOULDBLOCK || errno == EAGAIN)
827 return 0;
828 flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s",
829 nl->name, safe_strerror(errno));
830 /*
831 * In this case we are screwed. There is no good way to recover
832 * zebra at this point.
833 */
834 exit(-1);
835 }
836
837 if (status == 0) {
838 flog_err_sys(EC_LIB_SOCKET, "%s EOF", nl->name);
839 return -1;
840 }
841
842 if (msg.msg_namelen != sizeof(struct sockaddr_nl)) {
843 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
844 "%s sender address length error: length %d", nl->name,
845 msg.msg_namelen);
846 return -1;
847 }
848
849 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
850 zlog_debug("%s: << netlink message dump [recv]", __func__);
851 zlog_hexdump(buf, status);
852 }
853
854#if defined(HANDLE_NETLINK_FUZZING)
855 if (!netlink_read) {
856 zlog_debug("Writing incoming netlink message");
857 netlink_write_incoming(buf, status, netlink_file_counter++);
858 }
859#endif /* HANDLE_NETLINK_FUZZING */
860
861 return status;
862}
863
864/*
865 * netlink_parse_error - parse a netlink error message
866 *
867 * Returns 1 if this message is acknowledgement, 0 if this error should be
868 * ignored, -1 otherwise.
869 */
870static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
871 const struct zebra_dplane_info *zns,
872 bool startup)
873{
874 struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(h);
875 int errnum = err->error;
876 int msg_type = err->msg.nlmsg_type;
877
878 if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
879 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
880 "%s error: message truncated", nl->name);
881 return -1;
882 }
883
884 /*
885 * Parse the extended information before we actually handle it. At this
886 * point in time we do not do anything other than report the issue.
887 */
888 if (h->nlmsg_flags & NLM_F_ACK_TLVS)
889 netlink_parse_extended_ack(h);
890
891 /* If the error field is zero, then this is an ACK. */
892 if (err->error == 0) {
893 if (IS_ZEBRA_DEBUG_KERNEL) {
894 zlog_debug("%s: %s ACK: type=%s(%u), seq=%u, pid=%u",
895 __func__, nl->name,
896 nl_msg_type_to_str(err->msg.nlmsg_type),
897 err->msg.nlmsg_type, err->msg.nlmsg_seq,
898 err->msg.nlmsg_pid);
899 }
900
901 return 1;
902 }
903
904 /* Deal with errors that occur because of races in link handling. */
905 if (zns->is_cmd
906 && ((msg_type == RTM_DELROUTE
907 && (-errnum == ENODEV || -errnum == ESRCH))
908 || (msg_type == RTM_NEWROUTE
909 && (-errnum == ENETDOWN || -errnum == EEXIST)))) {
910 if (IS_ZEBRA_DEBUG_KERNEL)
911 zlog_debug("%s: error: %s type=%s(%u), seq=%u, pid=%u",
912 nl->name, safe_strerror(-errnum),
913 nl_msg_type_to_str(msg_type), msg_type,
914 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
915 return 0;
916 }
917
918 /*
919 * We see RTM_DELNEIGH when shutting down an interface with an IPv4
920 * link-local. The kernel should have already deleted the neighbor so
921 * do not log these as an error.
922 */
923 if (msg_type == RTM_DELNEIGH
924 || (zns->is_cmd && msg_type == RTM_NEWROUTE
925 && (-errnum == ESRCH || -errnum == ENETUNREACH))) {
926 /*
927 * This is known to happen in some situations, don't log as
928 * error.
929 */
930 if (IS_ZEBRA_DEBUG_KERNEL)
931 zlog_debug("%s error: %s, type=%s(%u), seq=%u, pid=%u",
932 nl->name, safe_strerror(-errnum),
933 nl_msg_type_to_str(msg_type), msg_type,
934 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
935 } else {
936 if ((msg_type != RTM_GETNEXTHOP) || !startup)
937 flog_err(EC_ZEBRA_UNEXPECTED_MESSAGE,
938 "%s error: %s, type=%s(%u), seq=%u, pid=%u",
939 nl->name, safe_strerror(-errnum),
940 nl_msg_type_to_str(msg_type), msg_type,
941 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
942 }
943
944 return -1;
945}
946
936ebf0a
DS
947/*
948 * netlink_parse_info
949 *
950 * Receive message from netlink interface and pass those information
951 * to the given function.
952 *
953 * filter -> Function to call to read the results
954 * nl -> netlink socket information
955 * zns -> The zebra namespace data
956 * count -> How many we should read in, 0 means as much as possible
957 * startup -> Are we reading in under startup conditions? passed to
958 * the filter.
959 */
2414abd3 960int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
7cdb1a84
MS
961 const struct nlsock *nl,
962 const struct zebra_dplane_info *zns,
85a75f1e 963 int count, int startup)
1fdc9eae 964{
d62a17ae 965 int status;
966 int ret = 0;
967 int error;
968 int read_in = 0;
969
970 while (1) {
9ed7517b 971 char buf[NL_RCV_PKT_BUF_SIZE];
d62a17ae 972 struct sockaddr_nl snl;
973 struct msghdr msg = {.msg_name = (void *)&snl,
ae6138bf 974 .msg_namelen = sizeof(snl)};
d62a17ae 975 struct nlmsghdr *h;
976
977 if (count && read_in >= count)
978 return 0;
979
ae6138bf
JU
980 status = netlink_recv_msg(nl, msg, buf, sizeof(buf));
981 if (status == -1)
d62a17ae 982 return -1;
ae6138bf
JU
983 else if (status == 0)
984 break;
81a2f870 985
d62a17ae 986 read_in++;
987 for (h = (struct nlmsghdr *)buf;
e6a0e0d1 988 (status >= 0 && NLMSG_OK(h, (unsigned int)status));
d62a17ae 989 h = NLMSG_NEXT(h, status)) {
990 /* Finish of reading. */
991 if (h->nlmsg_type == NLMSG_DONE)
992 return ret;
993
994 /* Error handling. */
995 if (h->nlmsg_type == NLMSG_ERROR) {
ae6138bf
JU
996 int err = netlink_parse_error(nl, h, zns,
997 startup);
998 if (err == 1) {
d62a17ae 999 if (!(h->nlmsg_flags & NLM_F_MULTI))
1000 return 0;
1001 continue;
ae6138bf
JU
1002 } else
1003 return err;
d62a17ae 1004 }
1005
1006 /* OK we got netlink message. */
1007 if (IS_ZEBRA_DEBUG_KERNEL)
1008 zlog_debug(
1009 "netlink_parse_info: %s type %s(%u), len=%d, seq=%u, pid=%u",
1010 nl->name,
1011 nl_msg_type_to_str(h->nlmsg_type),
1012 h->nlmsg_type, h->nlmsg_len,
1013 h->nlmsg_seq, h->nlmsg_pid);
1014
783827ae
DS
1015
1016 /*
1017 * Ignore messages that maybe sent from
1018 * other actors besides the kernel
1019 */
1020 if (snl.nl_pid != 0) {
43e52561
QY
1021 zlog_debug("Ignoring message from pid %u",
1022 snl.nl_pid);
d62a17ae 1023 continue;
1024 }
1025
2414abd3 1026 error = (*filter)(h, zns->ns_id, startup);
d62a17ae 1027 if (error < 0) {
9df414fe
QY
1028 zlog_debug("%s filter function error",
1029 nl->name);
d62a17ae 1030 ret = error;
1031 }
1032 }
1033
1034 /* After error care. */
1035 if (msg.msg_flags & MSG_TRUNC) {
e914ccbe 1036 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0 1037 "%s error: message truncated", nl->name);
d62a17ae 1038 continue;
1039 }
1040 if (status) {
e914ccbe 1041 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0
QY
1042 "%s error: data remnant size %d", nl->name,
1043 status);
d62a17ae 1044 return -1;
1045 }
1046 }
1047 return ret;
1fdc9eae 1048}
1049
936ebf0a 1050/*
7cdb1a84 1051 * netlink_talk_info
936ebf0a
DS
1052 *
1053 * sendmsg() to netlink socket then recvmsg().
1054 * Calls netlink_parse_info to parse returned data
1055 *
1056 * filter -> The filter to read final results from kernel
1057 * nlmsghdr -> The data to send to the kernel
8b962e77 1058 * dp_info -> The dataplane and netlink socket information
936ebf0a
DS
1059 * startup -> Are we reading in under startup conditions
1060 * This is passed through eventually to filter.
1061 */
7cdb1a84
MS
1062int netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1063 struct nlmsghdr *n,
1064 const struct zebra_dplane_info *dp_info, int startup)
1fdc9eae 1065{
7cdb1a84 1066 const struct nlsock *nl;
d62a17ae 1067
7cdb1a84
MS
1068 nl = &(dp_info->nls);
1069 n->nlmsg_seq = nl->seq;
d62a17ae 1070 n->nlmsg_pid = nl->snl.nl_pid;
1071
d62a17ae 1072 if (IS_ZEBRA_DEBUG_KERNEL)
1073 zlog_debug(
1074 "netlink_talk: %s type %s(%u), len=%d seq=%u flags 0x%x",
1075 nl->name, nl_msg_type_to_str(n->nlmsg_type),
1076 n->nlmsg_type, n->nlmsg_len, n->nlmsg_seq,
1077 n->nlmsg_flags);
1078
f8653393 1079 if (netlink_send_msg(nl, n, n->nlmsg_len) == -1)
d62a17ae 1080 return -1;
d62a17ae 1081
d62a17ae 1082 /*
1083 * Get reply from netlink socket.
1084 * The reply should either be an acknowlegement or an error.
1085 */
7cdb1a84
MS
1086 return netlink_parse_info(filter, nl, dp_info, 0, startup);
1087}
1088
1089/*
1090 * Synchronous version of netlink_talk_info. Converts args to suit the
1091 * common version, which is suitable for both sync and async use.
7cdb1a84
MS
1092 */
1093int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1094 struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
1095 int startup)
1096{
1097 struct zebra_dplane_info dp_info;
1098
1099 /* Increment sequence number before capturing snapshot of ns socket
1100 * info.
1101 */
1102 nl->seq++;
1103
1104 /* Capture info in intermediate info struct */
85a75f1e 1105 zebra_dplane_info_from_zns(&dp_info, zns, (nl == &(zns->netlink_cmd)));
7cdb1a84 1106
5709131c 1107 return netlink_talk_info(filter, n, &dp_info, startup);
1fdc9eae 1108}
1109
289602d7 1110/* Issue request message to kernel via netlink socket. GET messages
1111 * are issued through this interface.
1112 */
fd3f8e52 1113int netlink_request(struct nlsock *nl, void *req)
1fdc9eae 1114{
fd3f8e52 1115 struct nlmsghdr *n = (struct nlmsghdr *)req;
d62a17ae 1116
1117 /* Check netlink socket. */
1118 if (nl->sock < 0) {
450971aa 1119 flog_err_sys(EC_LIB_SOCKET, "%s socket isn't active.",
09c866e3 1120 nl->name);
d62a17ae 1121 return -1;
1122 }
1123
1124 /* Fill common fields for all requests. */
d62a17ae 1125 n->nlmsg_pid = nl->snl.nl_pid;
1126 n->nlmsg_seq = ++nl->seq;
1127
f8653393 1128 if (netlink_send_msg(nl, req, n->nlmsg_len) == -1)
d62a17ae 1129 return -1;
d62a17ae 1130
1131 return 0;
1fdc9eae 1132}
1133
e63c7622
JU
1134static int nl_batch_read_resp(struct nl_batch *bth)
1135{
1136 struct nlmsghdr *h;
1137 struct sockaddr_nl snl;
1138 struct msghdr msg;
1139 int status;
1140 const struct nlsock *nl;
1141 struct nl_msg_batch_item *item, *from_item;
1142
1143 nl = &(bth->zns->nls);
1144
1145 msg.msg_name = (void *)&snl;
1146 msg.msg_namelen = sizeof(snl);
1147
1148 from_item = nl_batch_list_first(&(bth->items));
1149
1150 status = netlink_recv_msg(nl, msg, nl_batch_rx_buf,
1151 sizeof(nl_batch_rx_buf));
1152 if (status == -1 || status == 0)
1153 return status;
1154
1155 for (h = (struct nlmsghdr *)nl_batch_rx_buf;
1156 (status >= 0 && NLMSG_OK(h, (unsigned int)status));
1157 h = NLMSG_NEXT(h, status)) {
1158
1159 /*
1160 * Find the corresponding batch item. Received responses are in
1161 * the same order as requests we sent, so we can simply iterate
1162 * over the batch item list and match responses with requests
1163 * at same time.
1164 */
1165 frr_each_from(nl_batch_list, &(bth->items), item, from_item) {
1166 if (item->seq == (int)h->nlmsg_seq)
1167 break;
1168 }
1169
1170 /*
1171 * We received a message with the sequence number that isn't
1172 * associated with any dplane context object.
1173 */
1174 if (item == NULL) {
1175 zlog_debug(
1176 "%s: skipping unassociated response, seq number %d NS %u",
1177 __func__, h->nlmsg_seq, bth->zns->ns_id);
1178 from_item = nl_batch_list_first(&(bth->items));
1179 continue;
1180 }
1181
1182 if (h->nlmsg_type == NLMSG_ERROR) {
1183 int err = netlink_parse_error(nl, h, bth->zns, 0);
1184
1185 if (err == -1)
1186 item->failure = true;
1187
1188 zlog_debug("%s: netlink error message seq=%d ",
1189 __func__, h->nlmsg_seq);
1190 continue;
1191 }
1192
1193 /*
1194 * If we get here then we did not receive neither the ack nor
1195 * the error and instead received some other message in an
1196 * unexpected way.
1197 */
1198 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
1199 __func__, h->nlmsg_type,
1200 nl_msg_type_to_str(h->nlmsg_type), bth->zns->ns_id);
1201 }
1202
1203 return 0;
1204}
1205
1206static void nl_batch_reset(struct nl_batch *bth)
1207{
1208 bth->buf = nl_batch_tx_buf;
1209 bth->bufsiz = sizeof(nl_batch_tx_buf);
1210 bth->limit = NL_BATCH_SEND_THRESHOLD;
1211
1212 bth->buf_head = bth->buf;
1213 bth->curlen = 0;
1214 bth->msgcnt = 0;
1215 bth->zns = NULL;
1216
1217 nl_batch_list_init(&(bth->items));
1218}
1219
1220static void nl_batch_send(struct nl_batch *bth)
1221{
1222 struct nl_msg_batch_item *item;
1223 bool err = false;
1224
1225 if (bth->curlen == 0 || bth->zns == NULL)
1226 return;
1227
1228 if (IS_ZEBRA_DEBUG_KERNEL)
1229 zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu", __func__,
1230 bth->zns->nls.name, bth->curlen, bth->msgcnt);
1231
1232 if (netlink_send_msg(&(bth->zns->nls), bth->buf, bth->curlen) == -1)
1233 err = true;
1234
1235 if (!err) {
1236 if (nl_batch_read_resp(bth) == -1)
1237 err = true;
1238 }
1239
1240 frr_each_safe(nl_batch_list, &(bth->items), item) {
1241 enum zebra_dplane_result res = ZEBRA_DPLANE_REQUEST_SUCCESS;
1242
1243 /*
1244 * If either sending or receiving a message batch has ended with
1245 * the error, mark all dplane requests as failed.
1246 */
1247 if (item->failure || err)
1248 res = ZEBRA_DPLANE_REQUEST_FAILURE;
1249
1250 if (!item->ignore_res)
1251 dplane_ctx_set_status(item->ctx, res);
1252
1253 nl_batch_list_del(&(bth->items), item);
1254 XFREE(MTYPE_NL_BATCH_ITEM, item);
1255 }
1256
1257 nl_batch_reset(bth);
1258}
1259
1260static void nl_batch_add_item(struct nl_batch *bth, int seq,
1261 struct zebra_dplane_ctx *ctx, bool ignore_res)
1262{
1263 struct nl_msg_batch_item *item =
1264 XCALLOC(MTYPE_NL_BATCH_ITEM, sizeof(*item));
1265
1266 item->seq = seq;
1267 item->ctx = ctx;
1268 item->ignore_res = ignore_res;
1269 item->failure = false;
1270
1271 nl_batch_list_add_tail(&(bth->items), item);
1272}
1273
1274enum netlink_msg_status netlink_batch_add_msg(
1275 struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
1276 ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
1277 bool extra_msg)
1278{
1279 int seq;
1280 ssize_t size;
1281 struct nlmsghdr *msgh;
1282
1283 if (bth->zns != NULL
1284 && bth->zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
1285 nl_batch_send(bth);
1286
1287 size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
1288
1289 /*
1290 * If there was an error while encoding the message (other than buffer
1291 * overflow) then return an error.
1292 */
1293 if (size < 0)
1294 return FRR_NETLINK_ERROR;
1295
1296 /*
1297 * If the message doesn't fit entirely in the buffer then send the batch
1298 * and retry.
1299 */
1300 if (size == 0) {
1301 nl_batch_send(bth);
1302 size = (*msg_encoder)(ctx, bth->buf_head,
1303 bth->bufsiz - bth->curlen);
1304 /*
1305 * If the message doesn't fit in the empty buffer then just
1306 * return an error.
1307 */
1308 if (size <= 0)
1309 return FRR_NETLINK_ERROR;
1310 }
1311
1312 seq = dplane_ctx_get_ns(ctx)->nls.seq;
1313 if (extra_msg)
1314 seq++;
1315
1316 msgh = (struct nlmsghdr *)bth->buf_head;
1317 msgh->nlmsg_seq = seq;
1318 msgh->nlmsg_pid = dplane_ctx_get_ns(ctx)->nls.snl.nl_pid;
1319
1320 nl_batch_add_item(bth, seq, ctx, extra_msg);
1321
1322 bth->zns = dplane_ctx_get_ns(ctx);
1323 bth->buf_head = ((char *)bth->buf_head) + size;
1324 bth->curlen += size;
1325 bth->msgcnt++;
1326
1327 if (bth->curlen > bth->limit)
1328 nl_batch_send(bth);
1329
1330 return FRR_NETLINK_QUEUED;
1331}
1332
/*
 * Batched dataplane update entry point. Deliberately does nothing here;
 * kernel_supports_batch() in this file reports that batching is not
 * available.
 */
void kernel_update_multi(struct dplane_ctx_q *ctx_list)
{
	/* no-op */
	(void)ctx_list;
}
1337
/* Report whether batched kernel updates are supported; currently never. */
bool kernel_supports_batch(void)
{
	return false;
}
1342
1fdc9eae 1343/* Exported interface function. This function simply calls
1344 netlink_socket (). */
d62a17ae 1345void kernel_init(struct zebra_ns *zns)
1fdc9eae 1346{
67188ca2 1347 uint32_t groups;
5d307d5d
DS
1348#if defined SOL_NETLINK
1349 int one, ret;
1350#endif
d62a17ae 1351
026a316f
DS
1352 /*
1353 * Initialize netlink sockets
1354 *
1355 * If RTMGRP_XXX exists use that, but at some point
1356 * I think the kernel developers realized that
1357 * keeping track of all the different values would
1358 * lead to confusion, so we need to convert the
1359 * RTNLGRP_XXX to a bit position for ourself
1360 */
1361 groups = RTMGRP_LINK |
1362 RTMGRP_IPV4_ROUTE |
1363 RTMGRP_IPV4_IFADDR |
1364 RTMGRP_IPV6_ROUTE |
1365 RTMGRP_IPV6_IFADDR |
1366 RTMGRP_IPV4_MROUTE |
1367 RTMGRP_NEIGH |
67188ca2
QY
1368 ((uint32_t) 1 << (RTNLGRP_IPV4_RULE - 1)) |
1369 ((uint32_t) 1 << (RTNLGRP_IPV6_RULE - 1)) |
1370 ((uint32_t) 1 << (RTNLGRP_NEXTHOP - 1));
d62a17ae 1371
1372 snprintf(zns->netlink.name, sizeof(zns->netlink.name),
1373 "netlink-listen (NS %u)", zns->ns_id);
1374 zns->netlink.sock = -1;
19d5a4fe
DS
1375 if (netlink_socket(&zns->netlink, groups, zns->ns_id) < 0) {
1376 zlog_err("Failure to create %s socket",
1377 zns->netlink.name);
1378 exit(-1);
1379 }
d62a17ae 1380
1381 snprintf(zns->netlink_cmd.name, sizeof(zns->netlink_cmd.name),
1382 "netlink-cmd (NS %u)", zns->ns_id);
1383 zns->netlink_cmd.sock = -1;
19d5a4fe
DS
1384 if (netlink_socket(&zns->netlink_cmd, 0, zns->ns_id) < 0) {
1385 zlog_err("Failure to create %s socket",
1386 zns->netlink_cmd.name);
1387 exit(-1);
1388 }
d62a17ae 1389
62b8bb7a
MS
1390 snprintf(zns->netlink_dplane.name, sizeof(zns->netlink_dplane.name),
1391 "netlink-dp (NS %u)", zns->ns_id);
1392 zns->netlink_dplane.sock = -1;
1393 if (netlink_socket(&zns->netlink_dplane, 0, zns->ns_id) < 0) {
1394 zlog_err("Failure to create %s socket",
1395 zns->netlink_dplane.name);
1396 exit(-1);
1397 }
1398
5d307d5d
DS
1399 /*
1400 * SOL_NETLINK is not available on all platforms yet
1401 * apparently. It's in bits/socket.h which I am not
1402 * sure that we want to pull into our build system.
1403 */
1404#if defined SOL_NETLINK
1405 /*
1406 * Let's tell the kernel that we want to receive extended
62b8bb7a 1407 * ACKS over our command socket(s)
5d307d5d
DS
1408 */
1409 one = 1;
1410 ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1411 &one, sizeof(one));
1412
1413 if (ret < 0)
62b8bb7a
MS
1414 zlog_notice("Registration for extended cmd ACK failed : %d %s",
1415 errno, safe_strerror(errno));
1416
1417 one = 1;
1418 ret = setsockopt(zns->netlink_dplane.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1419 &one, sizeof(one));
1420
1421 if (ret < 0)
1422 zlog_notice("Registration for extended dp ACK failed : %d %s",
5d307d5d
DS
1423 errno, safe_strerror(errno));
1424#endif
1425
d62a17ae 1426 /* Register kernel socket. */
19d5a4fe 1427 if (fcntl(zns->netlink.sock, F_SETFL, O_NONBLOCK) < 0)
450971aa 1428 flog_err_sys(EC_LIB_SOCKET, "Can't set %s socket flags: %s",
09c866e3 1429 zns->netlink.name, safe_strerror(errno));
8c85e8ea
DS
1430
1431 if (fcntl(zns->netlink_cmd.sock, F_SETFL, O_NONBLOCK) < 0)
1432 zlog_err("Can't set %s socket error: %s(%d)",
1433 zns->netlink_cmd.name, safe_strerror(errno), errno);
19d5a4fe 1434
62b8bb7a
MS
1435 if (fcntl(zns->netlink_dplane.sock, F_SETFL, O_NONBLOCK) < 0)
1436 zlog_err("Can't set %s socket error: %s(%d)",
1437 zns->netlink_dplane.name, safe_strerror(errno), errno);
1438
19d5a4fe
DS
1439 /* Set receive buffer size if it's set from command line */
1440 if (nl_rcvbufsize)
1441 netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
1442
1443 netlink_install_filter(zns->netlink.sock,
62b8bb7a
MS
1444 zns->netlink_cmd.snl.nl_pid,
1445 zns->netlink_dplane.snl.nl_pid);
1446
19d5a4fe
DS
1447 zns->t_netlink = NULL;
1448
3801e764 1449 thread_add_read(zrouter.master, kernel_read, zns,
19d5a4fe 1450 zns->netlink.sock, &zns->t_netlink);
d62a17ae 1451
1452 rt_netlink_init();
1fdc9eae 1453}
1454
62b8bb7a 1455void kernel_terminate(struct zebra_ns *zns, bool complete)
1fdc9eae 1456{
d62a17ae 1457 THREAD_READ_OFF(zns->t_netlink);
1458
1459 if (zns->netlink.sock >= 0) {
1460 close(zns->netlink.sock);
1461 zns->netlink.sock = -1;
1462 }
1463
1464 if (zns->netlink_cmd.sock >= 0) {
1465 close(zns->netlink_cmd.sock);
1466 zns->netlink_cmd.sock = -1;
1467 }
ddfeb486 1468
62b8bb7a
MS
1469 /* During zebra shutdown, we need to leave the dataplane socket
1470 * around until all work is done.
1471 */
1472 if (complete) {
1473 if (zns->netlink_dplane.sock >= 0) {
1474 close(zns->netlink_dplane.sock);
1475 zns->netlink_dplane.sock = -1;
1476 }
1477 }
1478}
ddfeb486 1479#endif /* HAVE_NETLINK */