]> git.proxmox.com Git - mirror_frr.git/blame - zebra/kernel_netlink.c
Merge pull request #10632 from donaldsharp/thread_return_null
[mirror_frr.git] / zebra / kernel_netlink.c
CommitLineData
718e3744 1/* Kernel communication using netlink interface.
2 * Copyright (C) 1999 Kunihiro Ishiguro
3 *
4 * This file is part of GNU Zebra.
5 *
6 * GNU Zebra is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2, or (at your option) any
9 * later version.
10 *
11 * GNU Zebra is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
896014f4
DL
16 * You should have received a copy of the GNU General Public License along
17 * with this program; see the file COPYING; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
718e3744 19 */
1fdc9eae 20
21#include <zebra.h>
22
ddfeb486
DL
23#ifdef HAVE_NETLINK
24
1fdc9eae 25#include "linklist.h"
26#include "if.h"
27#include "log.h"
28#include "prefix.h"
29#include "connected.h"
30#include "table.h"
31#include "memory.h"
1fdc9eae 32#include "rib.h"
33#include "thread.h"
34#include "privs.h"
35#include "nexthop.h"
36#include "vrf.h"
37#include "mpls.h"
174482ef 38#include "lib_errors.h"
d4000d7b 39#include "hash.h"
1fdc9eae 40
3801e764
DS
41//#include "zebra/zserv.h"
42#include "zebra/zebra_router.h"
1fdc9eae 43#include "zebra/zebra_ns.h"
44#include "zebra/zebra_vrf.h"
05f7f5db 45#include "zebra/rt.h"
1fdc9eae 46#include "zebra/debug.h"
47#include "zebra/kernel_netlink.h"
48#include "zebra/rt_netlink.h"
49#include "zebra/if_netlink.h"
942bf97b 50#include "zebra/rule_netlink.h"
43e52561 51#include "zebra/zebra_errors.h"
1fdc9eae 52
53#ifndef SO_RCVBUFFORCE
54#define SO_RCVBUFFORCE (33)
55#endif
56
57/* Hack for GNU libc version 2. */
58#ifndef MSG_TRUNC
59#define MSG_TRUNC 0x20
60#endif /* MSG_TRUNC */
61
62#ifndef NLMSG_TAIL
d62a17ae 63#define NLMSG_TAIL(nmsg) \
d7c0a89a
QY
64 ((struct rtattr *)(((uint8_t *)(nmsg)) \
65 + NLMSG_ALIGN((nmsg)->nlmsg_len)))
1fdc9eae 66#endif
67
68#ifndef RTA_TAIL
d62a17ae 69#define RTA_TAIL(rta) \
d7c0a89a 70 ((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
1fdc9eae 71#endif
72
f909c673
DS
73#ifndef RTNL_FAMILY_IP6MR
74#define RTNL_FAMILY_IP6MR 129
75#endif
76
77#ifndef RTPROT_MROUTED
78#define RTPROT_MROUTED 17
79#endif
80
531c92b8 81#define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)
e63c7622
JU
82
83/*
84 * We limit the batch's size to a number smaller than the length of the
85 * underlying buffer since the last message that wouldn't fit the batch would go
86 * over the upper boundary and then it would have to be encoded again into a new
87 * buffer. If the difference between the limit and the length of the buffer is
88 * big enough (bigger than the biggest Netlink message) then this situation
89 * won't occur.
90 */
531c92b8
JU
91#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
92
d62a17ae 93static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
94 {RTM_DELROUTE, "RTM_DELROUTE"},
95 {RTM_GETROUTE, "RTM_GETROUTE"},
96 {RTM_NEWLINK, "RTM_NEWLINK"},
97 {RTM_DELLINK, "RTM_DELLINK"},
98 {RTM_GETLINK, "RTM_GETLINK"},
99 {RTM_NEWADDR, "RTM_NEWADDR"},
100 {RTM_DELADDR, "RTM_DELADDR"},
101 {RTM_GETADDR, "RTM_GETADDR"},
102 {RTM_NEWNEIGH, "RTM_NEWNEIGH"},
103 {RTM_DELNEIGH, "RTM_DELNEIGH"},
104 {RTM_GETNEIGH, "RTM_GETNEIGH"},
942bf97b 105 {RTM_NEWRULE, "RTM_NEWRULE"},
106 {RTM_DELRULE, "RTM_DELRULE"},
107 {RTM_GETRULE, "RTM_GETRULE"},
79580b5a
SW
108 {RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP"},
109 {RTM_DELNEXTHOP, "RTM_DELNEXTHOP"},
110 {RTM_GETNEXTHOP, "RTM_GETNEXTHOP"},
d62a17ae 111 {0}};
1fdc9eae 112
113static const struct message rtproto_str[] = {
d62a17ae 114 {RTPROT_REDIRECT, "redirect"},
115 {RTPROT_KERNEL, "kernel"},
116 {RTPROT_BOOT, "boot"},
117 {RTPROT_STATIC, "static"},
118 {RTPROT_GATED, "GateD"},
119 {RTPROT_RA, "router advertisement"},
120 {RTPROT_MRT, "MRT"},
121 {RTPROT_ZEBRA, "Zebra"},
1fdc9eae 122#ifdef RTPROT_BIRD
d62a17ae 123 {RTPROT_BIRD, "BIRD"},
1fdc9eae 124#endif /* RTPROT_BIRD */
d62a17ae 125 {RTPROT_MROUTED, "mroute"},
126 {RTPROT_BGP, "BGP"},
127 {RTPROT_OSPF, "OSPF"},
128 {RTPROT_ISIS, "IS-IS"},
129 {RTPROT_RIP, "RIP"},
130 {RTPROT_RIPNG, "RIPNG"},
d4d71f11 131 {RTPROT_ZSTATIC, "static"},
d62a17ae 132 {0}};
133
134static const struct message family_str[] = {{AF_INET, "ipv4"},
135 {AF_INET6, "ipv6"},
136 {AF_BRIDGE, "bridge"},
137 {RTNL_FAMILY_IPMR, "ipv4MR"},
138 {RTNL_FAMILY_IP6MR, "ipv6MR"},
139 {0}};
140
8c8f250b
DS
141static const struct message rttype_str[] = {{RTN_UNSPEC, "none"},
142 {RTN_UNICAST, "unicast"},
143 {RTN_LOCAL, "local"},
144 {RTN_BROADCAST, "broadcast"},
145 {RTN_ANYCAST, "anycast"},
d62a17ae 146 {RTN_MULTICAST, "multicast"},
8c8f250b
DS
147 {RTN_BLACKHOLE, "blackhole"},
148 {RTN_UNREACHABLE, "unreachable"},
149 {RTN_PROHIBIT, "prohibited"},
150 {RTN_THROW, "throw"},
151 {RTN_NAT, "nat"},
152 {RTN_XRESOLVE, "resolver"},
d62a17ae 153 {0}};
b339bde7 154
1fdc9eae 155extern struct thread_master *master;
d7c0a89a 156extern uint32_t nl_rcvbufsize;
1fdc9eae 157
158extern struct zebra_privs_t zserv_privs;
159
bf8d3d6a 160DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers");
531c92b8 161
34869809
MS
162/* Hashtable and mutex to allow lookup of nlsock structs by socket/fd value.
163 * We have both the main and dplane pthreads using these structs, so we have
164 * to protect the hash with a lock.
165 */
166static struct hash *nlsock_hash;
167pthread_mutex_t nlsock_mutex;
168
169/* Lock and unlock wrappers for nlsock hash */
170#define NLSOCK_LOCK() pthread_mutex_lock(&nlsock_mutex)
171#define NLSOCK_UNLOCK() pthread_mutex_unlock(&nlsock_mutex)
172
531c92b8
JU
173size_t nl_batch_tx_bufsize;
174char *nl_batch_tx_buf;
175
531c92b8
JU
176_Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
177_Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
178
e63c7622
JU
179struct nl_batch {
180 void *buf;
181 size_t bufsiz;
182 size_t limit;
183
184 void *buf_head;
185 size_t curlen;
186 size_t msgcnt;
187
188 const struct zebra_dplane_info *zns;
e63c7622 189
f6feb48b 190 struct dplane_ctx_q ctx_list;
e63c7622 191
f6feb48b
JU
192 /*
193 * Pointer to the queue of completed contexts outbound back
194 * towards the dataplane module.
195 */
196 struct dplane_ctx_q *ctx_out_q;
e63c7622
JU
197};
198
531c92b8
JU
199int netlink_config_write_helper(struct vty *vty)
200{
201 uint32_t size =
202 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
203 uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
204 memory_order_relaxed);
205
206 if (size != NL_DEFAULT_BATCH_BUFSIZE
207 || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
208 vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
209 threshold);
210
211 return 0;
212}
213
214void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
215{
216 if (!set) {
217 size = NL_DEFAULT_BATCH_BUFSIZE;
218 threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
219 }
220
221 atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
222 atomic_store_explicit(&nl_batch_send_threshold, threshold,
223 memory_order_relaxed);
224}
225
2414abd3 226int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
1fdc9eae 227{
3575d9e8
DS
228 /*
229 * This is an error condition that must be handled during
230 * development.
231 *
232 * The netlink_talk_filter function is used for communication
233 * down the netlink_cmd pipe and we are expecting
234 * an ack being received. So if we get here
235 * then we did not receive the ack and instead
236 * received some other message in an unexpected
237 * way.
238 */
43e52561
QY
239 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", __func__,
240 h->nlmsg_type, nl_msg_type_to_str(h->nlmsg_type), ns_id);
d62a17ae 241 return 0;
1fdc9eae 242}
243
d62a17ae 244static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
1fdc9eae 245{
d7c0a89a 246 uint32_t oldsize;
d62a17ae 247 socklen_t newlen = sizeof(newsize);
248 socklen_t oldlen = sizeof(oldsize);
249 int ret;
250
251 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &oldsize, &oldlen);
252 if (ret < 0) {
450971aa 253 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
254 "Can't get %s receive buffer size: %s", nl->name,
255 safe_strerror(errno));
d62a17ae 256 return -1;
257 }
258
259 /* Try force option (linux >= 2.6.14) and fall back to normal set */
0cf6db21 260 frr_with_privs(&zserv_privs) {
01b9e3fd
DL
261 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUFFORCE,
262 &nl_rcvbufsize,
263 sizeof(nl_rcvbufsize));
264 }
d62a17ae 265 if (ret < 0)
266 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF,
267 &nl_rcvbufsize, sizeof(nl_rcvbufsize));
268 if (ret < 0) {
450971aa 269 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
270 "Can't set %s receive buffer size: %s", nl->name,
271 safe_strerror(errno));
d62a17ae 272 return -1;
273 }
274
275 ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &newsize, &newlen);
276 if (ret < 0) {
450971aa 277 flog_err_sys(EC_LIB_SOCKET,
09c866e3
QY
278 "Can't get %s receive buffer size: %s", nl->name,
279 safe_strerror(errno));
d62a17ae 280 return -1;
281 }
d62a17ae 282 return 0;
1fdc9eae 283}
284
285/* Make socket for Linux netlink interface. */
d62a17ae 286static int netlink_socket(struct nlsock *nl, unsigned long groups,
287 ns_id_t ns_id)
1fdc9eae 288{
d62a17ae 289 int ret;
290 struct sockaddr_nl snl;
291 int sock;
292 int namelen;
d62a17ae 293
0cf6db21 294 frr_with_privs(&zserv_privs) {
6bb30c2c
DL
295 sock = ns_socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, ns_id);
296 if (sock < 0) {
297 zlog_err("Can't open %s socket: %s", nl->name,
298 safe_strerror(errno));
299 return -1;
300 }
d62a17ae 301
0d6f7fd6 302 memset(&snl, 0, sizeof(snl));
6bb30c2c
DL
303 snl.nl_family = AF_NETLINK;
304 snl.nl_groups = groups;
d62a17ae 305
6bb30c2c 306 /* Bind the socket to the netlink structure for anything. */
0d6f7fd6 307 ret = bind(sock, (struct sockaddr *)&snl, sizeof(snl));
6bb30c2c 308 }
d62a17ae 309
310 if (ret < 0) {
6bb30c2c
DL
311 zlog_err("Can't bind %s socket to group 0x%x: %s", nl->name,
312 snl.nl_groups, safe_strerror(errno));
d62a17ae 313 close(sock);
314 return -1;
315 }
316
317 /* multiple netlink sockets will have different nl_pid */
0d6f7fd6 318 namelen = sizeof(snl);
d62a17ae 319 ret = getsockname(sock, (struct sockaddr *)&snl, (socklen_t *)&namelen);
0d6f7fd6 320 if (ret < 0 || namelen != sizeof(snl)) {
450971aa 321 flog_err_sys(EC_LIB_SOCKET, "Can't get %s socket name: %s",
09c866e3 322 nl->name, safe_strerror(errno));
d62a17ae 323 close(sock);
324 return -1;
325 }
326
327 nl->snl = snl;
328 nl->sock = sock;
2cf7651f
DS
329 nl->buflen = NL_RCV_PKT_BUF_SIZE;
330 nl->buf = XMALLOC(MTYPE_NL_BUF, nl->buflen);
331
d62a17ae 332 return ret;
1fdc9eae 333}
334
d166308b
MS
335/*
336 * Dispatch an incoming netlink message; used by the zebra main pthread's
337 * netlink event reader.
338 */
2414abd3 339static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
d62a17ae 340 int startup)
1fdc9eae 341{
3575d9e8
DS
342 /*
343 * When we handle new message types here
344 * because we are starting to install them
345 * then lets check the netlink_install_filter
346 * and see if we should add the corresponding
347 * allow through entry there.
348 * Probably not needed to do but please
349 * think about it.
350 */
d62a17ae 351 switch (h->nlmsg_type) {
352 case RTM_NEWROUTE:
2414abd3 353 return netlink_route_change(h, ns_id, startup);
d62a17ae 354 case RTM_DELROUTE:
2414abd3 355 return netlink_route_change(h, ns_id, startup);
d62a17ae 356 case RTM_NEWLINK:
2414abd3 357 return netlink_link_change(h, ns_id, startup);
d62a17ae 358 case RTM_DELLINK:
2414abd3 359 return netlink_link_change(h, ns_id, startup);
d62a17ae 360 case RTM_NEWNEIGH:
d62a17ae 361 case RTM_DELNEIGH:
951f8bcb 362 case RTM_GETNEIGH:
7a52f27e 363 return netlink_neigh_change(h, ns_id);
942bf97b 364 case RTM_NEWRULE:
2414abd3 365 return netlink_rule_change(h, ns_id, startup);
942bf97b 366 case RTM_DELRULE:
2414abd3 367 return netlink_rule_change(h, ns_id, startup);
79580b5a 368 case RTM_NEWNEXTHOP:
d9f5b2f5 369 return netlink_nexthop_change(h, ns_id, startup);
79580b5a 370 case RTM_DELNEXTHOP:
d9f5b2f5 371 return netlink_nexthop_change(h, ns_id, startup);
d166308b
MS
372
373 /* Messages handled in the dplane thread */
374 case RTM_NEWADDR:
375 case RTM_DELADDR:
376 return 0;
377
d62a17ae 378 default:
3575d9e8
DS
379 /*
380 * If we have received this message then
381 * we have made a mistake during development
382 * and we need to write some code to handle
383 * this message type or not ask for
384 * it to be sent up to us
385 */
e914ccbe 386 flog_err(EC_ZEBRA_UNKNOWN_NLMSG,
1d5453d6 387 "Unknown netlink nlmsg_type %s(%d) vrf %u",
1c50c1c0
QY
388 nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
389 ns_id);
d62a17ae 390 break;
391 }
392 return 0;
1fdc9eae 393}
394
d166308b
MS
395/*
396 * Dispatch an incoming netlink message; used by the dataplane pthread's
397 * netlink event reader code.
398 */
399static int dplane_netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
400 int startup)
401{
402 /*
403 * Dispatch the incoming messages that the dplane pthread handles
404 */
405 switch (h->nlmsg_type) {
406 case RTM_NEWADDR:
407 case RTM_DELADDR:
408 return netlink_interface_addr_dplane(h, ns_id, startup);
409
410 /* TODO */
411 case RTM_NEWLINK:
412 case RTM_DELLINK:
413
414 default:
415 break;
416 }
417
418 return 0;
419}
420
cc9f21da 421static void kernel_read(struct thread *thread)
1fdc9eae 422{
d62a17ae 423 struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
85a75f1e
MS
424 struct zebra_dplane_info dp_info;
425
426 /* Capture key info from ns struct */
427 zebra_dplane_info_from_zns(&dp_info, zns, false);
428
429 netlink_parse_info(netlink_information_fetch, &zns->netlink, &dp_info,
9bfadae8 430 5, false);
d166308b 431
3801e764 432 thread_add_read(zrouter.master, kernel_read, zns, zns->netlink.sock,
d62a17ae 433 &zns->t_netlink);
1fdc9eae 434}
435
d166308b
MS
436/*
437 * Called by the dplane pthread to read incoming OS messages and dispatch them.
438 */
439int kernel_dplane_read(struct zebra_dplane_info *info)
440{
d4000d7b
DS
441 struct nlsock *nl = kernel_netlink_nlsock_lookup(info->sock);
442
443 netlink_parse_info(dplane_netlink_information_fetch, nl, info, 5,
444 false);
d166308b
MS
445
446 return 0;
447}
448
3575d9e8
DS
449/*
450 * Filter out messages from self that occur on listener socket,
62b8bb7a 451 * caused by our actions on the command socket(s)
3575d9e8
DS
452 *
453 * When we add new Netlink message types we probably
454 * do not need to add them here as that we are filtering
455 * on the routes we actually care to receive( which is rarer
456 * then the normal course of operations). We are intentionally
457 * allowing some messages from ourselves through
458 * ( I'm looking at you Interface based netlink messages )
459 * so that we only had to write one way to handle incoming
460 * address add/delete changes.
1fdc9eae 461 */
ff45112c 462static void netlink_install_filter(int sock, uint32_t pid, uint32_t dplane_pid)
1fdc9eae 463{
3575d9e8
DS
464 /*
465 * BPF_JUMP instructions and where you jump to are based upon
466 * 0 as being the next statement. So count from 0. Writing
467 * this down because every time I look at this I have to
468 * re-remember it.
469 */
d62a17ae 470 struct sock_filter filter[] = {
3575d9e8
DS
471 /*
472 * Logic:
62b8bb7a
MS
473 * if (nlmsg_pid == pid ||
474 * nlmsg_pid == dplane_pid) {
3575d9e8
DS
475 * if (the incoming nlmsg_type ==
476 * RTM_NEWADDR | RTM_DELADDR)
477 * keep this message
478 * else
479 * skip this message
480 * } else
481 * keep this netlink message
482 */
483 /*
484 * 0: Load the nlmsg_pid into the BPF register
485 */
d62a17ae 486 BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
487 offsetof(struct nlmsghdr, nlmsg_pid)),
3575d9e8
DS
488 /*
489 * 1: Compare to pid
490 */
62b8bb7a 491 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
3575d9e8 492 /*
62b8bb7a
MS
493 * 2: Compare to dplane pid
494 */
495 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
496 /*
497 * 3: Load the nlmsg_type into BPF register
3575d9e8
DS
498 */
499 BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
500 offsetof(struct nlmsghdr, nlmsg_type)),
501 /*
62b8bb7a 502 * 4: Compare to RTM_NEWADDR
3575d9e8
DS
503 */
504 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
505 /*
62b8bb7a 506 * 5: Compare to RTM_DELADDR
3575d9e8
DS
507 */
508 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
509 /*
62b8bb7a 510 * 6: This is the end state of we want to skip the
3575d9e8
DS
511 * message
512 */
d62a17ae 513 BPF_STMT(BPF_RET | BPF_K, 0),
62b8bb7a 514 /* 7: This is the end state of we want to keep
3575d9e8
DS
515 * the message
516 */
d62a17ae 517 BPF_STMT(BPF_RET | BPF_K, 0xffff),
518 };
519
520 struct sock_fprog prog = {
9d303b37 521 .len = array_size(filter), .filter = filter,
d62a17ae 522 };
523
524 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))
525 < 0)
1d5453d6 526 flog_err_sys(EC_LIB_SOCKET, "Can't install socket filter: %s",
9df414fe 527 safe_strerror(errno));
1fdc9eae 528}
529
d166308b
MS
/*
 * Index a run of route attributes by type, masking 'flags' out of each
 * attribute's type first.
 *
 * tb    -> output array of max + 1 entries; cleared before parsing
 * max   -> highest attribute type that is stored
 * rta   -> first attribute
 * len   -> number of bytes of attribute data
 * flags -> bits cleared from rta_type before it is used as an index
 *
 * When a type occurs more than once the first occurrence is kept.
 */
void netlink_parse_rtattr_flags(struct rtattr **tb, int max, struct rtattr *rta,
				int len, unsigned short flags)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		const unsigned short attr_type = rta->rta_type & ~flags;

		/* Skip out-of-range types and slots that are already taken */
		if (attr_type > max || tb[attr_type])
			continue;

		tb[attr_type] = rta;
	}
}
543
/*
 * Index a run of route attributes by type.
 *
 * tb  -> output array of max + 1 entries; cleared before parsing
 * max -> highest attribute type that is stored
 * rta -> first attribute
 * len -> number of bytes of attribute data
 *
 * When a type occurs more than once the last occurrence is kept.
 */
void netlink_parse_rtattr(struct rtattr **tb, int max, struct rtattr *rta,
			  int len)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		/* Types above 'max' have no slot and are skipped */
		if (rta->rta_type > max)
			continue;

		tb[rta->rta_type] = rta;
	}
}
554
87da6a60
SW
555/**
556 * netlink_parse_rtattr_nested() - Parses a nested route attribute
557 * @tb: Pointer to array for storing rtattr in.
558 * @max: Max number to store.
559 * @rta: Pointer to rtattr to look for nested items in.
560 */
void netlink_parse_rtattr_nested(struct rtattr **tb, int max,
				 struct rtattr *rta)
{
	/* The nested attributes are the payload of the enclosing one */
	struct rtattr *inner = RTA_DATA(rta);
	int inner_len = RTA_PAYLOAD(rta);

	netlink_parse_rtattr(tb, max, inner, inner_len);
}
566
312a6bee
JU
/*
 * Append one attribute to the end of a netlink message.
 *
 * n      -> message being built; nlmsg_len is advanced on success
 * maxlen -> total space available for the message, in bytes
 * type   -> attribute type
 * data   -> attribute payload; may be NULL only when alen is 0
 * alen   -> payload length in bytes
 *
 * Returns false, leaving the message untouched, when the aligned
 * attribute would not fit within maxlen; true otherwise.
 */
bool nl_attr_put(struct nlmsghdr *n, unsigned int maxlen, int type,
		 const void *data, unsigned int alen)
{
	const int attr_len = RTA_LENGTH(alen);
	const unsigned int msg_end = NLMSG_ALIGN(n->nlmsg_len);
	const unsigned int attr_space = RTA_ALIGN(attr_len);
	struct rtattr *attr;

	if (msg_end + attr_space > maxlen)
		return false;

	/* The new attribute starts at the aligned end of the message */
	attr = (struct rtattr *)((char *)n + msg_end);
	attr->rta_type = type;
	attr->rta_len = attr_len;

	if (!data)
		assert(alen == 0);
	else
		memcpy(RTA_DATA(attr), data, alen);

	n->nlmsg_len = msg_end + attr_space;

	return true;
}
591
94d70a65
DS
/* Append an attribute whose payload is a single 8-bit value. */
bool nl_attr_put8(struct nlmsghdr *n, unsigned int maxlen, int type,
		  uint8_t data)
{
	const uint8_t payload = data;

	return nl_attr_put(n, maxlen, type, &payload, sizeof(payload));
}
597
312a6bee
JU
/* Append an attribute whose payload is a 16-bit value, stored as passed. */
bool nl_attr_put16(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint16_t data)
{
	const uint16_t payload = data;

	return nl_attr_put(n, maxlen, type, &payload, sizeof(payload));
}
603
312a6bee
JU
/* Append an attribute whose payload is a 32-bit value, stored as passed. */
bool nl_attr_put32(struct nlmsghdr *n, unsigned int maxlen, int type,
		   uint32_t data)
{
	const uint32_t payload = data;

	return nl_attr_put(n, maxlen, type, &payload, sizeof(payload));
}
609
/*
 * Open a nested attribute of the given type at the end of the message.
 * Returns the nest header, to be passed to nl_attr_nest_end() later,
 * or NULL when the header does not fit within maxlen.
 */
struct rtattr *nl_attr_nest(struct nlmsghdr *n, unsigned int maxlen, int type)
{
	/* Remember where the header lands before the message grows */
	struct rtattr *nest_hdr = NLMSG_TAIL(n);
	const bool added = nl_attr_put(n, maxlen, type, NULL, 0);

	if (!added)
		return NULL;

	nest_hdr->rta_type |= NLA_F_NESTED;

	return nest_hdr;
}
620
/*
 * Close a nested attribute: its length becomes the distance from the
 * nest header to the aligned end of the message. Returns the message's
 * current nlmsg_len.
 */
int nl_attr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
{
	const uint8_t *msg_tail =
		(const uint8_t *)n + NLMSG_ALIGN(n->nlmsg_len);

	nest->rta_len = msg_tail - (const uint8_t *)nest;

	return n->nlmsg_len;
}
626
/*
 * Reserve a zeroed struct rtnexthop at the end of the message.
 *
 * n      -> message being built; nlmsg_len is advanced on success
 * maxlen -> total space available for the message, in bytes
 *
 * Returns a pointer to the new rtnexthop, or NULL (message untouched)
 * when it does not fit within maxlen.
 *
 * The same RTNH_ALIGN'd size is used for the bounds check and for the
 * length update, so the two cannot drift apart.
 */
struct rtnexthop *nl_attr_rtnh(struct nlmsghdr *n, unsigned int maxlen)
{
	const unsigned int msg_end = NLMSG_ALIGN(n->nlmsg_len);
	const unsigned int rtnh_space = RTNH_ALIGN(sizeof(struct rtnexthop));
	struct rtnexthop *rtnh = (struct rtnexthop *)((uint8_t *)n + msg_end);

	if (msg_end + rtnh_space > maxlen)
		return NULL;

	memset(rtnh, 0, sizeof(struct rtnexthop));
	n->nlmsg_len = msg_end + rtnh_space;

	return rtnh;
}
641
/*
 * Finalize an rtnexthop: its length becomes the distance from the
 * rtnexthop to the aligned end of the message.
 */
void nl_attr_rtnh_end(struct nlmsghdr *n, struct rtnexthop *rtnh)
{
	const uint8_t *msg_tail =
		(const uint8_t *)n + NLMSG_ALIGN(n->nlmsg_len);

	rtnh->rtnh_len = msg_tail - (const uint8_t *)rtnh;
}
646
d62a17ae 647const char *nl_msg_type_to_str(uint16_t msg_type)
1fdc9eae 648{
d62a17ae 649 return lookup_msg(nlmsg_str, msg_type, "");
1fdc9eae 650}
651
d7c0a89a 652const char *nl_rtproto_to_str(uint8_t rtproto)
1fdc9eae 653{
d62a17ae 654 return lookup_msg(rtproto_str, rtproto, "");
1fdc9eae 655}
b339bde7 656
d7c0a89a 657const char *nl_family_to_str(uint8_t family)
b339bde7 658{
d62a17ae 659 return lookup_msg(family_str, family, "");
b339bde7
DS
660}
661
d7c0a89a 662const char *nl_rttype_to_str(uint8_t rttype)
b339bde7 663{
d62a17ae 664 return lookup_msg(rttype_str, rttype, "");
b339bde7
DS
665}
666
4cebb2b6 667#define NLA_OK(nla, len) \
5d307d5d
DS
668 ((len) >= (int)sizeof(struct nlattr) \
669 && (nla)->nla_len >= sizeof(struct nlattr) \
670 && (nla)->nla_len <= (len))
4cebb2b6
SW
671#define NLA_NEXT(nla, attrlen) \
672 ((attrlen) -= NLA_ALIGN((nla)->nla_len), \
673 (struct nlattr *)(((char *)(nla)) + NLA_ALIGN((nla)->nla_len)))
674#define NLA_LENGTH(len) (NLA_ALIGN(sizeof(struct nlattr)) + (len))
675#define NLA_DATA(nla) ((struct nlattr *)(((char *)(nla)) + NLA_LENGTH(0)))
676
677#define ERR_NLA(err, inner_len) \
678 ((struct nlattr *)(((char *)(err)) \
679 + NLMSG_ALIGN(sizeof(struct nlmsgerr)) \
680 + NLMSG_ALIGN((inner_len))))
5d307d5d
DS
681
/*
 * Index a run of struct nlattr by type. Unlike the rtattr parsers, 'tb'
 * is not cleared here; the caller supplies an initialized array. When a
 * type occurs more than once the last occurrence is kept.
 */
static void netlink_parse_nlattr(struct nlattr **tb, int max,
				 struct nlattr *nla, int len)
{
	for (; NLA_OK(nla, len); nla = NLA_NEXT(nla, len)) {
		/* Types above 'max' have no slot and are skipped */
		if (nla->nla_type > max)
			continue;

		tb[nla->nla_type] = nla;
	}
}
691
692static void netlink_parse_extended_ack(struct nlmsghdr *h)
693{
4cebb2b6
SW
694 struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
695 const struct nlmsgerr *err = (const struct nlmsgerr *)NLMSG_DATA(h);
5d307d5d 696 const struct nlmsghdr *err_nlh = NULL;
4cebb2b6
SW
697 /* Length not including nlmsghdr */
698 uint32_t len = 0;
699 /* Inner error netlink message length */
700 uint32_t inner_len = 0;
5d307d5d
DS
701 const char *msg = NULL;
702 uint32_t off = 0;
703
704 if (!(h->nlmsg_flags & NLM_F_CAPPED))
4cebb2b6
SW
705 inner_len = (uint32_t)NLMSG_PAYLOAD(&err->msg, 0);
706
707 len = (uint32_t)(NLMSG_PAYLOAD(h, sizeof(struct nlmsgerr)) - inner_len);
5d307d5d 708
4cebb2b6
SW
709 netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX, ERR_NLA(err, inner_len),
710 len);
5d307d5d
DS
711
712 if (tb[NLMSGERR_ATTR_MSG])
4cebb2b6 713 msg = (const char *)NLA_DATA(tb[NLMSGERR_ATTR_MSG]);
5d307d5d
DS
714
715 if (tb[NLMSGERR_ATTR_OFFS]) {
4cebb2b6 716 off = *(uint32_t *)NLA_DATA(tb[NLMSGERR_ATTR_OFFS]);
5d307d5d
DS
717
718 if (off > h->nlmsg_len) {
9165c5f5 719 zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS");
5d307d5d
DS
720 } else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
721 /*
722 * Header of failed message
723 * we are not doing anything currently with it
724 * but noticing it for later.
725 */
726 err_nlh = &err->msg;
15569c58 727 zlog_debug("%s: Received %s extended Ack", __func__,
87b5d1b0 728 nl_msg_type_to_str(err_nlh->nlmsg_type));
5d307d5d
DS
729 }
730 }
731
732 if (msg && *msg != '\0') {
733 bool is_err = !!err->error;
734
735 if (is_err)
736 zlog_err("Extended Error: %s", msg);
737 else
e914ccbe 738 flog_warn(EC_ZEBRA_NETLINK_EXTENDED_WARNING,
9df414fe 739 "Extended Warning: %s", msg);
5d307d5d
DS
740 }
741}
742
ae6138bf
JU
743/*
744 * netlink_send_msg - send a netlink message of a certain size.
745 *
746 * Returns -1 on error. Otherwise, it returns the number of bytes sent.
747 */
f8653393
JU
748static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
749 size_t buflen)
ae6138bf 750{
f8653393
JU
751 struct sockaddr_nl snl = {};
752 struct iovec iov = {};
753 struct msghdr msg = {};
754 ssize_t status;
755 int save_errno = 0;
ae6138bf
JU
756
757 iov.iov_base = buf;
758 iov.iov_len = buflen;
f8653393 759 msg.msg_name = &snl;
ae6138bf
JU
760 msg.msg_namelen = sizeof(snl);
761 msg.msg_iov = &iov;
762 msg.msg_iovlen = 1;
763
764 snl.nl_family = AF_NETLINK;
765
766 /* Send message to netlink interface. */
767 frr_with_privs(&zserv_privs) {
768 status = sendmsg(nl->sock, &msg, 0);
769 save_errno = errno;
770 }
771
772 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_SEND) {
773 zlog_debug("%s: >> netlink message dump [sent]", __func__);
eead0bc4
RZ
774#ifdef NETLINK_DEBUG
775 nl_dump(buf, buflen);
776#else
ae6138bf 777 zlog_hexdump(buf, buflen);
eead0bc4 778#endif /* NETLINK_DEBUG */
ae6138bf
JU
779 }
780
f8653393 781 if (status == -1) {
ae6138bf
JU
782 flog_err_sys(EC_LIB_SOCKET, "%s error: %s", __func__,
783 safe_strerror(save_errno));
784 return -1;
785 }
786
787 return status;
788}
789
790/*
791 * netlink_recv_msg - receive a netlink message.
792 *
793 * Returns -1 on error, 0 if read would block or the number of bytes received.
794 */
2cf7651f 795static int netlink_recv_msg(struct nlsock *nl, struct msghdr *msg)
ae6138bf
JU
796{
797 struct iovec iov;
798 int status;
799
2cf7651f
DS
800 iov.iov_base = nl->buf;
801 iov.iov_len = nl->buflen;
802 msg->msg_iov = &iov;
803 msg->msg_iovlen = 1;
ae6138bf
JU
804
805 do {
2cf7651f
DS
806 int bytes;
807
808 bytes = recv(nl->sock, NULL, 0, MSG_PEEK | MSG_TRUNC);
809
810 if (bytes >= 0 && (size_t)bytes > nl->buflen) {
811 nl->buf = XREALLOC(MTYPE_NL_BUF, nl->buf, bytes);
812 nl->buflen = bytes;
813 iov.iov_base = nl->buf;
814 iov.iov_len = nl->buflen;
815 }
816
817 status = recvmsg(nl->sock, msg, 0);
f8653393 818 } while (status == -1 && errno == EINTR);
ae6138bf 819
f8653393 820 if (status == -1) {
ae6138bf
JU
821 if (errno == EWOULDBLOCK || errno == EAGAIN)
822 return 0;
823 flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s",
824 nl->name, safe_strerror(errno));
825 /*
826 * In this case we are screwed. There is no good way to recover
827 * zebra at this point.
828 */
829 exit(-1);
830 }
831
832 if (status == 0) {
833 flog_err_sys(EC_LIB_SOCKET, "%s EOF", nl->name);
834 return -1;
835 }
836
2cf7651f 837 if (msg->msg_namelen != sizeof(struct sockaddr_nl)) {
ae6138bf
JU
838 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
839 "%s sender address length error: length %d", nl->name,
2cf7651f 840 msg->msg_namelen);
ae6138bf
JU
841 return -1;
842 }
843
844 if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
845 zlog_debug("%s: << netlink message dump [recv]", __func__);
eead0bc4 846#ifdef NETLINK_DEBUG
2cf7651f 847 nl_dump(nl->buf, status);
eead0bc4 848#else
2cf7651f 849 zlog_hexdump(nl->buf, status);
eead0bc4 850#endif /* NETLINK_DEBUG */
ae6138bf
JU
851 }
852
ae6138bf
JU
853 return status;
854}
855
856/*
857 * netlink_parse_error - parse a netlink error message
858 *
859 * Returns 1 if this message is acknowledgement, 0 if this error should be
860 * ignored, -1 otherwise.
861 */
862static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
d166308b 863 bool is_cmd, bool startup)
ae6138bf
JU
864{
865 struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(h);
866 int errnum = err->error;
867 int msg_type = err->msg.nlmsg_type;
868
869 if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
870 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
871 "%s error: message truncated", nl->name);
872 return -1;
873 }
874
875 /*
876 * Parse the extended information before we actually handle it. At this
877 * point in time we do not do anything other than report the issue.
878 */
879 if (h->nlmsg_flags & NLM_F_ACK_TLVS)
880 netlink_parse_extended_ack(h);
881
882 /* If the error field is zero, then this is an ACK. */
883 if (err->error == 0) {
884 if (IS_ZEBRA_DEBUG_KERNEL) {
885 zlog_debug("%s: %s ACK: type=%s(%u), seq=%u, pid=%u",
886 __func__, nl->name,
887 nl_msg_type_to_str(err->msg.nlmsg_type),
888 err->msg.nlmsg_type, err->msg.nlmsg_seq,
889 err->msg.nlmsg_pid);
890 }
891
892 return 1;
893 }
894
895 /* Deal with errors that occur because of races in link handling. */
d166308b 896 if (is_cmd
ae6138bf
JU
897 && ((msg_type == RTM_DELROUTE
898 && (-errnum == ENODEV || -errnum == ESRCH))
899 || (msg_type == RTM_NEWROUTE
900 && (-errnum == ENETDOWN || -errnum == EEXIST)))) {
901 if (IS_ZEBRA_DEBUG_KERNEL)
902 zlog_debug("%s: error: %s type=%s(%u), seq=%u, pid=%u",
903 nl->name, safe_strerror(-errnum),
904 nl_msg_type_to_str(msg_type), msg_type,
905 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
906 return 0;
907 }
908
909 /*
910 * We see RTM_DELNEIGH when shutting down an interface with an IPv4
911 * link-local. The kernel should have already deleted the neighbor so
912 * do not log these as an error.
913 */
914 if (msg_type == RTM_DELNEIGH
d166308b 915 || (is_cmd && msg_type == RTM_NEWROUTE
ae6138bf
JU
916 && (-errnum == ESRCH || -errnum == ENETUNREACH))) {
917 /*
918 * This is known to happen in some situations, don't log as
919 * error.
920 */
921 if (IS_ZEBRA_DEBUG_KERNEL)
922 zlog_debug("%s error: %s, type=%s(%u), seq=%u, pid=%u",
923 nl->name, safe_strerror(-errnum),
924 nl_msg_type_to_str(msg_type), msg_type,
925 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
926 } else {
927 if ((msg_type != RTM_GETNEXTHOP) || !startup)
928 flog_err(EC_ZEBRA_UNEXPECTED_MESSAGE,
929 "%s error: %s, type=%s(%u), seq=%u, pid=%u",
930 nl->name, safe_strerror(-errnum),
931 nl_msg_type_to_str(msg_type), msg_type,
932 err->msg.nlmsg_seq, err->msg.nlmsg_pid);
933 }
934
935 return -1;
936}
937
936ebf0a
DS
938/*
939 * netlink_parse_info
940 *
941 * Receive message from netlink interface and pass those information
942 * to the given function.
943 *
944 * filter -> Function to call to read the results
945 * nl -> netlink socket information
946 * zns -> The zebra namespace data
947 * count -> How many we should read in, 0 means as much as possible
948 * startup -> Are we reading in under startup conditions? passed to
949 * the filter.
950 */
2414abd3 951int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
2cf7651f 952 struct nlsock *nl, const struct zebra_dplane_info *zns,
9bfadae8 953 int count, bool startup)
1fdc9eae 954{
d62a17ae 955 int status;
956 int ret = 0;
957 int error;
958 int read_in = 0;
959
960 while (1) {
d62a17ae 961 struct sockaddr_nl snl;
962 struct msghdr msg = {.msg_name = (void *)&snl,
ae6138bf 963 .msg_namelen = sizeof(snl)};
d62a17ae 964 struct nlmsghdr *h;
965
966 if (count && read_in >= count)
967 return 0;
968
2cf7651f 969 status = netlink_recv_msg(nl, &msg);
ae6138bf 970 if (status == -1)
d62a17ae 971 return -1;
ae6138bf
JU
972 else if (status == 0)
973 break;
81a2f870 974
d62a17ae 975 read_in++;
2cf7651f 976 for (h = (struct nlmsghdr *)nl->buf;
e6a0e0d1 977 (status >= 0 && NLMSG_OK(h, (unsigned int)status));
d62a17ae 978 h = NLMSG_NEXT(h, status)) {
979 /* Finish of reading. */
980 if (h->nlmsg_type == NLMSG_DONE)
981 return ret;
982
983 /* Error handling. */
984 if (h->nlmsg_type == NLMSG_ERROR) {
d166308b
MS
985 int err = netlink_parse_error(
986 nl, h, zns->is_cmd, startup);
987
ae6138bf 988 if (err == 1) {
d62a17ae 989 if (!(h->nlmsg_flags & NLM_F_MULTI))
990 return 0;
991 continue;
ae6138bf
JU
992 } else
993 return err;
d62a17ae 994 }
995
996 /* OK we got netlink message. */
997 if (IS_ZEBRA_DEBUG_KERNEL)
998 zlog_debug(
d166308b
MS
999 "%s: %s type %s(%u), len=%d, seq=%u, pid=%u",
1000 __func__, nl->name,
d62a17ae 1001 nl_msg_type_to_str(h->nlmsg_type),
1002 h->nlmsg_type, h->nlmsg_len,
1003 h->nlmsg_seq, h->nlmsg_pid);
1004
783827ae
DS
1005
1006 /*
1007 * Ignore messages that maybe sent from
1008 * other actors besides the kernel
1009 */
1010 if (snl.nl_pid != 0) {
43e52561
QY
1011 zlog_debug("Ignoring message from pid %u",
1012 snl.nl_pid);
d62a17ae 1013 continue;
1014 }
1015
2414abd3 1016 error = (*filter)(h, zns->ns_id, startup);
d62a17ae 1017 if (error < 0) {
9df414fe
QY
1018 zlog_debug("%s filter function error",
1019 nl->name);
d62a17ae 1020 ret = error;
1021 }
1022 }
1023
1024 /* After error care. */
1025 if (msg.msg_flags & MSG_TRUNC) {
e914ccbe 1026 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0 1027 "%s error: message truncated", nl->name);
d62a17ae 1028 continue;
1029 }
1030 if (status) {
e914ccbe 1031 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1c50c1c0
QY
1032 "%s error: data remnant size %d", nl->name,
1033 status);
d62a17ae 1034 return -1;
1035 }
1036 }
1037 return ret;
1fdc9eae 1038}
1039
936ebf0a 1040/*
7cdb1a84 1041 * netlink_talk_info
936ebf0a
DS
1042 *
1043 * sendmsg() to netlink socket then recvmsg().
1044 * Calls netlink_parse_info to parse returned data
1045 *
1046 * filter -> The filter to read final results from kernel
1047 * nlmsghdr -> The data to send to the kernel
8b962e77 1048 * dp_info -> The dataplane and netlink socket information
936ebf0a
DS
1049 * startup -> Are we reading in under startup conditions
1050 * This is passed through eventually to filter.
1051 */
2cf7651f
DS
1052static int netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t,
1053 int startup),
1054 struct nlmsghdr *n,
1055 struct zebra_dplane_info *dp_info, bool startup)
1fdc9eae 1056{
d4000d7b 1057 struct nlsock *nl;
d62a17ae 1058
d4000d7b 1059 nl = kernel_netlink_nlsock_lookup(dp_info->sock);
3670f504 1060 n->nlmsg_seq = dp_info->seq;
d62a17ae 1061 n->nlmsg_pid = nl->snl.nl_pid;
1062
d62a17ae 1063 if (IS_ZEBRA_DEBUG_KERNEL)
1064 zlog_debug(
1065 "netlink_talk: %s type %s(%u), len=%d seq=%u flags 0x%x",
1066 nl->name, nl_msg_type_to_str(n->nlmsg_type),
1067 n->nlmsg_type, n->nlmsg_len, n->nlmsg_seq,
1068 n->nlmsg_flags);
1069
f8653393 1070 if (netlink_send_msg(nl, n, n->nlmsg_len) == -1)
d62a17ae 1071 return -1;
d62a17ae 1072
d62a17ae 1073 /*
1074 * Get reply from netlink socket.
1075 * The reply should either be an acknowlegement or an error.
1076 */
7cdb1a84
MS
1077 return netlink_parse_info(filter, nl, dp_info, 0, startup);
1078}
1079
1080/*
1081 * Synchronous version of netlink_talk_info. Converts args to suit the
1082 * common version, which is suitable for both sync and async use.
7cdb1a84
MS
1083 */
1084int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1085 struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
9bfadae8 1086 bool startup)
7cdb1a84
MS
1087{
1088 struct zebra_dplane_info dp_info;
1089
1090 /* Increment sequence number before capturing snapshot of ns socket
1091 * info.
1092 */
1093 nl->seq++;
1094
1095 /* Capture info in intermediate info struct */
85a75f1e 1096 zebra_dplane_info_from_zns(&dp_info, zns, (nl == &(zns->netlink_cmd)));
7cdb1a84 1097
5709131c 1098 return netlink_talk_info(filter, n, &dp_info, startup);
1fdc9eae 1099}
1100
289602d7 1101/* Issue request message to kernel via netlink socket. GET messages
1102 * are issued through this interface.
1103 */
fd3f8e52 1104int netlink_request(struct nlsock *nl, void *req)
1fdc9eae 1105{
fd3f8e52 1106 struct nlmsghdr *n = (struct nlmsghdr *)req;
d62a17ae 1107
1108 /* Check netlink socket. */
1109 if (nl->sock < 0) {
450971aa 1110 flog_err_sys(EC_LIB_SOCKET, "%s socket isn't active.",
09c866e3 1111 nl->name);
d62a17ae 1112 return -1;
1113 }
1114
1115 /* Fill common fields for all requests. */
d62a17ae 1116 n->nlmsg_pid = nl->snl.nl_pid;
1117 n->nlmsg_seq = ++nl->seq;
1118
f8653393 1119 if (netlink_send_msg(nl, req, n->nlmsg_len) == -1)
d62a17ae 1120 return -1;
d62a17ae 1121
1122 return 0;
1fdc9eae 1123}
1124
e63c7622
JU
1125static int nl_batch_read_resp(struct nl_batch *bth)
1126{
1127 struct nlmsghdr *h;
1128 struct sockaddr_nl snl;
9d06e121 1129 struct msghdr msg = {};
f6feb48b 1130 int status, seq;
d4000d7b 1131 struct nlsock *nl;
f6feb48b
JU
1132 struct zebra_dplane_ctx *ctx;
1133 bool ignore_msg;
e63c7622 1134
d4000d7b 1135 nl = kernel_netlink_nlsock_lookup(bth->zns->sock);
e63c7622
JU
1136
1137 msg.msg_name = (void *)&snl;
1138 msg.msg_namelen = sizeof(snl);
1139
2f9dbd3a
JU
1140 /*
1141 * The responses are not batched, so we need to read and process one
1142 * message at a time.
1143 */
1144 while (true) {
2cf7651f 1145 status = netlink_recv_msg(nl, &msg);
00249e25
DS
1146 /*
1147 * status == -1 is a full on failure somewhere
1148 * since we don't know where the problem happened
1149 * we must mark all as failed
1150 *
1151 * Else we mark everything as worked
1152 *
1153 */
1154 if (status == -1 || status == 0) {
1155 while ((ctx = dplane_ctx_dequeue(&(bth->ctx_list))) !=
1156 NULL) {
1157 if (status == -1)
1158 dplane_ctx_set_status(
1159 ctx,
1160 ZEBRA_DPLANE_REQUEST_FAILURE);
1161 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1162 }
2f9dbd3a 1163 return status;
00249e25 1164 }
e63c7622 1165
2cf7651f 1166 h = (struct nlmsghdr *)nl->buf;
f6feb48b
JU
1167 ignore_msg = false;
1168 seq = h->nlmsg_seq;
e63c7622 1169 /*
f6feb48b
JU
1170 * Find the corresponding context object. Received responses are
1171 * in the same order as requests we sent, so we can simply
1172 * iterate over the context list and match responses with
1173 * requests at same time.
e63c7622 1174 */
f6feb48b 1175 while (true) {
c8453cd7
DS
1176 ctx = dplane_ctx_get_head(&(bth->ctx_list));
1177 if (ctx == NULL) {
1178 /*
1179 * This is a situation where we have gotten
1180 * into a bad spot. We need to know that
1181 * this happens( does it? )
1182 */
1183 zlog_err(
1184 "%s:WARNING Received netlink Response for an error and no Contexts to associate with it",
1185 __func__);
e63c7622 1186 break;
c8453cd7 1187 }
f6feb48b
JU
1188
1189 /*
1190 * 'update' context objects take two consecutive
1191 * sequence numbers.
1192 */
3670f504
DS
1193 if (dplane_ctx_is_update(ctx) &&
1194 dplane_ctx_get_ns(ctx)->seq + 1 == seq) {
f6feb48b
JU
1195 /*
1196 * This is the situation where we get a response
1197 * to a message that should be ignored.
1198 */
1199 ignore_msg = true;
1200 break;
1201 }
c8453cd7
DS
1202
1203 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1204 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1205
1206 /* We have found corresponding context object. */
3670f504 1207 if (dplane_ctx_get_ns(ctx)->seq == seq)
c8453cd7
DS
1208 break;
1209
3670f504 1210 if (dplane_ctx_get_ns(ctx)->seq > seq)
c8453cd7 1211 zlog_warn(
b9d95135 1212 "%s:WARNING Received %u is less than any context on the queue ctx->seq %u",
c8453cd7 1213 __func__, seq,
3670f504 1214 dplane_ctx_get_ns(ctx)->seq);
e63c7622
JU
1215 }
1216
c8453cd7
DS
1217 if (ignore_msg) {
1218 /*
1219 * If we ignore the message due to an update
1220 * above we should still fricking decode the
1221 * message for our operator to understand
1222 * what is going on
1223 */
1224 int err = netlink_parse_error(nl, h, bth->zns->is_cmd,
1225 false);
1226
1227 zlog_debug("%s: netlink error message seq=%d %d",
1228 __func__, h->nlmsg_seq, err);
f6feb48b 1229 continue;
c8453cd7 1230 }
f6feb48b 1231
e63c7622
JU
1232 /*
1233 * We received a message with the sequence number that isn't
1234 * associated with any dplane context object.
1235 */
f6feb48b 1236 if (ctx == NULL) {
4c99d413
MS
1237 if (IS_ZEBRA_DEBUG_KERNEL)
1238 zlog_debug(
1239 "%s: skipping unassociated response, seq number %d NS %u",
1240 __func__, h->nlmsg_seq,
1241 bth->zns->ns_id);
e63c7622
JU
1242 continue;
1243 }
1244
1245 if (h->nlmsg_type == NLMSG_ERROR) {
d166308b
MS
1246 int err = netlink_parse_error(nl, h, bth->zns->is_cmd,
1247 false);
e63c7622
JU
1248
1249 if (err == -1)
f6feb48b
JU
1250 dplane_ctx_set_status(
1251 ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1252
4c99d413
MS
1253 if (IS_ZEBRA_DEBUG_KERNEL)
1254 zlog_debug("%s: netlink error message seq=%d ",
1255 __func__, h->nlmsg_seq);
e63c7622
JU
1256 continue;
1257 }
1258
1259 /*
1260 * If we get here then we did not receive neither the ack nor
1261 * the error and instead received some other message in an
1262 * unexpected way.
1263 */
4c99d413
MS
1264 if (IS_ZEBRA_DEBUG_KERNEL)
1265 zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
1266 __func__, h->nlmsg_type,
1267 nl_msg_type_to_str(h->nlmsg_type),
1268 bth->zns->ns_id);
e63c7622
JU
1269 }
1270
1271 return 0;
1272}
1273
1274static void nl_batch_reset(struct nl_batch *bth)
1275{
e63c7622
JU
1276 bth->buf_head = bth->buf;
1277 bth->curlen = 0;
1278 bth->msgcnt = 0;
1279 bth->zns = NULL;
1280
f6feb48b 1281 TAILQ_INIT(&(bth->ctx_list));
e63c7622
JU
1282}
1283
f6feb48b 1284static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
e63c7622 1285{
531c92b8
JU
1286 /*
1287 * If the size of the buffer has changed, free and then allocate a new
1288 * one.
1289 */
1290 size_t bufsize =
1291 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
1292 if (bufsize != nl_batch_tx_bufsize) {
1293 if (nl_batch_tx_buf)
1294 XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
1295
1296 nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
1297 nl_batch_tx_bufsize = bufsize;
1298 }
1299
f6feb48b 1300 bth->buf = nl_batch_tx_buf;
531c92b8
JU
1301 bth->bufsiz = bufsize;
1302 bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
1303 memory_order_relaxed);
e63c7622 1304
f6feb48b 1305 bth->ctx_out_q = ctx_out_q;
e63c7622 1306
f6feb48b
JU
1307 nl_batch_reset(bth);
1308}
1309
1310static void nl_batch_send(struct nl_batch *bth)
1311{
1312 struct zebra_dplane_ctx *ctx;
1313 bool err = false;
e63c7622 1314
f6feb48b 1315 if (bth->curlen != 0 && bth->zns != NULL) {
d4000d7b
DS
1316 struct nlsock *nl =
1317 kernel_netlink_nlsock_lookup(bth->zns->sock);
1318
f6feb48b
JU
1319 if (IS_ZEBRA_DEBUG_KERNEL)
1320 zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
d4000d7b 1321 __func__, nl->name, bth->curlen,
f6feb48b 1322 bth->msgcnt);
e63c7622 1323
d4000d7b 1324 if (netlink_send_msg(nl, bth->buf, bth->curlen) == -1)
e63c7622 1325 err = true;
e63c7622 1326
f6feb48b
JU
1327 if (!err) {
1328 if (nl_batch_read_resp(bth) == -1)
1329 err = true;
1330 }
1331 }
e63c7622 1332
f6feb48b
JU
1333 /* Move remaining contexts to the outbound queue. */
1334 while (true) {
1335 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1336 if (ctx == NULL)
1337 break;
e63c7622 1338
f6feb48b
JU
1339 if (err)
1340 dplane_ctx_set_status(ctx,
1341 ZEBRA_DPLANE_REQUEST_FAILURE);
e63c7622 1342
f6feb48b 1343 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
e63c7622
JU
1344 }
1345
1346 nl_batch_reset(bth);
1347}
1348
e63c7622
JU
1349enum netlink_msg_status netlink_batch_add_msg(
1350 struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
1351 ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
f6feb48b 1352 bool ignore_res)
e63c7622
JU
1353{
1354 int seq;
1355 ssize_t size;
1356 struct nlmsghdr *msgh;
d4000d7b 1357 struct nlsock *nl;
e63c7622 1358
e63c7622
JU
1359 size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
1360
1361 /*
1362 * If there was an error while encoding the message (other than buffer
1363 * overflow) then return an error.
1364 */
1365 if (size < 0)
1366 return FRR_NETLINK_ERROR;
1367
1368 /*
1369 * If the message doesn't fit entirely in the buffer then send the batch
1370 * and retry.
1371 */
1372 if (size == 0) {
1373 nl_batch_send(bth);
1374 size = (*msg_encoder)(ctx, bth->buf_head,
1375 bth->bufsiz - bth->curlen);
1376 /*
1377 * If the message doesn't fit in the empty buffer then just
1378 * return an error.
1379 */
1380 if (size <= 0)
1381 return FRR_NETLINK_ERROR;
1382 }
1383
3670f504 1384 seq = dplane_ctx_get_ns(ctx)->seq;
d4000d7b
DS
1385 nl = kernel_netlink_nlsock_lookup(dplane_ctx_get_ns_sock(ctx));
1386
f6feb48b 1387 if (ignore_res)
e63c7622
JU
1388 seq++;
1389
1390 msgh = (struct nlmsghdr *)bth->buf_head;
1391 msgh->nlmsg_seq = seq;
d4000d7b 1392 msgh->nlmsg_pid = nl->snl.nl_pid;
e63c7622 1393
e63c7622
JU
1394 bth->zns = dplane_ctx_get_ns(ctx);
1395 bth->buf_head = ((char *)bth->buf_head) + size;
1396 bth->curlen += size;
1397 bth->msgcnt++;
1398
e63c7622
JU
1399 return FRR_NETLINK_QUEUED;
1400}
1401
67e3369e
JU
1402static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
1403 struct zebra_dplane_ctx *ctx)
1404{
1405 if (dplane_ctx_is_skip_kernel(ctx))
1406 return FRR_NETLINK_SUCCESS;
1407
1408 switch (dplane_ctx_get_op(ctx)) {
1409
1410 case DPLANE_OP_ROUTE_INSTALL:
1411 case DPLANE_OP_ROUTE_UPDATE:
1412 case DPLANE_OP_ROUTE_DELETE:
1413 return netlink_put_route_update_msg(bth, ctx);
1414
1415 case DPLANE_OP_NH_INSTALL:
1416 case DPLANE_OP_NH_UPDATE:
1417 case DPLANE_OP_NH_DELETE:
1418 return netlink_put_nexthop_update_msg(bth, ctx);
1419
1420 case DPLANE_OP_LSP_INSTALL:
1421 case DPLANE_OP_LSP_UPDATE:
1422 case DPLANE_OP_LSP_DELETE:
1423 return netlink_put_lsp_update_msg(bth, ctx);
1424
1425 case DPLANE_OP_PW_INSTALL:
1426 case DPLANE_OP_PW_UNINSTALL:
1427 return netlink_put_pw_update_msg(bth, ctx);
1428
1429 case DPLANE_OP_ADDR_INSTALL:
1430 case DPLANE_OP_ADDR_UNINSTALL:
1431 return netlink_put_address_update_msg(bth, ctx);
1432
1433 case DPLANE_OP_MAC_INSTALL:
1434 case DPLANE_OP_MAC_DELETE:
1435 return netlink_put_mac_update_msg(bth, ctx);
1436
1437 case DPLANE_OP_NEIGH_INSTALL:
1438 case DPLANE_OP_NEIGH_UPDATE:
1439 case DPLANE_OP_NEIGH_DELETE:
1440 case DPLANE_OP_VTEP_ADD:
1441 case DPLANE_OP_VTEP_DELETE:
d68e74b4 1442 case DPLANE_OP_NEIGH_DISCOVER:
0a27a2fe
PG
1443 case DPLANE_OP_NEIGH_IP_INSTALL:
1444 case DPLANE_OP_NEIGH_IP_DELETE:
e18747a9 1445 case DPLANE_OP_NEIGH_TABLE_UPDATE:
67e3369e
JU
1446 return netlink_put_neigh_update_msg(bth, ctx);
1447
1448 case DPLANE_OP_RULE_ADD:
1449 case DPLANE_OP_RULE_DELETE:
1450 case DPLANE_OP_RULE_UPDATE:
1451 return netlink_put_rule_update_msg(bth, ctx);
1452
1453 case DPLANE_OP_SYS_ROUTE_ADD:
1454 case DPLANE_OP_SYS_ROUTE_DELETE:
1455 case DPLANE_OP_ROUTE_NOTIFY:
1456 case DPLANE_OP_LSP_NOTIFY:
c60522f7 1457 case DPLANE_OP_BR_PORT_UPDATE:
67e3369e
JU
1458 return FRR_NETLINK_SUCCESS;
1459
5162e000
PG
1460 case DPLANE_OP_IPTABLE_ADD:
1461 case DPLANE_OP_IPTABLE_DELETE:
ef524230
PG
1462 case DPLANE_OP_IPSET_ADD:
1463 case DPLANE_OP_IPSET_DELETE:
1464 case DPLANE_OP_IPSET_ENTRY_ADD:
1465 case DPLANE_OP_IPSET_ENTRY_DELETE:
5162e000
PG
1466 return FRR_NETLINK_ERROR;
1467
62b4b7e4
PG
1468 case DPLANE_OP_GRE_SET:
1469 return netlink_put_gre_set_msg(bth, ctx);
1470
9d59df63
MS
1471 case DPLANE_OP_INTF_ADDR_ADD:
1472 case DPLANE_OP_INTF_ADDR_DEL:
67e3369e
JU
1473 case DPLANE_OP_NONE:
1474 return FRR_NETLINK_ERROR;
1475 }
1476
1477 return FRR_NETLINK_ERROR;
1478}
1479
fef24b03
JU
1480void kernel_update_multi(struct dplane_ctx_q *ctx_list)
1481{
67e3369e
JU
1482 struct nl_batch batch;
1483 struct zebra_dplane_ctx *ctx;
1484 struct dplane_ctx_q handled_list;
1485 enum netlink_msg_status res;
1486
67e3369e 1487 TAILQ_INIT(&handled_list);
f6feb48b 1488 nl_batch_init(&batch, &handled_list);
67e3369e
JU
1489
1490 while (true) {
1491 ctx = dplane_ctx_dequeue(ctx_list);
1492 if (ctx == NULL)
1493 break;
1494
f6feb48b
JU
1495 if (batch.zns != NULL
1496 && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
1497 nl_batch_send(&batch);
67e3369e
JU
1498
1499 /*
f6feb48b
JU
1500 * Assume all messages will succeed and then mark only the ones
1501 * that failed.
67e3369e 1502 */
f6feb48b
JU
1503 dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
1504
1505 res = nl_put_msg(&batch, ctx);
1506
1507 dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
1508 if (res == FRR_NETLINK_ERROR)
67e3369e
JU
1509 dplane_ctx_set_status(ctx,
1510 ZEBRA_DPLANE_REQUEST_FAILURE);
1511
f6feb48b
JU
1512 if (batch.curlen > batch.limit)
1513 nl_batch_send(&batch);
67e3369e
JU
1514 }
1515
1516 nl_batch_send(&batch);
1517
1518 TAILQ_INIT(ctx_list);
1519 dplane_ctx_list_append(ctx_list, &handled_list);
fef24b03
JU
1520}
1521
d4000d7b
DS
1522struct nlsock *kernel_netlink_nlsock_lookup(int sock)
1523{
34869809 1524 struct nlsock lookup, *retval;
d4000d7b
DS
1525
1526 lookup.sock = sock;
1527
34869809
MS
1528 NLSOCK_LOCK();
1529 retval = hash_lookup(nlsock_hash, &lookup);
1530 NLSOCK_UNLOCK();
1531
1532 return retval;
1533}
1534
1535/* Insert nlsock entry into hash */
1536static void kernel_netlink_nlsock_insert(struct nlsock *nls)
1537{
1538 NLSOCK_LOCK();
1539 (void)hash_get(nlsock_hash, nls, hash_alloc_intern);
1540 NLSOCK_UNLOCK();
1541}
1542
1543/* Remove nlsock entry from hash */
1544static void kernel_netlink_nlsock_remove(struct nlsock *nls)
1545{
1546 NLSOCK_LOCK();
1547 (void)hash_release(nlsock_hash, nls);
1548 NLSOCK_UNLOCK();
d4000d7b
DS
1549}
1550
1551static uint32_t kernel_netlink_nlsock_key(const void *arg)
1552{
1553 const struct nlsock *nl = arg;
1554
1555 return nl->sock;
1556}
1557
1558static bool kernel_netlink_nlsock_hash_equal(const void *arg1, const void *arg2)
1559{
1560 const struct nlsock *nl1 = arg1;
1561 const struct nlsock *nl2 = arg2;
1562
1563 if (nl1->sock == nl2->sock)
1564 return true;
1565
1566 return false;
1567}
1568
1fdc9eae 1569/* Exported interface function. This function simply calls
1570 netlink_socket (). */
d62a17ae 1571void kernel_init(struct zebra_ns *zns)
1fdc9eae 1572{
ceab66b7 1573 uint32_t groups, dplane_groups;
5d307d5d
DS
1574#if defined SOL_NETLINK
1575 int one, ret;
1576#endif
d62a17ae 1577
026a316f
DS
1578 /*
1579 * Initialize netlink sockets
1580 *
1581 * If RTMGRP_XXX exists use that, but at some point
1582 * I think the kernel developers realized that
1583 * keeping track of all the different values would
1584 * lead to confusion, so we need to convert the
1585 * RTNLGRP_XXX to a bit position for ourself
1586 */
1587 groups = RTMGRP_LINK |
1588 RTMGRP_IPV4_ROUTE |
1589 RTMGRP_IPV4_IFADDR |
1590 RTMGRP_IPV6_ROUTE |
1591 RTMGRP_IPV6_IFADDR |
1592 RTMGRP_IPV4_MROUTE |
1593 RTMGRP_NEIGH |
67188ca2
QY
1594 ((uint32_t) 1 << (RTNLGRP_IPV4_RULE - 1)) |
1595 ((uint32_t) 1 << (RTNLGRP_IPV6_RULE - 1)) |
1596 ((uint32_t) 1 << (RTNLGRP_NEXTHOP - 1));
d62a17ae 1597
ceab66b7
MS
1598 dplane_groups = (RTMGRP_LINK |
1599 RTMGRP_IPV4_IFADDR |
1600 RTMGRP_IPV6_IFADDR);
1601
d62a17ae 1602 snprintf(zns->netlink.name, sizeof(zns->netlink.name),
1603 "netlink-listen (NS %u)", zns->ns_id);
1604 zns->netlink.sock = -1;
19d5a4fe
DS
1605 if (netlink_socket(&zns->netlink, groups, zns->ns_id) < 0) {
1606 zlog_err("Failure to create %s socket",
1607 zns->netlink.name);
1608 exit(-1);
1609 }
34869809
MS
1610
1611 kernel_netlink_nlsock_insert(&zns->netlink);
d62a17ae 1612
1613 snprintf(zns->netlink_cmd.name, sizeof(zns->netlink_cmd.name),
1614 "netlink-cmd (NS %u)", zns->ns_id);
1615 zns->netlink_cmd.sock = -1;
19d5a4fe
DS
1616 if (netlink_socket(&zns->netlink_cmd, 0, zns->ns_id) < 0) {
1617 zlog_err("Failure to create %s socket",
1618 zns->netlink_cmd.name);
1619 exit(-1);
1620 }
34869809
MS
1621
1622 kernel_netlink_nlsock_insert(&zns->netlink_cmd);
d62a17ae 1623
80dcc388
MS
1624 /* Outbound socket for dplane programming of the host OS. */
1625 snprintf(zns->netlink_dplane_out.name,
1626 sizeof(zns->netlink_dplane_out.name), "netlink-dp (NS %u)",
1627 zns->ns_id);
1628 zns->netlink_dplane_out.sock = -1;
1629 if (netlink_socket(&zns->netlink_dplane_out, 0, zns->ns_id) < 0) {
62b8bb7a 1630 zlog_err("Failure to create %s socket",
80dcc388
MS
1631 zns->netlink_dplane_out.name);
1632 exit(-1);
1633 }
34869809
MS
1634
1635 kernel_netlink_nlsock_insert(&zns->netlink_dplane_out);
80dcc388
MS
1636
1637 /* Inbound socket for OS events coming to the dplane. */
1638 snprintf(zns->netlink_dplane_in.name,
1639 sizeof(zns->netlink_dplane_in.name), "netlink-dp-in (NS %u)",
1640 zns->ns_id);
1641 zns->netlink_dplane_in.sock = -1;
ceab66b7
MS
1642 if (netlink_socket(&zns->netlink_dplane_in, dplane_groups,
1643 zns->ns_id) < 0) {
80dcc388
MS
1644 zlog_err("Failure to create %s socket",
1645 zns->netlink_dplane_in.name);
62b8bb7a
MS
1646 exit(-1);
1647 }
34869809
MS
1648
1649 kernel_netlink_nlsock_insert(&zns->netlink_dplane_in);
62b8bb7a 1650
5d307d5d
DS
1651 /*
1652 * SOL_NETLINK is not available on all platforms yet
1653 * apparently. It's in bits/socket.h which I am not
1654 * sure that we want to pull into our build system.
1655 */
1656#if defined SOL_NETLINK
1657 /*
1658 * Let's tell the kernel that we want to receive extended
62b8bb7a 1659 * ACKS over our command socket(s)
5d307d5d
DS
1660 */
1661 one = 1;
1662 ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1663 &one, sizeof(one));
1664
1665 if (ret < 0)
62b8bb7a
MS
1666 zlog_notice("Registration for extended cmd ACK failed : %d %s",
1667 errno, safe_strerror(errno));
1668
1669 one = 1;
80dcc388
MS
1670 ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1671 NETLINK_EXT_ACK, &one, sizeof(one));
62b8bb7a
MS
1672
1673 if (ret < 0)
1674 zlog_notice("Registration for extended dp ACK failed : %d %s",
5d307d5d 1675 errno, safe_strerror(errno));
97f85144
JU
1676
1677 /*
1678 * Trim off the payload of the original netlink message in the
1679 * acknowledgment. This option is available since Linux 4.2, so if
1680 * setsockopt fails, ignore the error.
1681 */
1682 one = 1;
80dcc388
MS
1683 ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1684 NETLINK_CAP_ACK, &one, sizeof(one));
9781e6a0
DS
1685 if (ret < 0)
1686 zlog_notice(
1687 "Registration for reduced ACK packet size failed, probably running an early kernel");
5d307d5d
DS
1688#endif
1689
d62a17ae 1690 /* Register kernel socket. */
19d5a4fe 1691 if (fcntl(zns->netlink.sock, F_SETFL, O_NONBLOCK) < 0)
450971aa 1692 flog_err_sys(EC_LIB_SOCKET, "Can't set %s socket flags: %s",
09c866e3 1693 zns->netlink.name, safe_strerror(errno));
8c85e8ea
DS
1694
1695 if (fcntl(zns->netlink_cmd.sock, F_SETFL, O_NONBLOCK) < 0)
1696 zlog_err("Can't set %s socket error: %s(%d)",
1697 zns->netlink_cmd.name, safe_strerror(errno), errno);
19d5a4fe 1698
80dcc388 1699 if (fcntl(zns->netlink_dplane_out.sock, F_SETFL, O_NONBLOCK) < 0)
62b8bb7a 1700 zlog_err("Can't set %s socket error: %s(%d)",
80dcc388
MS
1701 zns->netlink_dplane_out.name, safe_strerror(errno),
1702 errno);
1703
1704 if (fcntl(zns->netlink_dplane_in.sock, F_SETFL, O_NONBLOCK) < 0)
1705 zlog_err("Can't set %s socket error: %s(%d)",
1706 zns->netlink_dplane_in.name, safe_strerror(errno),
1707 errno);
62b8bb7a 1708
19d5a4fe 1709 /* Set receive buffer size if it's set from command line */
97f85144 1710 if (nl_rcvbufsize) {
19d5a4fe 1711 netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
97f85144 1712 netlink_recvbuf(&zns->netlink_cmd, nl_rcvbufsize);
80dcc388
MS
1713 netlink_recvbuf(&zns->netlink_dplane_out, nl_rcvbufsize);
1714 netlink_recvbuf(&zns->netlink_dplane_in, nl_rcvbufsize);
97f85144 1715 }
19d5a4fe 1716
80dcc388
MS
1717 /* Set filter for inbound sockets, to exclude events we've generated
1718 * ourselves.
1719 */
1720 netlink_install_filter(zns->netlink.sock, zns->netlink_cmd.snl.nl_pid,
1721 zns->netlink_dplane_out.snl.nl_pid);
1722
1723 netlink_install_filter(zns->netlink_dplane_in.sock,
62b8bb7a 1724 zns->netlink_cmd.snl.nl_pid,
80dcc388 1725 zns->netlink_dplane_out.snl.nl_pid);
62b8bb7a 1726
19d5a4fe
DS
1727 zns->t_netlink = NULL;
1728
3801e764 1729 thread_add_read(zrouter.master, kernel_read, zns,
19d5a4fe 1730 zns->netlink.sock, &zns->t_netlink);
d62a17ae 1731
1732 rt_netlink_init();
1fdc9eae 1733}
1734
34869809
MS
1735/* Helper to clean up an nlsock */
1736static void kernel_nlsock_fini(struct nlsock *nls)
1737{
1738 if (nls && nls->sock >= 0) {
1739 kernel_netlink_nlsock_remove(nls);
1740 close(nls->sock);
1741 nls->sock = -1;
1742 XFREE(MTYPE_NL_BUF, nls->buf);
1743 nls->buflen = 0;
1744 }
1745}
1746
62b8bb7a 1747void kernel_terminate(struct zebra_ns *zns, bool complete)
1fdc9eae 1748{
50478845 1749 thread_cancel(&zns->t_netlink);
d62a17ae 1750
34869809 1751 kernel_nlsock_fini(&zns->netlink);
d62a17ae 1752
34869809 1753 kernel_nlsock_fini(&zns->netlink_cmd);
ddfeb486 1754
34869809 1755 kernel_nlsock_fini(&zns->netlink_dplane_in);
80dcc388 1756
62b8bb7a
MS
1757 /* During zebra shutdown, we need to leave the dataplane socket
1758 * around until all work is done.
1759 */
34869809
MS
1760 if (complete)
1761 kernel_nlsock_fini(&zns->netlink_dplane_out);
1762}
d4000d7b 1763
34869809
MS
1764/*
1765 * Global init for platform-/OS-specific things
1766 */
1767void kernel_router_init(void)
1768{
1769 /* Init nlsock hash and lock */
1770 pthread_mutex_init(&nlsock_mutex, NULL);
1771 nlsock_hash = hash_create_size(8, kernel_netlink_nlsock_key,
1772 kernel_netlink_nlsock_hash_equal,
1773 "Netlink Socket Hash");
1774}
1775
1776/*
1777 * Global deinit for platform-/OS-specific things
1778 */
1779void kernel_router_terminate(void)
1780{
1781 pthread_mutex_destroy(&nlsock_mutex);
1782
1783 hash_free(nlsock_hash);
1784 nlsock_hash = NULL;
62b8bb7a 1785}
34869809 1786
ddfeb486 1787#endif /* HAVE_NETLINK */