zebra/kernel_netlink.c

   1 /* Kernel communication using netlink interface.
   2  * Copyright (C) 1999 Kunihiro Ishiguro
   3  *
   4  * This file is part of GNU Zebra.
   5  *
   6  * GNU Zebra is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License as published by the
   8  * Free Software Foundation; either version 2, or (at your option) any
   9  * later version.
  10  *
  11  * GNU Zebra is distributed in the hope that it will be useful, but
  12  * WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along
  17  * with this program; see the file COPYING; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include <zebra.h>
  22
  23 #ifdef HAVE_NETLINK
  24
  25 #include "linklist.h"
  26 #include "if.h"
  27 #include "log.h"
  28 #include "prefix.h"
  29 #include "connected.h"
  30 #include "table.h"
  31 #include "memory.h"
  32 #include "rib.h"
  33 #include "thread.h"
  34 #include "privs.h"
  35 #include "nexthop.h"
  36 #include "vrf.h"
  37 #include "mpls.h"
  38 #include "lib_errors.h"
  39 #include "hash.h"
  40
  41 //#include "zebra/zserv.h"
  42 #include "zebra/zebra_router.h"
  43 #include "zebra/zebra_ns.h"
  44 #include "zebra/zebra_vrf.h"
  45 #include "zebra/rt.h"
  46 #include "zebra/debug.h"
  47 #include "zebra/kernel_netlink.h"
  48 #include "zebra/rt_netlink.h"
  49 #include "zebra/if_netlink.h"
  50 #include "zebra/rule_netlink.h"
  51 #include "zebra/zebra_errors.h"
  52
  53 #ifndef SO_RCVBUFFORCE
  54 #define SO_RCVBUFFORCE  (33)
  55 #endif
  56
  57 /* Hack for GNU libc version 2. */
  58 #ifndef MSG_TRUNC
  59 #define MSG_TRUNC      0x20
  60 #endif /* MSG_TRUNC */
  61
  62 #ifndef NLMSG_TAIL
  63 #define NLMSG_TAIL(nmsg)                                                       \
  64         ((struct rtattr *)(((uint8_t *)(nmsg))                                 \
  65                            + NLMSG_ALIGN((nmsg)->nlmsg_len)))
  66 #endif
  67
  68 #ifndef RTA_TAIL
  69 #define RTA_TAIL(rta)                                                          \
  70         ((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
  71 #endif
  72
  73 #ifndef RTNL_FAMILY_IP6MR
  74 #define RTNL_FAMILY_IP6MR 129
  75 #endif
  76
  77 #ifndef RTPROT_MROUTED
  78 #define RTPROT_MROUTED 17
  79 #endif
  80
  81 #define NL_DEFAULT_BATCH_BUFSIZE (16 * NL_PKT_BUF_SIZE)
  82
  83 /*
  84  * We limit the batch's size to a number smaller than the length of the
  85  * underlying buffer since the last message that wouldn't fit the batch would go
  86  * over the upper boundary and then it would have to be encoded again into a new
  87  * buffer. If the difference between the limit and the length of the buffer is
  88  * big enough (bigger than the biggest Netlink message) then this situation
  89  * won't occur.
  90  */
  91 #define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
  92
  93 static const struct message nlmsg_str[] = {{RTM_NEWROUTE, "RTM_NEWROUTE"},
  94                                            {RTM_DELROUTE, "RTM_DELROUTE"},
  95                                            {RTM_GETROUTE, "RTM_GETROUTE"},
  96                                            {RTM_NEWLINK, "RTM_NEWLINK"},
  97                                            {RTM_DELLINK, "RTM_DELLINK"},
  98                                            {RTM_GETLINK, "RTM_GETLINK"},
  99                                            {RTM_NEWADDR, "RTM_NEWADDR"},
 100                                            {RTM_DELADDR, "RTM_DELADDR"},
 101                                            {RTM_GETADDR, "RTM_GETADDR"},
 102                                            {RTM_NEWNEIGH, "RTM_NEWNEIGH"},
 103                                            {RTM_DELNEIGH, "RTM_DELNEIGH"},
 104                                            {RTM_GETNEIGH, "RTM_GETNEIGH"},
 105                                            {RTM_NEWRULE, "RTM_NEWRULE"},
 106                                            {RTM_DELRULE, "RTM_DELRULE"},
 107                                            {RTM_GETRULE, "RTM_GETRULE"},
 108                                            {RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP"},
 109                                            {RTM_DELNEXTHOP, "RTM_DELNEXTHOP"},
 110                                            {RTM_GETNEXTHOP, "RTM_GETNEXTHOP"},
 111                                            {0}};
 112
 113 static const struct message rtproto_str[] = {
 114         {RTPROT_REDIRECT, "redirect"},
 115         {RTPROT_KERNEL, "kernel"},
 116         {RTPROT_BOOT, "boot"},
 117         {RTPROT_STATIC, "static"},
 118         {RTPROT_GATED, "GateD"},
 119         {RTPROT_RA, "router advertisement"},
 120         {RTPROT_MRT, "MRT"},
 121         {RTPROT_ZEBRA, "Zebra"},
 122 #ifdef RTPROT_BIRD
 123         {RTPROT_BIRD, "BIRD"},
 124 #endif /* RTPROT_BIRD */
 125         {RTPROT_MROUTED, "mroute"},
 126         {RTPROT_BGP, "BGP"},
 127         {RTPROT_OSPF, "OSPF"},
 128         {RTPROT_ISIS, "IS-IS"},
 129         {RTPROT_RIP, "RIP"},
 130         {RTPROT_RIPNG, "RIPNG"},
 131         {RTPROT_ZSTATIC, "static"},
 132         {0}};
 133
 134 static const struct message family_str[] = {{AF_INET, "ipv4"},
 135                                             {AF_INET6, "ipv6"},
 136                                             {AF_BRIDGE, "bridge"},
 137                                             {RTNL_FAMILY_IPMR, "ipv4MR"},
 138                                             {RTNL_FAMILY_IP6MR, "ipv6MR"},
 139                                             {0}};
 140
 141 static const struct message rttype_str[] = {{RTN_UNSPEC, "none"},
 142                                             {RTN_UNICAST, "unicast"},
 143                                             {RTN_LOCAL, "local"},
 144                                             {RTN_BROADCAST, "broadcast"},
 145                                             {RTN_ANYCAST, "anycast"},
 146                                             {RTN_MULTICAST, "multicast"},
 147                                             {RTN_BLACKHOLE, "blackhole"},
 148                                             {RTN_UNREACHABLE, "unreachable"},
 149                                             {RTN_PROHIBIT, "prohibited"},
 150                                             {RTN_THROW, "throw"},
 151                                             {RTN_NAT, "nat"},
 152                                             {RTN_XRESOLVE, "resolver"},
 153                                             {0}};
 154
 155 extern struct thread_master *master;
 156 extern uint32_t nl_rcvbufsize;
 157
 158 extern struct zebra_privs_t zserv_privs;
 159
 160 DEFINE_MTYPE_STATIC(ZEBRA, NL_BUF, "Zebra Netlink buffers");
 161
 162 struct hash *nlsock_hash;
 163 size_t nl_batch_tx_bufsize;
 164 char *nl_batch_tx_buf;
 165
 166 _Atomic uint32_t nl_batch_bufsize = NL_DEFAULT_BATCH_BUFSIZE;
 167 _Atomic uint32_t nl_batch_send_threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
 168
 169 struct nl_batch {
 170         void *buf;
 171         size_t bufsiz;
 172         size_t limit;
 173
 174         void *buf_head;
 175         size_t curlen;
 176         size_t msgcnt;
 177
 178         const struct zebra_dplane_info *zns;
 179
 180         struct dplane_ctx_q ctx_list;
 181
 182         /*
 183          * Pointer to the queue of completed contexts outbound back
 184          * towards the dataplane module.
 185          */
 186         struct dplane_ctx_q *ctx_out_q;
 187 };
 188
 189 int netlink_config_write_helper(struct vty *vty)
 190 {
 191         uint32_t size =
 192                 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
 193         uint32_t threshold = atomic_load_explicit(&nl_batch_send_threshold,
 194                                                   memory_order_relaxed);
 195
 196         if (size != NL_DEFAULT_BATCH_BUFSIZE
 197             || threshold != NL_DEFAULT_BATCH_SEND_THRESHOLD)
 198                 vty_out(vty, "zebra kernel netlink batch-tx-buf %u %u\n", size,
 199                         threshold);
 200
 201         return 0;
 202 }
 203
 204 void netlink_set_batch_buffer_size(uint32_t size, uint32_t threshold, bool set)
 205 {
 206         if (!set) {
 207                 size = NL_DEFAULT_BATCH_BUFSIZE;
 208                 threshold = NL_DEFAULT_BATCH_SEND_THRESHOLD;
 209         }
 210
 211         atomic_store_explicit(&nl_batch_bufsize, size, memory_order_relaxed);
 212         atomic_store_explicit(&nl_batch_send_threshold, threshold,
 213                               memory_order_relaxed);
 214 }
 215
 216 int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
 217 {
 218         /*
 219          * This is an error condition that must be handled during
 220          * development.
 221          *
 222          * The netlink_talk_filter function is used for communication
 223          * down the netlink_cmd pipe and we are expecting
 224          * an ack being received.  So if we get here
 225          * then we did not receive the ack and instead
 226          * received some other message in an unexpected
 227          * way.
 228          */
 229         zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u", __func__,
 230                    h->nlmsg_type, nl_msg_type_to_str(h->nlmsg_type), ns_id);
 231         return 0;
 232 }
 233
 234 static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
 235 {
 236         uint32_t oldsize;
 237         socklen_t newlen = sizeof(newsize);
 238         socklen_t oldlen = sizeof(oldsize);
 239         int ret;
 240
 241         ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &oldsize, &oldlen);
 242         if (ret < 0) {
 243                 flog_err_sys(EC_LIB_SOCKET,
 244                              "Can't get %s receive buffer size: %s", nl->name,
 245                              safe_strerror(errno));
 246                 return -1;
 247         }
 248
 249         /* Try force option (linux >= 2.6.14) and fall back to normal set */
 250         frr_with_privs(&zserv_privs) {
 251                 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUFFORCE,
 252                                  &nl_rcvbufsize,
 253                                  sizeof(nl_rcvbufsize));
 254         }
 255         if (ret < 0)
 256                 ret = setsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF,
 257                                  &nl_rcvbufsize, sizeof(nl_rcvbufsize));
 258         if (ret < 0) {
 259                 flog_err_sys(EC_LIB_SOCKET,
 260                              "Can't set %s receive buffer size: %s", nl->name,
 261                              safe_strerror(errno));
 262                 return -1;
 263         }
 264
 265         ret = getsockopt(nl->sock, SOL_SOCKET, SO_RCVBUF, &newsize, &newlen);
 266         if (ret < 0) {
 267                 flog_err_sys(EC_LIB_SOCKET,
 268                              "Can't get %s receive buffer size: %s", nl->name,
 269                              safe_strerror(errno));
 270                 return -1;
 271         }
 272         return 0;
 273 }
 274
 275 /* Make socket for Linux netlink interface. */
 276 static int netlink_socket(struct nlsock *nl, unsigned long groups,
 277                           ns_id_t ns_id)
 278 {
 279         int ret;
 280         struct sockaddr_nl snl;
 281         int sock;
 282         int namelen;
 283
 284         frr_with_privs(&zserv_privs) {
 285                 sock = ns_socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, ns_id);
 286                 if (sock < 0) {
 287                         zlog_err("Can't open %s socket: %s", nl->name,
 288                                  safe_strerror(errno));
 289                         return -1;
 290                 }
 291
 292                 memset(&snl, 0, sizeof(snl));
 293                 snl.nl_family = AF_NETLINK;
 294                 snl.nl_groups = groups;
 295
 296                 /* Bind the socket to the netlink structure for anything. */
 297                 ret = bind(sock, (struct sockaddr *)&snl, sizeof(snl));
 298         }
 299
 300         if (ret < 0) {
 301                 zlog_err("Can't bind %s socket to group 0x%x: %s", nl->name,
 302                          snl.nl_groups, safe_strerror(errno));
 303                 close(sock);
 304                 return -1;
 305         }
 306
 307         /* multiple netlink sockets will have different nl_pid */
 308         namelen = sizeof(snl);
 309         ret = getsockname(sock, (struct sockaddr *)&snl, (socklen_t *)&namelen);
 310         if (ret < 0 || namelen != sizeof(snl)) {
 311                 flog_err_sys(EC_LIB_SOCKET, "Can't get %s socket name: %s",
 312                              nl->name, safe_strerror(errno));
 313                 close(sock);
 314                 return -1;
 315         }
 316
 317         nl->snl = snl;
 318         nl->sock = sock;
 319         nl->buflen = NL_RCV_PKT_BUF_SIZE;
 320         nl->buf = XMALLOC(MTYPE_NL_BUF, nl->buflen);
 321
 322         return ret;
 323 }
 324
 325 /*
 326  * Dispatch an incoming netlink message; used by the zebra main pthread's
 327  * netlink event reader.
 328  */
 329 static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
 330                                      int startup)
 331 {
 332         /*
 333          * When we handle new message types here
 334          * because we are starting to install them
 335          * then lets check the netlink_install_filter
 336          * and see if we should add the corresponding
 337          * allow through entry there.
 338          * Probably not needed to do but please
 339          * think about it.
 340          */
 341         switch (h->nlmsg_type) {
 342         case RTM_NEWROUTE:
 343                 return netlink_route_change(h, ns_id, startup);
 344         case RTM_DELROUTE:
 345                 return netlink_route_change(h, ns_id, startup);
 346         case RTM_NEWLINK:
 347                 return netlink_link_change(h, ns_id, startup);
 348         case RTM_DELLINK:
 349                 return netlink_link_change(h, ns_id, startup);
 350         case RTM_NEWNEIGH:
 351         case RTM_DELNEIGH:
 352         case RTM_GETNEIGH:
 353                 return netlink_neigh_change(h, ns_id);
 354         case RTM_NEWRULE:
 355                 return netlink_rule_change(h, ns_id, startup);
 356         case RTM_DELRULE:
 357                 return netlink_rule_change(h, ns_id, startup);
 358         case RTM_NEWNEXTHOP:
 359                 return netlink_nexthop_change(h, ns_id, startup);
 360         case RTM_DELNEXTHOP:
 361                 return netlink_nexthop_change(h, ns_id, startup);
 362
 363         /* Messages handled in the dplane thread */
 364         case RTM_NEWADDR:
 365         case RTM_DELADDR:
 366                 return 0;
 367
 368         default:
 369                 /*
 370                  * If we have received this message then
 371                  * we have made a mistake during development
 372                  * and we need to write some code to handle
 373                  * this message type or not ask for
 374                  * it to be sent up to us
 375                  */
 376                 flog_err(EC_ZEBRA_UNKNOWN_NLMSG,
 377                          "Unknown netlink nlmsg_type %s(%d) vrf %u",
 378                          nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
 379                          ns_id);
 380                 break;
 381         }
 382         return 0;
 383 }
 384
 385 /*
 386  * Dispatch an incoming netlink message; used by the dataplane pthread's
 387  * netlink event reader code.
 388  */
 389 static int dplane_netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
 390                                             int startup)
 391 {
 392         /*
 393          * Dispatch the incoming messages that the dplane pthread handles
 394          */
 395         switch (h->nlmsg_type) {
 396         case RTM_NEWADDR:
 397         case RTM_DELADDR:
 398                 return netlink_interface_addr_dplane(h, ns_id, startup);
 399
 400         /* TODO */
 401         case RTM_NEWLINK:
 402         case RTM_DELLINK:
 403
 404         default:
 405                 break;
 406         }
 407
 408         return 0;
 409 }
 410
 411 static int kernel_read(struct thread *thread)
 412 {
 413         struct zebra_ns *zns = (struct zebra_ns *)THREAD_ARG(thread);
 414         struct zebra_dplane_info dp_info;
 415
 416         /* Capture key info from ns struct */
 417         zebra_dplane_info_from_zns(&dp_info, zns, false);
 418
 419         netlink_parse_info(netlink_information_fetch, &zns->netlink, &dp_info,
 420                            5, false);
 421
 422         thread_add_read(zrouter.master, kernel_read, zns, zns->netlink.sock,
 423                         &zns->t_netlink);
 424
 425         return 0;
 426 }
 427
 428 /*
 429  * Called by the dplane pthread to read incoming OS messages and dispatch them.
 430  */
 431 int kernel_dplane_read(struct zebra_dplane_info *info)
 432 {
 433         struct nlsock *nl = kernel_netlink_nlsock_lookup(info->sock);
 434
 435         netlink_parse_info(dplane_netlink_information_fetch, nl, info, 5,
 436                            false);
 437
 438         return 0;
 439 }
 440
 441 /*
 442  * Filter out messages from self that occur on listener socket,
 443  * caused by our actions on the command socket(s)
 444  *
 445  * When we add new Netlink message types we probably
 446  * do not need to add them here as that we are filtering
 447  * on the routes we actually care to receive( which is rarer
 448  * then the normal course of operations).  We are intentionally
 449  * allowing some messages from ourselves through
 450  * ( I'm looking at you Interface based netlink messages )
 451  * so that we only had to write one way to handle incoming
 452  * address add/delete changes.
 453  */
 454 static void netlink_install_filter(int sock, uint32_t pid, uint32_t dplane_pid)
 455 {
 456         /*
 457          * BPF_JUMP instructions and where you jump to are based upon
 458          * 0 as being the next statement.  So count from 0.  Writing
 459          * this down because every time I look at this I have to
 460          * re-remember it.
 461          */
 462         struct sock_filter filter[] = {
 463                 /*
 464                  * Logic:
 465                  *   if (nlmsg_pid == pid ||
 466                  *       nlmsg_pid == dplane_pid) {
 467                  *       if (the incoming nlmsg_type ==
 468                  *           RTM_NEWADDR | RTM_DELADDR)
 469                  *           keep this message
 470                  *       else
 471                  *           skip this message
 472                  *   } else
 473                  *       keep this netlink message
 474                  */
 475                 /*
 476                  * 0: Load the nlmsg_pid into the BPF register
 477                  */
 478                 BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
 479                          offsetof(struct nlmsghdr, nlmsg_pid)),
 480                 /*
 481                  * 1: Compare to pid
 482                  */
 483                 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 1, 0),
 484                 /*
 485                  * 2: Compare to dplane pid
 486                  */
 487                 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(dplane_pid), 0, 4),
 488                 /*
 489                  * 3: Load the nlmsg_type into BPF register
 490                  */
 491                 BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
 492                          offsetof(struct nlmsghdr, nlmsg_type)),
 493                 /*
 494                  * 4: Compare to RTM_NEWADDR
 495                  */
 496                 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
 497                 /*
 498                  * 5: Compare to RTM_DELADDR
 499                  */
 500                 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
 501                 /*
 502                  * 6: This is the end state of we want to skip the
 503                  *    message
 504                  */
 505                 BPF_STMT(BPF_RET | BPF_K, 0),
 506                 /* 7: This is the end state of we want to keep
 507                  *     the message
 508                  */
 509                 BPF_STMT(BPF_RET | BPF_K, 0xffff),
 510         };
 511
 512         struct sock_fprog prog = {
 513                 .len = array_size(filter), .filter = filter,
 514         };
 515
 516         if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))
 517             < 0)
 518                 flog_err_sys(EC_LIB_SOCKET, "Can't install socket filter: %s",
 519                              safe_strerror(errno));
 520 }
 521
 522 void netlink_parse_rtattr_flags(struct rtattr **tb, int max, struct rtattr *rta,
 523                                 int len, unsigned short flags)
 524 {
 525         unsigned short type;
 526
 527         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
 528         while (RTA_OK(rta, len)) {
 529                 type = rta->rta_type & ~flags;
 530                 if ((type <= max) && (!tb[type]))
 531                         tb[type] = rta;
 532                 rta = RTA_NEXT(rta, len);
 533         }
 534 }
 535
 536 void netlink_parse_rtattr(struct rtattr **tb, int max, struct rtattr *rta,
 537                           int len)
 538 {
 539         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
 540         while (RTA_OK(rta, len)) {
 541                 if (rta->rta_type <= max)
 542                         tb[rta->rta_type] = rta;
 543                 rta = RTA_NEXT(rta, len);
 544         }
 545 }
 546
 547 /**
 548  * netlink_parse_rtattr_nested() - Parses a nested route attribute
 549  * @tb:         Pointer to array for storing rtattr in.
 550  * @max:        Max number to store.
 551  * @rta:        Pointer to rtattr to look for nested items in.
 552  */
 553 void netlink_parse_rtattr_nested(struct rtattr **tb, int max,
 554                                  struct rtattr *rta)
 555 {
 556         netlink_parse_rtattr(tb, max, RTA_DATA(rta), RTA_PAYLOAD(rta));
 557 }
 558
 559 bool nl_attr_put(struct nlmsghdr *n, unsigned int maxlen, int type,
 560                  const void *data, unsigned int alen)
 561 {
 562         int len;
 563         struct rtattr *rta;
 564
 565         len = RTA_LENGTH(alen);
 566
 567         if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen)
 568                 return false;
 569
 570         rta = (struct rtattr *)(((char *)n) + NLMSG_ALIGN(n->nlmsg_len));
 571         rta->rta_type = type;
 572         rta->rta_len = len;
 573
 574         if (data)
 575                 memcpy(RTA_DATA(rta), data, alen);
 576         else
 577                 assert(alen == 0);
 578
 579         n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
 580
 581         return true;
 582 }
 583
 584 bool nl_attr_put8(struct nlmsghdr *n, unsigned int maxlen, int type,
 585                   uint8_t data)
 586 {
 587         return nl_attr_put(n, maxlen, type, &data, sizeof(uint8_t));
 588 }
 589
 590 bool nl_attr_put16(struct nlmsghdr *n, unsigned int maxlen, int type,
 591                    uint16_t data)
 592 {
 593         return nl_attr_put(n, maxlen, type, &data, sizeof(uint16_t));
 594 }
 595
 596 bool nl_attr_put32(struct nlmsghdr *n, unsigned int maxlen, int type,
 597                    uint32_t data)
 598 {
 599         return nl_attr_put(n, maxlen, type, &data, sizeof(uint32_t));
 600 }
 601
 602 struct rtattr *nl_attr_nest(struct nlmsghdr *n, unsigned int maxlen, int type)
 603 {
 604         struct rtattr *nest = NLMSG_TAIL(n);
 605
 606         if (!nl_attr_put(n, maxlen, type, NULL, 0))
 607                 return NULL;
 608
 609         nest->rta_type |= NLA_F_NESTED;
 610         return nest;
 611 }
 612
 613 int nl_attr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
 614 {
 615         nest->rta_len = (uint8_t *)NLMSG_TAIL(n) - (uint8_t *)nest;
 616         return n->nlmsg_len;
 617 }
 618
 619 struct rtnexthop *nl_attr_rtnh(struct nlmsghdr *n, unsigned int maxlen)
 620 {
 621         struct rtnexthop *rtnh = (struct rtnexthop *)NLMSG_TAIL(n);
 622
 623         if (NLMSG_ALIGN(n->nlmsg_len) + RTNH_ALIGN(sizeof(struct rtnexthop))
 624             > maxlen)
 625                 return NULL;
 626
 627         memset(rtnh, 0, sizeof(struct rtnexthop));
 628         n->nlmsg_len =
 629                 NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(sizeof(struct rtnexthop));
 630
 631         return rtnh;
 632 }
 633
 634 void nl_attr_rtnh_end(struct nlmsghdr *n, struct rtnexthop *rtnh)
 635 {
 636         rtnh->rtnh_len = (uint8_t *)NLMSG_TAIL(n) - (uint8_t *)rtnh;
 637 }
 638
 639 const char *nl_msg_type_to_str(uint16_t msg_type)
 640 {
 641         return lookup_msg(nlmsg_str, msg_type, "");
 642 }
 643
 644 const char *nl_rtproto_to_str(uint8_t rtproto)
 645 {
 646         return lookup_msg(rtproto_str, rtproto, "");
 647 }
 648
 649 const char *nl_family_to_str(uint8_t family)
 650 {
 651         return lookup_msg(family_str, family, "");
 652 }
 653
 654 const char *nl_rttype_to_str(uint8_t rttype)
 655 {
 656         return lookup_msg(rttype_str, rttype, "");
 657 }
 658
 659 #define NLA_OK(nla, len)                                                       \
 660         ((len) >= (int)sizeof(struct nlattr)                                   \
 661          && (nla)->nla_len >= sizeof(struct nlattr)                            \
 662          && (nla)->nla_len <= (len))
 663 #define NLA_NEXT(nla, attrlen)                                                 \
 664         ((attrlen) -= NLA_ALIGN((nla)->nla_len),                               \
 665          (struct nlattr *)(((char *)(nla)) + NLA_ALIGN((nla)->nla_len)))
 666 #define NLA_LENGTH(len) (NLA_ALIGN(sizeof(struct nlattr)) + (len))
 667 #define NLA_DATA(nla) ((struct nlattr *)(((char *)(nla)) + NLA_LENGTH(0)))
 668
 669 #define ERR_NLA(err, inner_len)                                                \
 670         ((struct nlattr *)(((char *)(err))                                     \
 671                            + NLMSG_ALIGN(sizeof(struct nlmsgerr))              \
 672                            + NLMSG_ALIGN((inner_len))))
 673
 674 static void netlink_parse_nlattr(struct nlattr **tb, int max,
 675                                  struct nlattr *nla, int len)
 676 {
 677         while (NLA_OK(nla, len)) {
 678                 if (nla->nla_type <= max)
 679                         tb[nla->nla_type] = nla;
 680                 nla = NLA_NEXT(nla, len);
 681         }
 682 }
 683
 684 static void netlink_parse_extended_ack(struct nlmsghdr *h)
 685 {
 686         struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {};
 687         const struct nlmsgerr *err = (const struct nlmsgerr *)NLMSG_DATA(h);
 688         const struct nlmsghdr *err_nlh = NULL;
 689         /* Length not including nlmsghdr */
 690         uint32_t len = 0;
 691         /* Inner error netlink message length */
 692         uint32_t inner_len = 0;
 693         const char *msg = NULL;
 694         uint32_t off = 0;
 695
 696         if (!(h->nlmsg_flags & NLM_F_CAPPED))
 697                 inner_len = (uint32_t)NLMSG_PAYLOAD(&err->msg, 0);
 698
 699         len = (uint32_t)(NLMSG_PAYLOAD(h, sizeof(struct nlmsgerr)) - inner_len);
 700
 701         netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX, ERR_NLA(err, inner_len),
 702                              len);
 703
 704         if (tb[NLMSGERR_ATTR_MSG])
 705                 msg = (const char *)NLA_DATA(tb[NLMSGERR_ATTR_MSG]);
 706
 707         if (tb[NLMSGERR_ATTR_OFFS]) {
 708                 off = *(uint32_t *)NLA_DATA(tb[NLMSGERR_ATTR_OFFS]);
 709
 710                 if (off > h->nlmsg_len) {
 711                         zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS");
 712                 } else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
 713                         /*
 714                          * Header of failed message
 715                          * we are not doing anything currently with it
 716                          * but noticing it for later.
 717                          */
 718                         err_nlh = &err->msg;
 719                         zlog_debug("%s: Received %s extended Ack", __func__,
 720                                    nl_msg_type_to_str(err_nlh->nlmsg_type));
 721                 }
 722         }
 723
 724         if (msg && *msg != '\0') {
 725                 bool is_err = !!err->error;
 726
 727                 if (is_err)
 728                         zlog_err("Extended Error: %s", msg);
 729                 else
 730                         flog_warn(EC_ZEBRA_NETLINK_EXTENDED_WARNING,
 731                                   "Extended Warning: %s", msg);
 732         }
 733 }
 734
 735 /*
 736  * netlink_send_msg - send a netlink message of a certain size.
 737  *
 738  * Returns -1 on error. Otherwise, it returns the number of bytes sent.
 739  */
 740 static ssize_t netlink_send_msg(const struct nlsock *nl, void *buf,
 741                                 size_t buflen)
 742 {
 743         struct sockaddr_nl snl = {};
 744         struct iovec iov = {};
 745         struct msghdr msg = {};
 746         ssize_t status;
 747         int save_errno = 0;
 748
 749         iov.iov_base = buf;
 750         iov.iov_len = buflen;
 751         msg.msg_name = &snl;
 752         msg.msg_namelen = sizeof(snl);
 753         msg.msg_iov = &iov;
 754         msg.msg_iovlen = 1;
 755
 756         snl.nl_family = AF_NETLINK;
 757
 758         /* Send message to netlink interface. */
 759         frr_with_privs(&zserv_privs) {
 760                 status = sendmsg(nl->sock, &msg, 0);
 761                 save_errno = errno;
 762         }
 763
 764         if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_SEND) {
 765                 zlog_debug("%s: >> netlink message dump [sent]", __func__);
 766 #ifdef NETLINK_DEBUG
 767                 nl_dump(buf, buflen);
 768 #else
 769                 zlog_hexdump(buf, buflen);
 770 #endif /* NETLINK_DEBUG */
 771         }
 772
 773         if (status == -1) {
 774                 flog_err_sys(EC_LIB_SOCKET, "%s error: %s", __func__,
 775                              safe_strerror(save_errno));
 776                 return -1;
 777         }
 778
 779         return status;
 780 }
 781
 782 /*
 783  * netlink_recv_msg - receive a netlink message.
 784  *
 785  * Returns -1 on error, 0 if read would block or the number of bytes received.
 786  */
 787 static int netlink_recv_msg(struct nlsock *nl, struct msghdr *msg)
 788 {
 789         struct iovec iov;
 790         int status;
 791
 792         iov.iov_base = nl->buf;
 793         iov.iov_len = nl->buflen;
 794         msg->msg_iov = &iov;
 795         msg->msg_iovlen = 1;
 796
 797         do {
 798                 int bytes;
 799
 800                 bytes = recv(nl->sock, NULL, 0, MSG_PEEK | MSG_TRUNC);
 801
 802                 if (bytes >= 0 && (size_t)bytes > nl->buflen) {
 803                         nl->buf = XREALLOC(MTYPE_NL_BUF, nl->buf, bytes);
 804                         nl->buflen = bytes;
 805                         iov.iov_base = nl->buf;
 806                         iov.iov_len = nl->buflen;
 807                 }
 808
 809                 status = recvmsg(nl->sock, msg, 0);
 810         } while (status == -1 && errno == EINTR);
 811
 812         if (status == -1) {
 813                 if (errno == EWOULDBLOCK || errno == EAGAIN)
 814                         return 0;
 815                 flog_err(EC_ZEBRA_RECVMSG_OVERRUN, "%s recvmsg overrun: %s",
 816                          nl->name, safe_strerror(errno));
 817                 /*
 818                  * In this case we are screwed. There is no good way to recover
 819                  * zebra at this point.
 820                  */
 821                 exit(-1);
 822         }
 823
 824         if (status == 0) {
 825                 flog_err_sys(EC_LIB_SOCKET, "%s EOF", nl->name);
 826                 return -1;
 827         }
 828
 829         if (msg->msg_namelen != sizeof(struct sockaddr_nl)) {
 830                 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
 831                          "%s sender address length error: length %d", nl->name,
 832                          msg->msg_namelen);
 833                 return -1;
 834         }
 835
 836         if (IS_ZEBRA_DEBUG_KERNEL_MSGDUMP_RECV) {
 837                 zlog_debug("%s: << netlink message dump [recv]", __func__);
 838 #ifdef NETLINK_DEBUG
 839                 nl_dump(nl->buf, status);
 840 #else
 841                 zlog_hexdump(nl->buf, status);
 842 #endif /* NETLINK_DEBUG */
 843         }
 844
 845         return status;
 846 }
 847
 848 /*
 849  * netlink_parse_error - parse a netlink error message
 850  *
 851  * Returns 1 if this message is acknowledgement, 0 if this error should be
 852  * ignored, -1 otherwise.
 853  */
 854 static int netlink_parse_error(const struct nlsock *nl, struct nlmsghdr *h,
 855                                bool is_cmd, bool startup)
 856 {
 857         struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(h);
 858         int errnum = err->error;
 859         int msg_type = err->msg.nlmsg_type;
 860
 861         if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
 862                 flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
 863                          "%s error: message truncated", nl->name);
 864                 return -1;
 865         }
 866
 867         /*
 868          * Parse the extended information before we actually handle it. At this
 869          * point in time we do not do anything other than report the issue.
 870          */
 871         if (h->nlmsg_flags & NLM_F_ACK_TLVS)
 872                 netlink_parse_extended_ack(h);
 873
 874         /* If the error field is zero, then this is an ACK. */
 875         if (err->error == 0) {
 876                 if (IS_ZEBRA_DEBUG_KERNEL) {
 877                         zlog_debug("%s: %s ACK: type=%s(%u), seq=%u, pid=%u",
 878                                    __func__, nl->name,
 879                                    nl_msg_type_to_str(err->msg.nlmsg_type),
 880                                    err->msg.nlmsg_type, err->msg.nlmsg_seq,
 881                                    err->msg.nlmsg_pid);
 882                 }
 883
 884                 return 1;
 885         }
 886
 887         /* Deal with errors that occur because of races in link handling. */
 888         if (is_cmd
 889             && ((msg_type == RTM_DELROUTE
 890                  && (-errnum == ENODEV || -errnum == ESRCH))
 891                 || (msg_type == RTM_NEWROUTE
 892                     && (-errnum == ENETDOWN || -errnum == EEXIST)))) {
 893                 if (IS_ZEBRA_DEBUG_KERNEL)
 894                         zlog_debug("%s: error: %s type=%s(%u), seq=%u, pid=%u",
 895                                    nl->name, safe_strerror(-errnum),
 896                                    nl_msg_type_to_str(msg_type), msg_type,
 897                                    err->msg.nlmsg_seq, err->msg.nlmsg_pid);
 898                 return 0;
 899         }
 900
 901         /*
 902          * We see RTM_DELNEIGH when shutting down an interface with an IPv4
 903          * link-local.  The kernel should have already deleted the neighbor so
 904          * do not log these as an error.
 905          */
 906         if (msg_type == RTM_DELNEIGH
 907             || (is_cmd && msg_type == RTM_NEWROUTE
 908                 && (-errnum == ESRCH || -errnum == ENETUNREACH))) {
 909                 /*
 910                  * This is known to happen in some situations, don't log as
 911                  * error.
 912                  */
 913                 if (IS_ZEBRA_DEBUG_KERNEL)
 914                         zlog_debug("%s error: %s, type=%s(%u), seq=%u, pid=%u",
 915                                    nl->name, safe_strerror(-errnum),
 916                                    nl_msg_type_to_str(msg_type), msg_type,
 917                                    err->msg.nlmsg_seq, err->msg.nlmsg_pid);
 918         } else {
 919                 if ((msg_type != RTM_GETNEXTHOP) || !startup)
 920                         flog_err(EC_ZEBRA_UNEXPECTED_MESSAGE,
 921                                  "%s error: %s, type=%s(%u), seq=%u, pid=%u",
 922                                  nl->name, safe_strerror(-errnum),
 923                                  nl_msg_type_to_str(msg_type), msg_type,
 924                                  err->msg.nlmsg_seq, err->msg.nlmsg_pid);
 925         }
 926
 927         return -1;
 928 }
 929
 930 /*
 931  * netlink_parse_info
 932  *
 933  * Receive message from netlink interface and pass those information
 934  *  to the given function.
 935  *
 936  * filter  -> Function to call to read the results
 937  * nl      -> netlink socket information
 938  * zns     -> The zebra namespace data
 939  * count   -> How many we should read in, 0 means as much as possible
 940  * startup -> Are we reading in under startup conditions? passed to
 941  *            the filter.
 942  */
 943 int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
 944                        struct nlsock *nl, const struct zebra_dplane_info *zns,
 945                        int count, bool startup)
 946 {
 947         int status;
 948         int ret = 0;
 949         int error;
 950         int read_in = 0;
 951
 952         while (1) {
 953                 struct sockaddr_nl snl;
 954                 struct msghdr msg = {.msg_name = (void *)&snl,
 955                                      .msg_namelen = sizeof(snl)};
 956                 struct nlmsghdr *h;
 957
 958                 if (count && read_in >= count)
 959                         return 0;
 960
 961                 status = netlink_recv_msg(nl, &msg);
 962                 if (status == -1)
 963                         return -1;
 964                 else if (status == 0)
 965                         break;
 966
 967                 read_in++;
 968                 for (h = (struct nlmsghdr *)nl->buf;
 969                      (status >= 0 && NLMSG_OK(h, (unsigned int)status));
 970                      h = NLMSG_NEXT(h, status)) {
 971                         /* Finish of reading. */
 972                         if (h->nlmsg_type == NLMSG_DONE)
 973                                 return ret;
 974
 975                         /* Error handling. */
 976                         if (h->nlmsg_type == NLMSG_ERROR) {
 977                                 int err = netlink_parse_error(
 978                                         nl, h, zns->is_cmd, startup);
 979
 980                                 if (err == 1) {
 981                                         if (!(h->nlmsg_flags & NLM_F_MULTI))
 982                                                 return 0;
 983                                         continue;
 984                                 } else
 985                                         return err;
 986                         }
 987
 988                         /* OK we got netlink message. */
 989                         if (IS_ZEBRA_DEBUG_KERNEL)
 990                                 zlog_debug(
 991                                         "%s: %s type %s(%u), len=%d, seq=%u, pid=%u",
 992                                         __func__, nl->name,
 993                                         nl_msg_type_to_str(h->nlmsg_type),
 994                                         h->nlmsg_type, h->nlmsg_len,
 995                                         h->nlmsg_seq, h->nlmsg_pid);
 996
 997
 998                         /*
 999                          * Ignore messages that maybe sent from
1000                          * other actors besides the kernel
1001                          */
1002                         if (snl.nl_pid != 0) {
1003                                 zlog_debug("Ignoring message from pid %u",
1004                                            snl.nl_pid);
1005                                 continue;
1006                         }
1007
1008                         error = (*filter)(h, zns->ns_id, startup);
1009                         if (error < 0) {
1010                                 zlog_debug("%s filter function error",
1011                                            nl->name);
1012                                 ret = error;
1013                         }
1014                 }
1015
1016                 /* After error care. */
1017                 if (msg.msg_flags & MSG_TRUNC) {
1018                         flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1019                                  "%s error: message truncated", nl->name);
1020                         continue;
1021                 }
1022                 if (status) {
1023                         flog_err(EC_ZEBRA_NETLINK_LENGTH_ERROR,
1024                                  "%s error: data remnant size %d", nl->name,
1025                                  status);
1026                         return -1;
1027                 }
1028         }
1029         return ret;
1030 }
1031
1032 /*
1033  * netlink_talk_info
1034  *
1035  * sendmsg() to netlink socket then recvmsg().
1036  * Calls netlink_parse_info to parse returned data
1037  *
1038  * filter   -> The filter to read final results from kernel
1039  * nlmsghdr -> The data to send to the kernel
1040  * dp_info -> The dataplane and netlink socket information
1041  * startup  -> Are we reading in under startup conditions
1042  *             This is passed through eventually to filter.
1043  */
1044 static int netlink_talk_info(int (*filter)(struct nlmsghdr *, ns_id_t,
1045                                            int startup),
1046                              struct nlmsghdr *n,
1047                              struct zebra_dplane_info *dp_info, bool startup)
1048 {
1049         struct nlsock *nl;
1050
1051         nl = kernel_netlink_nlsock_lookup(dp_info->sock);
1052         n->nlmsg_seq = dp_info->seq;
1053         n->nlmsg_pid = nl->snl.nl_pid;
1054
1055         if (IS_ZEBRA_DEBUG_KERNEL)
1056                 zlog_debug(
1057                         "netlink_talk: %s type %s(%u), len=%d seq=%u flags 0x%x",
1058                         nl->name, nl_msg_type_to_str(n->nlmsg_type),
1059                         n->nlmsg_type, n->nlmsg_len, n->nlmsg_seq,
1060                         n->nlmsg_flags);
1061
1062         if (netlink_send_msg(nl, n, n->nlmsg_len) == -1)
1063                 return -1;
1064
1065         /*
1066          * Get reply from netlink socket.
1067          * The reply should either be an acknowlegement or an error.
1068          */
1069         return netlink_parse_info(filter, nl, dp_info, 0, startup);
1070 }
1071
1072 /*
1073  * Synchronous version of netlink_talk_info. Converts args to suit the
1074  * common version, which is suitable for both sync and async use.
1075  */
1076 int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
1077                  struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
1078                  bool startup)
1079 {
1080         struct zebra_dplane_info dp_info;
1081
1082         /* Increment sequence number before capturing snapshot of ns socket
1083          * info.
1084          */
1085         nl->seq++;
1086
1087         /* Capture info in intermediate info struct */
1088         zebra_dplane_info_from_zns(&dp_info, zns, (nl == &(zns->netlink_cmd)));
1089
1090         return netlink_talk_info(filter, n, &dp_info, startup);
1091 }
1092
1093 /* Issue request message to kernel via netlink socket. GET messages
1094  * are issued through this interface.
1095  */
1096 int netlink_request(struct nlsock *nl, void *req)
1097 {
1098         struct nlmsghdr *n = (struct nlmsghdr *)req;
1099
1100         /* Check netlink socket. */
1101         if (nl->sock < 0) {
1102                 flog_err_sys(EC_LIB_SOCKET, "%s socket isn't active.",
1103                              nl->name);
1104                 return -1;
1105         }
1106
1107         /* Fill common fields for all requests. */
1108         n->nlmsg_pid = nl->snl.nl_pid;
1109         n->nlmsg_seq = ++nl->seq;
1110
1111         if (netlink_send_msg(nl, req, n->nlmsg_len) == -1)
1112                 return -1;
1113
1114         return 0;
1115 }
1116
1117 static int nl_batch_read_resp(struct nl_batch *bth)
1118 {
1119         struct nlmsghdr *h;
1120         struct sockaddr_nl snl;
1121         struct msghdr msg = {};
1122         int status, seq;
1123         struct nlsock *nl;
1124         struct zebra_dplane_ctx *ctx;
1125         bool ignore_msg;
1126
1127         nl = kernel_netlink_nlsock_lookup(bth->zns->sock);
1128
1129         msg.msg_name = (void *)&snl;
1130         msg.msg_namelen = sizeof(snl);
1131
1132         /*
1133          * The responses are not batched, so we need to read and process one
1134          * message at a time.
1135          */
1136         while (true) {
1137                 status = netlink_recv_msg(nl, &msg);
1138                 /*
1139                  * status == -1 is a full on failure somewhere
1140                  * since we don't know where the problem happened
1141                  * we must mark all as failed
1142                  *
1143                  * Else we mark everything as worked
1144                  *
1145                  */
1146                 if (status == -1 || status == 0) {
1147                         while ((ctx = dplane_ctx_dequeue(&(bth->ctx_list))) !=
1148                                NULL) {
1149                                 if (status == -1)
1150                                         dplane_ctx_set_status(
1151                                                 ctx,
1152                                                 ZEBRA_DPLANE_REQUEST_FAILURE);
1153                                 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1154                         }
1155                         return status;
1156                 }
1157
1158                 h = (struct nlmsghdr *)nl->buf;
1159                 ignore_msg = false;
1160                 seq = h->nlmsg_seq;
1161                 /*
1162                  * Find the corresponding context object. Received responses are
1163                  * in the same order as requests we sent, so we can simply
1164                  * iterate over the context list and match responses with
1165                  * requests at same time.
1166                  */
1167                 while (true) {
1168                         ctx = dplane_ctx_get_head(&(bth->ctx_list));
1169                         if (ctx == NULL) {
1170                                 /*
1171                                  * This is a situation where we have gotten
1172                                  * into a bad spot.  We need to know that
1173                                  * this happens( does it? )
1174                                  */
1175                                 zlog_err(
1176                                         "%s:WARNING Received netlink Response for an error and no Contexts to associate with it",
1177                                         __func__);
1178                                 break;
1179                         }
1180
1181                         /*
1182                          * 'update' context objects take two consecutive
1183                          * sequence numbers.
1184                          */
1185                         if (dplane_ctx_is_update(ctx) &&
1186                             dplane_ctx_get_ns(ctx)->seq + 1 == seq) {
1187                                 /*
1188                                  * This is the situation where we get a response
1189                                  * to a message that should be ignored.
1190                                  */
1191                                 ignore_msg = true;
1192                                 break;
1193                         }
1194
1195                         ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1196                         dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1197
1198                         /* We have found corresponding context object. */
1199                         if (dplane_ctx_get_ns(ctx)->seq == seq)
1200                                 break;
1201
1202                         if (dplane_ctx_get_ns(ctx)->seq > seq)
1203                                 zlog_warn(
1204                                         "%s:WARNING Recieved %u is less than any context on the queue ctx->seq %u",
1205                                         __func__, seq,
1206                                         dplane_ctx_get_ns(ctx)->seq);
1207                 }
1208
1209                 if (ignore_msg) {
1210                         /*
1211                          * If we ignore the message due to an update
1212                          * above we should still fricking decode the
1213                          * message for our operator to understand
1214                          * what is going on
1215                          */
1216                         int err = netlink_parse_error(nl, h, bth->zns->is_cmd,
1217                                                       false);
1218
1219                         zlog_debug("%s: netlink error message seq=%d %d",
1220                                    __func__, h->nlmsg_seq, err);
1221                         continue;
1222                 }
1223
1224                 /*
1225                  * We received a message with the sequence number that isn't
1226                  * associated with any dplane context object.
1227                  */
1228                 if (ctx == NULL) {
1229                         if (IS_ZEBRA_DEBUG_KERNEL)
1230                                 zlog_debug(
1231                                         "%s: skipping unassociated response, seq number %d NS %u",
1232                                         __func__, h->nlmsg_seq,
1233                                         bth->zns->ns_id);
1234                         continue;
1235                 }
1236
1237                 if (h->nlmsg_type == NLMSG_ERROR) {
1238                         int err = netlink_parse_error(nl, h, bth->zns->is_cmd,
1239                                                       false);
1240
1241                         if (err == -1)
1242                                 dplane_ctx_set_status(
1243                                         ctx, ZEBRA_DPLANE_REQUEST_FAILURE);
1244
1245                         if (IS_ZEBRA_DEBUG_KERNEL)
1246                                 zlog_debug("%s: netlink error message seq=%d ",
1247                                            __func__, h->nlmsg_seq);
1248                         continue;
1249                 }
1250
1251                 /*
1252                  * If we get here then we did not receive neither the ack nor
1253                  * the error and instead received some other message in an
1254                  * unexpected way.
1255                  */
1256                 if (IS_ZEBRA_DEBUG_KERNEL)
1257                         zlog_debug("%s: ignoring message type 0x%04x(%s) NS %u",
1258                                    __func__, h->nlmsg_type,
1259                                    nl_msg_type_to_str(h->nlmsg_type),
1260                                    bth->zns->ns_id);
1261         }
1262
1263         return 0;
1264 }
1265
1266 static void nl_batch_reset(struct nl_batch *bth)
1267 {
1268         bth->buf_head = bth->buf;
1269         bth->curlen = 0;
1270         bth->msgcnt = 0;
1271         bth->zns = NULL;
1272
1273         TAILQ_INIT(&(bth->ctx_list));
1274 }
1275
1276 static void nl_batch_init(struct nl_batch *bth, struct dplane_ctx_q *ctx_out_q)
1277 {
1278         /*
1279          * If the size of the buffer has changed, free and then allocate a new
1280          * one.
1281          */
1282         size_t bufsize =
1283                 atomic_load_explicit(&nl_batch_bufsize, memory_order_relaxed);
1284         if (bufsize != nl_batch_tx_bufsize) {
1285                 if (nl_batch_tx_buf)
1286                         XFREE(MTYPE_NL_BUF, nl_batch_tx_buf);
1287
1288                 nl_batch_tx_buf = XCALLOC(MTYPE_NL_BUF, bufsize);
1289                 nl_batch_tx_bufsize = bufsize;
1290         }
1291
1292         bth->buf = nl_batch_tx_buf;
1293         bth->bufsiz = bufsize;
1294         bth->limit = atomic_load_explicit(&nl_batch_send_threshold,
1295                                           memory_order_relaxed);
1296
1297         bth->ctx_out_q = ctx_out_q;
1298
1299         nl_batch_reset(bth);
1300 }
1301
1302 static void nl_batch_send(struct nl_batch *bth)
1303 {
1304         struct zebra_dplane_ctx *ctx;
1305         bool err = false;
1306
1307         if (bth->curlen != 0 && bth->zns != NULL) {
1308                 struct nlsock *nl =
1309                         kernel_netlink_nlsock_lookup(bth->zns->sock);
1310
1311                 if (IS_ZEBRA_DEBUG_KERNEL)
1312                         zlog_debug("%s: %s, batch size=%zu, msg cnt=%zu",
1313                                    __func__, nl->name, bth->curlen,
1314                                    bth->msgcnt);
1315
1316                 if (netlink_send_msg(nl, bth->buf, bth->curlen) == -1)
1317                         err = true;
1318
1319                 if (!err) {
1320                         if (nl_batch_read_resp(bth) == -1)
1321                                 err = true;
1322                 }
1323         }
1324
1325         /* Move remaining contexts to the outbound queue. */
1326         while (true) {
1327                 ctx = dplane_ctx_dequeue(&(bth->ctx_list));
1328                 if (ctx == NULL)
1329                         break;
1330
1331                 if (err)
1332                         dplane_ctx_set_status(ctx,
1333                                               ZEBRA_DPLANE_REQUEST_FAILURE);
1334
1335                 dplane_ctx_enqueue_tail(bth->ctx_out_q, ctx);
1336         }
1337
1338         nl_batch_reset(bth);
1339 }
1340
1341 enum netlink_msg_status netlink_batch_add_msg(
1342         struct nl_batch *bth, struct zebra_dplane_ctx *ctx,
1343         ssize_t (*msg_encoder)(struct zebra_dplane_ctx *, void *, size_t),
1344         bool ignore_res)
1345 {
1346         int seq;
1347         ssize_t size;
1348         struct nlmsghdr *msgh;
1349         struct nlsock *nl;
1350
1351         size = (*msg_encoder)(ctx, bth->buf_head, bth->bufsiz - bth->curlen);
1352
1353         /*
1354          * If there was an error while encoding the message (other than buffer
1355          * overflow) then return an error.
1356          */
1357         if (size < 0)
1358                 return FRR_NETLINK_ERROR;
1359
1360         /*
1361          * If the message doesn't fit entirely in the buffer then send the batch
1362          * and retry.
1363          */
1364         if (size == 0) {
1365                 nl_batch_send(bth);
1366                 size = (*msg_encoder)(ctx, bth->buf_head,
1367                                       bth->bufsiz - bth->curlen);
1368                 /*
1369                  * If the message doesn't fit in the empty buffer then just
1370                  * return an error.
1371                  */
1372                 if (size <= 0)
1373                         return FRR_NETLINK_ERROR;
1374         }
1375
1376         seq = dplane_ctx_get_ns(ctx)->seq;
1377         nl = kernel_netlink_nlsock_lookup(dplane_ctx_get_ns_sock(ctx));
1378
1379         if (ignore_res)
1380                 seq++;
1381
1382         msgh = (struct nlmsghdr *)bth->buf_head;
1383         msgh->nlmsg_seq = seq;
1384         msgh->nlmsg_pid = nl->snl.nl_pid;
1385
1386         bth->zns = dplane_ctx_get_ns(ctx);
1387         bth->buf_head = ((char *)bth->buf_head) + size;
1388         bth->curlen += size;
1389         bth->msgcnt++;
1390
1391         return FRR_NETLINK_QUEUED;
1392 }
1393
1394 static enum netlink_msg_status nl_put_msg(struct nl_batch *bth,
1395                                           struct zebra_dplane_ctx *ctx)
1396 {
1397         if (dplane_ctx_is_skip_kernel(ctx))
1398                 return FRR_NETLINK_SUCCESS;
1399
1400         switch (dplane_ctx_get_op(ctx)) {
1401
1402         case DPLANE_OP_ROUTE_INSTALL:
1403         case DPLANE_OP_ROUTE_UPDATE:
1404         case DPLANE_OP_ROUTE_DELETE:
1405                 return netlink_put_route_update_msg(bth, ctx);
1406
1407         case DPLANE_OP_NH_INSTALL:
1408         case DPLANE_OP_NH_UPDATE:
1409         case DPLANE_OP_NH_DELETE:
1410                 return netlink_put_nexthop_update_msg(bth, ctx);
1411
1412         case DPLANE_OP_LSP_INSTALL:
1413         case DPLANE_OP_LSP_UPDATE:
1414         case DPLANE_OP_LSP_DELETE:
1415                 return netlink_put_lsp_update_msg(bth, ctx);
1416
1417         case DPLANE_OP_PW_INSTALL:
1418         case DPLANE_OP_PW_UNINSTALL:
1419                 return netlink_put_pw_update_msg(bth, ctx);
1420
1421         case DPLANE_OP_ADDR_INSTALL:
1422         case DPLANE_OP_ADDR_UNINSTALL:
1423                 return netlink_put_address_update_msg(bth, ctx);
1424
1425         case DPLANE_OP_MAC_INSTALL:
1426         case DPLANE_OP_MAC_DELETE:
1427                 return netlink_put_mac_update_msg(bth, ctx);
1428
1429         case DPLANE_OP_NEIGH_INSTALL:
1430         case DPLANE_OP_NEIGH_UPDATE:
1431         case DPLANE_OP_NEIGH_DELETE:
1432         case DPLANE_OP_VTEP_ADD:
1433         case DPLANE_OP_VTEP_DELETE:
1434         case DPLANE_OP_NEIGH_DISCOVER:
1435         case DPLANE_OP_NEIGH_IP_INSTALL:
1436         case DPLANE_OP_NEIGH_IP_DELETE:
1437         case DPLANE_OP_NEIGH_TABLE_UPDATE:
1438                 return netlink_put_neigh_update_msg(bth, ctx);
1439
1440         case DPLANE_OP_RULE_ADD:
1441         case DPLANE_OP_RULE_DELETE:
1442         case DPLANE_OP_RULE_UPDATE:
1443                 return netlink_put_rule_update_msg(bth, ctx);
1444
1445         case DPLANE_OP_SYS_ROUTE_ADD:
1446         case DPLANE_OP_SYS_ROUTE_DELETE:
1447         case DPLANE_OP_ROUTE_NOTIFY:
1448         case DPLANE_OP_LSP_NOTIFY:
1449         case DPLANE_OP_BR_PORT_UPDATE:
1450                 return FRR_NETLINK_SUCCESS;
1451
1452         case DPLANE_OP_IPTABLE_ADD:
1453         case DPLANE_OP_IPTABLE_DELETE:
1454         case DPLANE_OP_IPSET_ADD:
1455         case DPLANE_OP_IPSET_DELETE:
1456         case DPLANE_OP_IPSET_ENTRY_ADD:
1457         case DPLANE_OP_IPSET_ENTRY_DELETE:
1458                 return FRR_NETLINK_ERROR;
1459
1460         case DPLANE_OP_GRE_SET:
1461                 return netlink_put_gre_set_msg(bth, ctx);
1462
1463         case DPLANE_OP_INTF_ADDR_ADD:
1464         case DPLANE_OP_INTF_ADDR_DEL:
1465         case DPLANE_OP_NONE:
1466                 return FRR_NETLINK_ERROR;
1467         }
1468
1469         return FRR_NETLINK_ERROR;
1470 }
1471
1472 void kernel_update_multi(struct dplane_ctx_q *ctx_list)
1473 {
1474         struct nl_batch batch;
1475         struct zebra_dplane_ctx *ctx;
1476         struct dplane_ctx_q handled_list;
1477         enum netlink_msg_status res;
1478
1479         TAILQ_INIT(&handled_list);
1480         nl_batch_init(&batch, &handled_list);
1481
1482         while (true) {
1483                 ctx = dplane_ctx_dequeue(ctx_list);
1484                 if (ctx == NULL)
1485                         break;
1486
1487                 if (batch.zns != NULL
1488                     && batch.zns->ns_id != dplane_ctx_get_ns(ctx)->ns_id)
1489                         nl_batch_send(&batch);
1490
1491                 /*
1492                  * Assume all messages will succeed and then mark only the ones
1493                  * that failed.
1494                  */
1495                 dplane_ctx_set_status(ctx, ZEBRA_DPLANE_REQUEST_SUCCESS);
1496
1497                 res = nl_put_msg(&batch, ctx);
1498
1499                 dplane_ctx_enqueue_tail(&(batch.ctx_list), ctx);
1500                 if (res == FRR_NETLINK_ERROR)
1501                         dplane_ctx_set_status(ctx,
1502                                               ZEBRA_DPLANE_REQUEST_FAILURE);
1503
1504                 if (batch.curlen > batch.limit)
1505                         nl_batch_send(&batch);
1506         }
1507
1508         nl_batch_send(&batch);
1509
1510         TAILQ_INIT(ctx_list);
1511         dplane_ctx_list_append(ctx_list, &handled_list);
1512 }
1513
1514 struct nlsock *kernel_netlink_nlsock_lookup(int sock)
1515 {
1516         struct nlsock lookup;
1517
1518         lookup.sock = sock;
1519
1520         return hash_lookup(nlsock_hash, &lookup);
1521 }
1522
1523 static uint32_t kernel_netlink_nlsock_key(const void *arg)
1524 {
1525         const struct nlsock *nl = arg;
1526
1527         return nl->sock;
1528 }
1529
1530 static bool kernel_netlink_nlsock_hash_equal(const void *arg1, const void *arg2)
1531 {
1532         const struct nlsock *nl1 = arg1;
1533         const struct nlsock *nl2 = arg2;
1534
1535         if (nl1->sock == nl2->sock)
1536                 return true;
1537
1538         return false;
1539 }
1540
1541 /* Exported interface function.  This function simply calls
1542    netlink_socket (). */
1543 void kernel_init(struct zebra_ns *zns)
1544 {
1545         uint32_t groups, dplane_groups;
1546 #if defined SOL_NETLINK
1547         int one, ret;
1548 #endif
1549
1550         if (!nlsock_hash)
1551                 nlsock_hash = hash_create_size(8, kernel_netlink_nlsock_key,
1552                                                kernel_netlink_nlsock_hash_equal,
1553                                                "Netlink Socket Hash");
1554
1555         /*
1556          * Initialize netlink sockets
1557          *
1558          * If RTMGRP_XXX exists use that, but at some point
1559          * I think the kernel developers realized that
1560          * keeping track of all the different values would
1561          * lead to confusion, so we need to convert the
1562          * RTNLGRP_XXX to a bit position for ourself
1563          */
1564         groups = RTMGRP_LINK                   |
1565                 RTMGRP_IPV4_ROUTE              |
1566                 RTMGRP_IPV4_IFADDR             |
1567                 RTMGRP_IPV6_ROUTE              |
1568                 RTMGRP_IPV6_IFADDR             |
1569                 RTMGRP_IPV4_MROUTE             |
1570                 RTMGRP_NEIGH                   |
1571                 ((uint32_t) 1 << (RTNLGRP_IPV4_RULE - 1)) |
1572                 ((uint32_t) 1 << (RTNLGRP_IPV6_RULE - 1)) |
1573                 ((uint32_t) 1 << (RTNLGRP_NEXTHOP - 1));
1574
1575         dplane_groups = (RTMGRP_LINK            |
1576                          RTMGRP_IPV4_IFADDR     |
1577                          RTMGRP_IPV6_IFADDR);
1578
1579         snprintf(zns->netlink.name, sizeof(zns->netlink.name),
1580                  "netlink-listen (NS %u)", zns->ns_id);
1581         zns->netlink.sock = -1;
1582         if (netlink_socket(&zns->netlink, groups, zns->ns_id) < 0) {
1583                 zlog_err("Failure to create %s socket",
1584                          zns->netlink.name);
1585                 exit(-1);
1586         }
1587         (void)hash_get(nlsock_hash, &zns->netlink, hash_alloc_intern);
1588
1589         snprintf(zns->netlink_cmd.name, sizeof(zns->netlink_cmd.name),
1590                  "netlink-cmd (NS %u)", zns->ns_id);
1591         zns->netlink_cmd.sock = -1;
1592         if (netlink_socket(&zns->netlink_cmd, 0, zns->ns_id) < 0) {
1593                 zlog_err("Failure to create %s socket",
1594                          zns->netlink_cmd.name);
1595                 exit(-1);
1596         }
1597         (void)hash_get(nlsock_hash, &zns->netlink_cmd, hash_alloc_intern);
1598
1599         /* Outbound socket for dplane programming of the host OS. */
1600         snprintf(zns->netlink_dplane_out.name,
1601                  sizeof(zns->netlink_dplane_out.name), "netlink-dp (NS %u)",
1602                  zns->ns_id);
1603         zns->netlink_dplane_out.sock = -1;
1604         if (netlink_socket(&zns->netlink_dplane_out, 0, zns->ns_id) < 0) {
1605                 zlog_err("Failure to create %s socket",
1606                          zns->netlink_dplane_out.name);
1607                 exit(-1);
1608         }
1609         (void)hash_get(nlsock_hash, &zns->netlink_dplane_out,
1610                        hash_alloc_intern);
1611
1612         /* Inbound socket for OS events coming to the dplane. */
1613         snprintf(zns->netlink_dplane_in.name,
1614                  sizeof(zns->netlink_dplane_in.name), "netlink-dp-in (NS %u)",
1615                  zns->ns_id);
1616         zns->netlink_dplane_in.sock = -1;
1617         if (netlink_socket(&zns->netlink_dplane_in, dplane_groups,
1618                            zns->ns_id) < 0) {
1619                 zlog_err("Failure to create %s socket",
1620                          zns->netlink_dplane_in.name);
1621                 exit(-1);
1622         }
1623         (void)hash_get(nlsock_hash, &zns->netlink_dplane_in, hash_alloc_intern);
1624
1625         /*
1626          * SOL_NETLINK is not available on all platforms yet
1627          * apparently.  It's in bits/socket.h which I am not
1628          * sure that we want to pull into our build system.
1629          */
1630 #if defined SOL_NETLINK
1631         /*
1632          * Let's tell the kernel that we want to receive extended
1633          * ACKS over our command socket(s)
1634          */
1635         one = 1;
1636         ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
1637                          &one, sizeof(one));
1638
1639         if (ret < 0)
1640                 zlog_notice("Registration for extended cmd ACK failed : %d %s",
1641                             errno, safe_strerror(errno));
1642
1643         one = 1;
1644         ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1645                          NETLINK_EXT_ACK, &one, sizeof(one));
1646
1647         if (ret < 0)
1648                 zlog_notice("Registration for extended dp ACK failed : %d %s",
1649                             errno, safe_strerror(errno));
1650
1651         /*
1652          * Trim off the payload of the original netlink message in the
1653          * acknowledgment. This option is available since Linux 4.2, so if
1654          * setsockopt fails, ignore the error.
1655          */
1656         one = 1;
1657         ret = setsockopt(zns->netlink_dplane_out.sock, SOL_NETLINK,
1658                          NETLINK_CAP_ACK, &one, sizeof(one));
1659         if (ret < 0)
1660                 zlog_notice(
1661                         "Registration for reduced ACK packet size failed, probably running an early kernel");
1662 #endif
1663
1664         /* Register kernel socket. */
1665         if (fcntl(zns->netlink.sock, F_SETFL, O_NONBLOCK) < 0)
1666                 flog_err_sys(EC_LIB_SOCKET, "Can't set %s socket flags: %s",
1667                              zns->netlink.name, safe_strerror(errno));
1668
1669         if (fcntl(zns->netlink_cmd.sock, F_SETFL, O_NONBLOCK) < 0)
1670                 zlog_err("Can't set %s socket error: %s(%d)",
1671                          zns->netlink_cmd.name, safe_strerror(errno), errno);
1672
1673         if (fcntl(zns->netlink_dplane_out.sock, F_SETFL, O_NONBLOCK) < 0)
1674                 zlog_err("Can't set %s socket error: %s(%d)",
1675                          zns->netlink_dplane_out.name, safe_strerror(errno),
1676                          errno);
1677
1678         if (fcntl(zns->netlink_dplane_in.sock, F_SETFL, O_NONBLOCK) < 0)
1679                 zlog_err("Can't set %s socket error: %s(%d)",
1680                          zns->netlink_dplane_in.name, safe_strerror(errno),
1681                          errno);
1682
1683         /* Set receive buffer size if it's set from command line */
1684         if (nl_rcvbufsize) {
1685                 netlink_recvbuf(&zns->netlink, nl_rcvbufsize);
1686                 netlink_recvbuf(&zns->netlink_cmd, nl_rcvbufsize);
1687                 netlink_recvbuf(&zns->netlink_dplane_out, nl_rcvbufsize);
1688                 netlink_recvbuf(&zns->netlink_dplane_in, nl_rcvbufsize);
1689         }
1690
1691         /* Set filter for inbound sockets, to exclude events we've generated
1692          * ourselves.
1693          */
1694         netlink_install_filter(zns->netlink.sock, zns->netlink_cmd.snl.nl_pid,
1695                                zns->netlink_dplane_out.snl.nl_pid);
1696
1697         netlink_install_filter(zns->netlink_dplane_in.sock,
1698                                zns->netlink_cmd.snl.nl_pid,
1699                                zns->netlink_dplane_out.snl.nl_pid);
1700
1701         zns->t_netlink = NULL;
1702
1703         thread_add_read(zrouter.master, kernel_read, zns,
1704                         zns->netlink.sock, &zns->t_netlink);
1705
1706         rt_netlink_init();
1707 }
1708
1709 void kernel_terminate(struct zebra_ns *zns, bool complete)
1710 {
1711         thread_cancel(&zns->t_netlink);
1712
1713         if (zns->netlink.sock >= 0) {
1714                 hash_release(nlsock_hash, &zns->netlink);
1715                 close(zns->netlink.sock);
1716                 zns->netlink.sock = -1;
1717                 XFREE(MTYPE_NL_BUF, zns->netlink.buf);
1718                 zns->netlink.buflen = 0;
1719         }
1720
1721         if (zns->netlink_cmd.sock >= 0) {
1722                 hash_release(nlsock_hash, &zns->netlink_cmd);
1723                 close(zns->netlink_cmd.sock);
1724                 zns->netlink_cmd.sock = -1;
1725                 XFREE(MTYPE_NL_BUF, zns->netlink_cmd.buf);
1726                 zns->netlink_cmd.buflen = 0;
1727         }
1728
1729         if (zns->netlink_dplane_in.sock >= 0) {
1730                 hash_release(nlsock_hash, &zns->netlink_dplane_in);
1731                 close(zns->netlink_dplane_in.sock);
1732                 zns->netlink_dplane_in.sock = -1;
1733                 XFREE(MTYPE_NL_BUF, zns->netlink_dplane_in.buf);
1734                 zns->netlink_dplane_in.buflen = 0;
1735         }
1736
1737         /* During zebra shutdown, we need to leave the dataplane socket
1738          * around until all work is done.
1739          */
1740         if (complete) {
1741                 if (zns->netlink_dplane_out.sock >= 0) {
1742                         hash_release(nlsock_hash, &zns->netlink_dplane_out);
1743                         close(zns->netlink_dplane_out.sock);
1744                         zns->netlink_dplane_out.sock = -1;
1745                         XFREE(MTYPE_NL_BUF, zns->netlink_dplane_out.buf);
1746                         zns->netlink_dplane_out.buflen = 0;
1747                 }
1748
1749                 hash_free(nlsock_hash);
1750         }
1751 }
1752 #endif /* HAVE_NETLINK */