#include "zebra/kernel_netlink.h"
#include "zebra/rt_netlink.h"
#include "zebra/if_netlink.h"
+#include "zebra/rule_netlink.h"
#ifndef SO_RCVBUFFORCE
#define SO_RCVBUFFORCE (33)
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg) \
- ((struct rtattr *)(((u_char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+ ((struct rtattr *)(((uint8_t *)(nmsg)) \
+ + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif
#ifndef RTA_TAIL
#define RTA_TAIL(rta) \
- ((struct rtattr *)(((u_char *)(rta)) + RTA_ALIGN((rta)->rta_len)))
+ ((struct rtattr *)(((uint8_t *)(rta)) + RTA_ALIGN((rta)->rta_len)))
#endif
#ifndef RTNL_FAMILY_IP6MR
{RTM_NEWNEIGH, "RTM_NEWNEIGH"},
{RTM_DELNEIGH, "RTM_DELNEIGH"},
{RTM_GETNEIGH, "RTM_GETNEIGH"},
+ {RTM_NEWRULE, "RTM_NEWRULE"},
+ {RTM_DELRULE, "RTM_DELRULE"},
+ {RTM_GETRULE, "RTM_GETRULE"},
{0}};
static const struct message rtproto_str[] = {
{RTPROT_ISIS, "IS-IS"},
{RTPROT_RIP, "RIP"},
{RTPROT_RIPNG, "RIPNG"},
+ {RTPROT_ZSTATIC, "static"},
{0}};
static const struct message family_str[] = {{AF_INET, "ipv4"},
{0}};
extern struct thread_master *master;
-extern u_int32_t nl_rcvbufsize;
+extern uint32_t nl_rcvbufsize;
extern struct zebra_privs_t zserv_privs;
-int netlink_talk_filter(struct sockaddr_nl *snl, struct nlmsghdr *h,
- ns_id_t ns_id, int startup)
+int netlink_talk_filter(struct nlmsghdr *h, ns_id_t ns_id, int startup)
{
- zlog_warn("netlink_talk: ignoring message type 0x%04x NS %u",
- h->nlmsg_type, ns_id);
+ /*
+ * This is an error condition that must be handled during
+ * development.
+ *
+ * The netlink_talk_filter function is used for communication
+ * down the netlink_cmd pipe, where we expect to receive
+ * an ack. If we get here, we did not receive the ack
+ * and instead received some other, unexpected
+ * message.
+ *
+ * The message is only logged (type in hex and by name, plus
+ * the namespace id) and 0 is returned.
+ */
+ zlog_err("%s: ignoring message type 0x%04x(%s) NS %u",
+ __PRETTY_FUNCTION__, h->nlmsg_type,
+ nl_msg_type_to_str(h->nlmsg_type), ns_id);
return 0;
}
static int netlink_recvbuf(struct nlsock *nl, uint32_t newsize)
{
- u_int32_t oldsize;
+ uint32_t oldsize;
socklen_t newlen = sizeof(newsize);
socklen_t oldlen = sizeof(oldsize);
int ret;
return ret;
}
-static int netlink_information_fetch(struct sockaddr_nl *snl,
- struct nlmsghdr *h, ns_id_t ns_id,
+static int netlink_information_fetch(struct nlmsghdr *h, ns_id_t ns_id,
int startup)
{
- /* JF: Ignore messages that aren't from the kernel */
- if (snl->nl_pid != 0) {
- zlog_err("Ignoring message from pid %u", snl->nl_pid);
- return 0;
- }
-
+ /*
+ * When we start handling a new message type here
+ * because we have begun installing it,
+ * check the netlink_install_filter
+ * and see whether a corresponding
+ * allow-through entry should be added there.
+ * This is probably not needed, but please
+ * think about it.
+ */
switch (h->nlmsg_type) {
case RTM_NEWROUTE:
- return netlink_route_change(snl, h, ns_id, startup);
- break;
+ return netlink_route_change(h, ns_id, startup);
case RTM_DELROUTE:
- return netlink_route_change(snl, h, ns_id, startup);
- break;
+ return netlink_route_change(h, ns_id, startup);
case RTM_NEWLINK:
- return netlink_link_change(snl, h, ns_id, startup);
- break;
+ return netlink_link_change(h, ns_id, startup);
case RTM_DELLINK:
- return netlink_link_change(snl, h, ns_id, startup);
- break;
+ return netlink_link_change(h, ns_id, startup);
case RTM_NEWADDR:
- return netlink_interface_addr(snl, h, ns_id, startup);
- break;
+ return netlink_interface_addr(h, ns_id, startup);
case RTM_DELADDR:
- return netlink_interface_addr(snl, h, ns_id, startup);
- break;
+ return netlink_interface_addr(h, ns_id, startup);
case RTM_NEWNEIGH:
- return netlink_neigh_change(snl, h, ns_id);
- break;
+ return netlink_neigh_change(h, ns_id);
case RTM_DELNEIGH:
- return netlink_neigh_change(snl, h, ns_id);
- break;
+ return netlink_neigh_change(h, ns_id);
+ case RTM_NEWRULE:
+ return netlink_rule_change(h, ns_id, startup);
+ case RTM_DELRULE:
+ return netlink_rule_change(h, ns_id, startup);
default:
- if (IS_ZEBRA_DEBUG_KERNEL)
- zlog_debug("Unknown netlink nlmsg_type %d vrf %u\n",
- h->nlmsg_type, ns_id);
+ /*
+ * If we have received this message then
+ * we have made a mistake during development
+ * and we need to either write some code to
+ * handle this message type or stop asking for
+ * it to be sent up to us.
+ */
+ zlog_err("Unknown netlink nlmsg_type %s(%d) vrf %u\n",
+ nl_msg_type_to_str(h->nlmsg_type), h->nlmsg_type,
+ ns_id);
break;
}
return 0;
return 0;
}
-/* Filter out messages from self that occur on listener socket,
+/*
+ * Filter out messages from self that occur on listener socket,
* caused by our actions on the command socket
+ *
+ * When we add new Netlink message types we probably
+ * do not need to add them here, because we are filtering
+ * on the routes we actually care to receive (which is rarer
+ * than the normal course of operations). We are intentionally
+ * allowing some messages from ourselves through
+ * (I'm looking at you, interface-based netlink messages)
+ * so that we only have to write one way to handle incoming
+ * address add/delete changes.
*/
static void netlink_install_filter(int sock, __u32 pid)
{
+ /*
+ * BPF_JUMP targets are relative offsets, where an offset
+ * of 0 means the next statement. So count from 0. Writing
+ * this down because every time I look at this I have to
+ * re-remember it.
+ */
struct sock_filter filter[] = {
- /* 0: ldh [4] */
- BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
- offsetof(struct nlmsghdr, nlmsg_type)),
- /* 1: jeq 0x18 jt 5 jf next */
- BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWROUTE), 3, 0),
- /* 2: jeq 0x19 jt 5 jf next */
- BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELROUTE), 2, 0),
- /* 3: jeq 0x19 jt 5 jf next */
- BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWNEIGH), 1, 0),
- /* 4: jeq 0x19 jt 5 jf 8 */
- BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELNEIGH), 0, 3),
- /* 5: ldw [12] */
+ /*
+ * Logic:
+ * if (nlmsg_pid == pid) {
+ * if (the incoming nlmsg_type ==
+ * RTM_NEWADDR | RTM_DELADDR)
+ * keep this message
+ * else
+ * skip this message
+ * } else
+ * keep this netlink message
+ */
+ /*
+ * 0: Load the nlmsg_pid into the BPF register
+ */
BPF_STMT(BPF_LD | BPF_ABS | BPF_W,
offsetof(struct nlmsghdr, nlmsg_pid)),
- /* 6: jeq XX jt 7 jf 8 */
- BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 0, 1),
- /* 7: ret 0 (skip) */
+ /*
+ * 1: Compare to pid
+ */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htonl(pid), 0, 4),
+ /*
+ * 2: Load the nlmsg_type into BPF register
+ */
+ BPF_STMT(BPF_LD | BPF_ABS | BPF_H,
+ offsetof(struct nlmsghdr, nlmsg_type)),
+ /*
+ * 3: Compare to RTM_NEWADDR
+ */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_NEWADDR), 2, 0),
+ /*
+ * 4: Compare to RTM_DELADDR
+ */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(RTM_DELADDR), 1, 0),
+ /*
+ * 5: This is the end state if we want to skip the
+ * message
+ */
BPF_STMT(BPF_RET | BPF_K, 0),
- /* 8: ret 0xffff (keep) */
+ /* 6: This is the end state if we want to keep
+ * the message
+ */
BPF_STMT(BPF_RET | BPF_K, 0xffff),
};
return 0;
}
-int addattr16(struct nlmsghdr *n, unsigned int maxlen, int type, u_int16_t data)
+int addattr16(struct nlmsghdr *n, unsigned int maxlen, int type, uint16_t data)
{
- return addattr_l(n, maxlen, type, &data, sizeof(u_int16_t));
+ return addattr_l(n, maxlen, type, &data, sizeof(uint16_t));
}
int addattr32(struct nlmsghdr *n, unsigned int maxlen, int type, int data)
{
- return addattr_l(n, maxlen, type, &data, sizeof(u_int32_t));
+ return addattr_l(n, maxlen, type, &data, sizeof(uint32_t));
}
struct rtattr *addattr_nest(struct nlmsghdr *n, int maxlen, int type)
int addattr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
{
- nest->rta_len = (u_char *)NLMSG_TAIL(n) - (u_char *)nest;
+ nest->rta_len = (uint8_t *)NLMSG_TAIL(n) - (uint8_t *)nest;
return n->nlmsg_len;
}
int rta_nest_end(struct rtattr *rta, struct rtattr *nest)
{
- nest->rta_len = (u_char *)RTA_TAIL(rta) - (u_char *)nest;
+ nest->rta_len = (uint8_t *)RTA_TAIL(rta) - (uint8_t *)nest;
return rta->rta_len;
}
return lookup_msg(nlmsg_str, msg_type, "");
}
-const char *nl_rtproto_to_str(u_char rtproto)
+const char *nl_rtproto_to_str(uint8_t rtproto)
{
return lookup_msg(rtproto_str, rtproto, "");
}
-const char *nl_family_to_str(u_char family)
+const char *nl_family_to_str(uint8_t family)
{
return lookup_msg(family_str, family, "");
}
-const char *nl_rttype_to_str(u_char rttype)
+const char *nl_rttype_to_str(uint8_t rttype)
{
return lookup_msg(rttype_str, rttype, "");
}
+#define NL_OK(nla, len) \
+ ((len) >= (int)sizeof(struct nlattr) \
+ && (nla)->nla_len >= sizeof(struct nlattr) \
+ && (nla)->nla_len <= (len))
+#define NL_NEXT(nla, attrlen) \
+ ((attrlen) -= RTA_ALIGN((nla)->nla_len), \
+ (struct nlattr *)(((char *)(nla)) + RTA_ALIGN((nla)->nla_len)))
+#define NL_RTA(r) \
+ ((struct nlattr *)(((char *)(r)) \
+ + NLMSG_ALIGN(sizeof(struct nlmsgerr))))
+
+static void netlink_parse_nlattr(struct nlattr **tb, int max,
+ struct nlattr *nla, int len)
+{
+ while (NL_OK(nla, len)) { /* stop at the first attribute that does not fit in len */
+ if (nla->nla_type <= max) /* types above max have no slot in tb and are skipped */
+ tb[nla->nla_type] = nla;
+ nla = NL_NEXT(nla, len); /* NL_NEXT also shrinks len by the aligned attribute size */
+ }
+}
+
+/*
+ * netlink_parse_extended_ack
+ *
+ * Log the extended ACK attributes carried by an NLMSG_ERROR message.
+ *
+ * h -> The NLMSG_ERROR message to inspect.
+ *
+ * The payload of h is a struct nlmsgerr, followed by the payload of
+ * the request being acknowledged unless NLM_F_CAPPED is set, followed
+ * by the extended ACK attributes. Nothing is returned; whatever is
+ * found is only written to the log.
+ */
+static void netlink_parse_extended_ack(struct nlmsghdr *h)
+{
+	struct nlattr *tb[NLMSGERR_ATTR_MAX + 1];
+	const uint32_t hdrlen = NLMSG_ALIGN(sizeof(struct nlmsghdr));
+	const struct nlmsgerr *err =
+		(const struct nlmsgerr *)((uint8_t *)h + hdrlen);
+	const struct nlmsghdr *err_nlh = NULL;
+	/* Offset of the first attribute from the start of the payload */
+	uint32_t hlen = sizeof(*err);
+	uint32_t payload_len;
+	const char *msg = NULL;
+	uint32_t off = 0;
+
+	/* Never read a struct nlmsgerr that is not completely there */
+	if (h->nlmsg_len < (uint32_t)NLMSG_LENGTH(sizeof(*err)))
+		return;
+	payload_len = h->nlmsg_len - hdrlen;
+
+	/*
+	 * Unless the reply was capped, the whole failed request was
+	 * echoed back, so its payload sits between the nlmsgerr and
+	 * the attributes and has to be skipped as well.
+	 */
+	if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
+		uint32_t req_len = err->msg.nlmsg_len;
+
+		if (req_len < hdrlen
+		    || req_len - hdrlen > payload_len - hlen) {
+			zlog_err("%s: echoed request length %u does not fit in the ack",
+				 __PRETTY_FUNCTION__, req_len);
+			return;
+		}
+		hlen += NLMSG_ALIGN(req_len - hdrlen);
+	}
+
+	/* Nothing follows the error header: no attributes to report */
+	if (hlen >= payload_len)
+		return;
+
+	memset(tb, 0, sizeof(tb));
+	netlink_parse_nlattr(tb, NLMSGERR_ATTR_MAX,
+			     (struct nlattr *)((uint8_t *)h + hdrlen + hlen),
+			     payload_len - hlen);
+
+	if (tb[NLMSGERR_ATTR_MSG])
+		msg = (const char *)RTA_DATA(tb[NLMSGERR_ATTR_MSG]);
+
+	/* Only trust the offset when the attribute can hold a uint32_t */
+	if (tb[NLMSGERR_ATTR_OFFS]
+	    && tb[NLMSGERR_ATTR_OFFS]->nla_len
+		       >= RTA_LENGTH(sizeof(uint32_t))) {
+		memcpy(&off, RTA_DATA(tb[NLMSGERR_ATTR_OFFS]), sizeof(off));
+
+		if (off > h->nlmsg_len) {
+			zlog_err("Invalid offset for NLMSGERR_ATTR_OFFS\n");
+			off = 0;
+		} else if (!(h->nlmsg_flags & NLM_F_CAPPED)) {
+			/*
+			 * Header of failed message
+			 * we are not doing anything currently with it
+			 * but noticing it for later.
+			 */
+			err_nlh = &err->msg;
+			zlog_warn("%s: Received %d extended Ack",
+				  __PRETTY_FUNCTION__, err_nlh->nlmsg_type);
+		}
+	}
+
+	if (msg && *msg != '\0') {
+		/* A zero error code means the text is only a warning */
+		bool is_err = !!err->error;
+
+		if (is_err)
+			zlog_err("Extended Error: %s", msg);
+		else
+			zlog_warn("Extended Warning: %s", msg);
+	}
+}
+
/*
* netlink_parse_info
*
* startup -> Are we reading in under startup conditions? passed to
* the filter.
*/
-int netlink_parse_info(int (*filter)(struct sockaddr_nl *, struct nlmsghdr *,
- ns_id_t, int),
+int netlink_parse_info(int (*filter)(struct nlmsghdr *, ns_id_t, int),
struct nlsock *nl, struct zebra_ns *zns, int count,
int startup)
{
int read_in = 0;
while (1) {
- char buf[NL_PKT_BUF_SIZE];
+ char buf[NL_RCV_PKT_BUF_SIZE];
struct iovec iov = {.iov_base = buf, .iov_len = sizeof buf};
struct sockaddr_nl snl;
struct msghdr msg = {.msg_name = (void *)&snl,
int errnum = err->error;
int msg_type = err->msg.nlmsg_type;
+ if (h->nlmsg_len
+ < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
+ zlog_err("%s error: message truncated",
+ nl->name);
+ return -1;
+ }
+
+ /*
+ * Parse the extended information before
+ * we actually handle it.
+ * At this point in time we do not
+ * do anything other than report the
+ * issue.
+ */
+ if (h->nlmsg_flags & NLM_F_ACK_TLVS)
+ netlink_parse_extended_ack(h);
+
/* If the error field is zero, then this is an
* ACK */
if (err->error == 0) {
continue;
}
- if (h->nlmsg_len
- < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
- zlog_err("%s error: message truncated",
- nl->name);
- return -1;
- }
-
/* Deal with errors that occur because of races
* in link handling */
if (nl == &zns->netlink_cmd
h->nlmsg_type, h->nlmsg_len,
h->nlmsg_seq, h->nlmsg_pid);
- /* skip unsolicited messages originating from command
- * socket
- * linux sets the originators port-id for {NEW|DEL}ADDR
- * messages,
- * so this has to be checked here. */
- if (nl != &zns->netlink_cmd
- && h->nlmsg_pid == zns->netlink_cmd.snl.nl_pid
- && (h->nlmsg_type != RTM_NEWADDR
- && h->nlmsg_type != RTM_DELADDR)) {
- if (IS_ZEBRA_DEBUG_KERNEL)
- zlog_debug(
- "netlink_parse_info: %s packet comes from %s",
- zns->netlink_cmd.name,
- nl->name);
+
+ /*
+ * Ignore messages that may be sent from
+ * other actors besides the kernel
+ */
+ if (snl.nl_pid != 0) {
+ zlog_err("Ignoring message from pid %u",
+ snl.nl_pid);
continue;
}
- error = (*filter)(&snl, h, zns->ns_id, startup);
+ error = (*filter)(h, zns->ns_id, startup);
if (error < 0) {
zlog_err("%s filter function error", nl->name);
+ zlog_backtrace(LOG_ERR);
ret = error;
}
}
* startup -> Are we reading in under startup conditions
* This is passed through eventually to filter.
*/
-int netlink_talk(int (*filter)(struct sockaddr_nl *, struct nlmsghdr *, ns_id_t,
- int startup),
+int netlink_talk(int (*filter)(struct nlmsghdr *, ns_id_t, int startup),
struct nlmsghdr *n, struct nlsock *nl, struct zebra_ns *zns,
int startup)
{
void kernel_init(struct zebra_ns *zns)
{
unsigned long groups;
+#if defined SOL_NETLINK
+ int one, ret;
+#endif
- /* Initialize netlink sockets */
- groups = RTMGRP_LINK | RTMGRP_IPV4_ROUTE | RTMGRP_IPV4_IFADDR
- | RTMGRP_IPV6_ROUTE | RTMGRP_IPV6_IFADDR | RTMGRP_IPV4_MROUTE
- | RTMGRP_NEIGH;
+ /*
+ * Initialize netlink sockets
+ *
+ * If RTMGRP_XXX exists use that, but at some point
+ * I think the kernel developers realized that
+ * keeping track of all the different values would
+ * lead to confusion, so for the other groups we need
+ * to convert RTNLGRP_XXX to a bit position ourselves
+ */
+ groups = RTMGRP_LINK |
+ RTMGRP_IPV4_ROUTE |
+ RTMGRP_IPV4_IFADDR |
+ RTMGRP_IPV6_ROUTE |
+ RTMGRP_IPV6_IFADDR |
+ RTMGRP_IPV4_MROUTE |
+ RTMGRP_NEIGH |
+ (1 << (RTNLGRP_IPV4_RULE - 1)) |
+ (1 << (RTNLGRP_IPV6_RULE - 1));
snprintf(zns->netlink.name, sizeof(zns->netlink.name),
"netlink-listen (NS %u)", zns->ns_id);
zns->netlink_cmd.sock = -1;
netlink_socket(&zns->netlink_cmd, 0, zns->ns_id);
+ /*
+ * SOL_NETLINK is not available on all platforms yet
+ * apparently. It's in bits/socket.h which I am not
+ * sure that we want to pull into our build system.
+ */
+#if defined SOL_NETLINK
+ /*
+ * Let's tell the kernel that we want to receive extended
+ * ACKS over our command socket
+ */
+ one = 1;
+ ret = setsockopt(zns->netlink_cmd.sock, SOL_NETLINK, NETLINK_EXT_ACK,
+ &one, sizeof(one));
+
+ if (ret < 0)
+ zlog_notice("Registration for extended ACK failed : %d %s",
+ errno, safe_strerror(errno));
+#endif
+
/* Register kernel socket. */
if (zns->netlink.sock > 0) {
/* Only want non-blocking on the netlink event socket */
netlink_install_filter(zns->netlink.sock,
zns->netlink_cmd.snl.nl_pid);
zns->t_netlink = NULL;
+
thread_add_read(zebrad.master, kernel_read, zns,
zns->netlink.sock, &zns->t_netlink);
}