From 1120a35b121db78ea666749e51eb18e2cd84a529 Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Wed, 1 Apr 2015 16:11:09 -0700 Subject: [PATCH] UBUNTU: SAUCE: fan: tunnel multiple mapping mode (v3) Switch to a single tunnel for all mappings, this removes the limitations on how many mappings each tunnel can handle, and therefore how many Fan slices each local address may hold. NOTE: This introduces a new kernel netlink interface which needs updated iproute2 support. BugLink: http://bugs.launchpad.net/bugs/1470091 Signed-off-by: Jay Vosburgh Signed-off-by: Andy Whitcroft Signed-off-by: Tim Gardner Conflicts: include/net/ip_tunnels.h --- include/net/ip_tunnels.h | 15 +++ include/uapi/linux/if_tunnel.h | 20 ++++ net/ipv4/ip_tunnel.c | 7 +- net/ipv4/ipip.c | 183 ++++++++++++++++++++++++++++++++- 4 files changed, 221 insertions(+), 4 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e893fe43dd13..d3bdecc05b7d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -98,6 +98,19 @@ struct ip_tunnel_prl_entry { }; struct metadata_dst; +/* A fan overlay /8 (250.0.0.0/8, for example) maps to exactly one /16 + * underlay (10.88.0.0/16, for example). Multiple local addresses within + * the /16 may be used, but a particular overlay may not span + * multiple underlay subnets. + * + * We store one underlay, indexed by the overlay's high order octet. + */ +#define FAN_OVERLAY_CNT 256 + +struct ip_tunnel_fan { +/* u32 __rcu *map;*/ + u32 map[FAN_OVERLAY_CNT]; +}; struct ip_tunnel { struct ip_tunnel __rcu *next; @@ -129,6 +142,7 @@ struct ip_tunnel { #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ + struct ip_tunnel_fan fan; unsigned int ip_tnl_net_id; struct gro_cells gro_cells; bool collect_md; @@ -151,6 +165,7 @@ struct ip_tunnel { #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) +#define TUNNEL_FAN __cpu_to_be16(0x4000) struct tnl_ptk_info { __be16 flags; diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 92f3c8677523..423e4fac1974 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -75,6 +75,10 @@ enum { IFLA_IPTUN_ENCAP_SPORT, IFLA_IPTUN_ENCAP_DPORT, IFLA_IPTUN_COLLECT_METADATA, + + __IFLA_IPTUN_VENDOR_BREAK, /* Ensure new entries do not hit the below. */ + IFLA_IPTUN_FAN_MAP = 33, + __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) @@ -151,4 +155,20 @@ enum { }; #define IFLA_VTI_MAX (__IFLA_VTI_MAX - 1) + +enum { + IFLA_FAN_UNSPEC, + IFLA_FAN_MAPPING, + __IFLA_FAN_MAX, +}; + +#define IFLA_FAN_MAX (__IFLA_FAN_MAX - 1) + +struct ip_tunnel_fan_map { + __be32 underlay; + __be32 overlay; + __u16 underlay_prefix; + __u16 overlay_prefix; +}; + #endif /* _UAPI_IF_TUNNEL_H_ */ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 823abaef006b..1726fb4291e5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1104,6 +1104,11 @@ out: } EXPORT_SYMBOL_GPL(ip_tunnel_newlink); +static int ip_tunnel_is_fan(struct ip_tunnel *tunnel) +{ + return tunnel->parms.i_flags & TUNNEL_FAN; +} + int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p) { @@ -1113,7 +1118,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == itn->fb_tunnel_dev) - return -EINVAL; + return ip_tunnel_is_fan(tunnel) ? 0 : -EINVAL; t = ip_tunnel_find(itn, p, dev->type); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 00d4229b6954..bcfa202f5863 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -106,6 +106,7 @@ #include #include #include +#include #include #include @@ -245,6 +246,40 @@ static int mplsip_rcv(struct sk_buff *skb) } #endif +static int ipip_tunnel_is_fan(struct ip_tunnel *tunnel) +{ + return tunnel->parms.i_flags & TUNNEL_FAN; +} + +/* + * Determine fan tunnel endpoint to send packet to, based on the inner IP + * address. For an overlay (inner) address Y.A.B.C, the transformation is + * F.G.A.B, where "F" and "G" are the first two octets of the underlay + * network (the network portion of a /16), "A" and "B" are the low order + * two octets of the underlay network host (the host portion of a /16), + * and "Y" is a configured first octet of the overlay network. + * + * E.g., underlay host 10.88.3.4 with an overlay of 99 would host overlay + * subnet 99.3.4.0/24. An overlay network datagram from 99.3.4.5 to + * 99.6.7.8, would be directed to underlay host 10.88.6.7, which hosts + * overlay network 99.6.7.0/24. + */ +static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph) +{ + unsigned int overlay; + u32 daddr, underlay; + + daddr = ntohl(ip_hdr(skb)->daddr); + overlay = daddr >> 24; + underlay = tunnel->fan.map[overlay]; + if (!underlay) + return -EINVAL; + + *iph = tunnel->parms.iph; + iph->daddr = htonl(underlay | ((daddr >> 8) & 0x0000ffff)); + return 0; +} + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. @@ -255,6 +290,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; u8 ipproto; + struct iphdr fiph; switch (skb->protocol) { case htons(ETH_P_IP): @@ -275,6 +311,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; + if (ipip_tunnel_is_fan(tunnel)) { + if (ipip_build_fan_iphdr(tunnel, skb, &fiph)) + goto tx_error; + tiph = &fiph; + } else { + tiph = &tunnel->parms.iph; + } + skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) @@ -464,21 +508,89 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[], return ret; } +static void ipip_fan_free_map(struct ip_tunnel *t) +{ + memset(&t->fan.map, 0, sizeof(t->fan.map)); +} + +static int ipip_fan_set_map(struct ip_tunnel *t, struct ip_tunnel_fan_map *map) +{ + u32 overlay, overlay_mask, underlay, underlay_mask; + + if ((map->underlay_prefix && map->underlay_prefix != 16) || + (map->overlay_prefix && map->overlay_prefix != 8)) + return -EINVAL; + + overlay = ntohl(map->overlay); + overlay_mask = ntohl(inet_make_mask(map->overlay_prefix)); + + underlay = ntohl(map->underlay); + underlay_mask = ntohl(inet_make_mask(map->underlay_prefix)); + + if ((overlay & ~overlay_mask) || (underlay & ~underlay_mask)) + return -EINVAL; + + if (!(overlay & overlay_mask) && (underlay & underlay_mask)) + return -EINVAL; + + t->parms.i_flags |= TUNNEL_FAN; + + /* Special case: overlay 0 and underlay 0 clears all mappings */ + if (!overlay && !underlay) { + ipip_fan_free_map(t); + return 0; + } + + overlay >>= (32 - map->overlay_prefix); + t->fan.map[overlay] = underlay; + + return 0; +} + + +static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t, + struct ip_tunnel_parm *parms) +{ + struct ip_tunnel_fan_map *map; + struct nlattr *attr; + int rem, rv; + + if (!data[IFLA_IPTUN_FAN_MAP]) + return 0; + + if (parms->iph.daddr) + return -EINVAL; + + nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) { + map = nla_data(attr); + rv = ipip_fan_set_map(t, map); + if (rv) + return rv; + } + + return 0; +} + static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel *t = netdev_priv(dev); + int err; if (ipip_netlink_encap_parms(data, &ipencap)) { - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &t->collect_md); + err = ipip_netlink_fan(data, t, &p); + if (err < 0) + return err; return ip_tunnel_newlink(dev, tb, &p); } @@ -488,10 +600,11 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; bool collect_md; + struct ip_tunnel *t = netdev_priv(dev); + int err; if (ipip_netlink_encap_parms(data, &ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; @@ -500,6 +613,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], ipip_netlink_parms(data, &p, &collect_md); if (collect_md) return -EINVAL; + err = ipip_netlink_fan(data, t, &p); + if (err < 0) + return err; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) @@ -535,6 +651,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_IPTUN_FAN_MAP */ + nla_total_size(sizeof(struct ip_tunnel_fan_map)) * 256 + 0; } @@ -566,6 +684,29 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) if (tunnel->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; + if (tunnel->parms.i_flags & TUNNEL_FAN) { + struct nlattr *fan_nest; + int i; + + fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP); + if (!fan_nest) + goto nla_put_failure; + for (i = 0; i < 256; i++) { + if (tunnel->fan.map[i]) { + struct ip_tunnel_fan_map map; + + map.underlay = htonl(tunnel->fan.map[i]); + map.underlay_prefix = 16; + map.overlay = htonl(i << 24); + map.overlay_prefix = 8; + if (nla_put(skb, IFLA_FAN_MAPPING, + sizeof(map), &map)) + goto nla_put_failure; + } + } + nla_nest_end(skb, fan_nest); + } + return 0; nla_put_failure: @@ -585,6 +726,9 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, + + [__IFLA_IPTUN_VENDOR_BREAK ... IFLA_IPTUN_MAX] = { .type = NLA_BINARY }, + [IFLA_IPTUN_FAN_MAP] = { .type = NLA_NESTED }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { @@ -634,6 +778,23 @@ static struct pernet_operations ipip_net_ops = { .size = sizeof(struct ip_tunnel_net), }; +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *ipip_fan_header; +static unsigned int ipip_fan_version = 3; + +static struct ctl_table ipip_fan_sysctls[] = { + { + .procname = "version", + .data = &ipip_fan_version, + .maxlen = sizeof(ipip_fan_version), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + {}, +}; + +#endif /* CONFIG_SYSCTL */ + static int __init ipip_init(void) { int err; @@ -659,9 +820,22 @@ static int __init ipip_init(void) if (err < 0) goto rtnl_link_failed; +#ifdef CONFIG_SYSCTL + ipip_fan_header = register_net_sysctl(&init_net, "net/fan", + ipip_fan_sysctls); + if (!ipip_fan_header) { + err = -ENOMEM; + goto sysctl_failed; + } +#endif /* CONFIG_SYSCTL */ + out: return err; +#ifdef CONFIG_SYSCTL +sysctl_failed: + rtnl_link_unregister(&ipip_link_ops); +#endif /* CONFIG_SYSCTL */ rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_INET); @@ -676,6 +850,9 @@ xfrm_tunnel_ipip_failed: static void __exit ipip_fini(void) { +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(ipip_fan_header); +#endif /* CONFIG_SYSCTL */ rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); -- 2.39.5