net/ipv4/ipip.c

   1 /*
   2  *      Linux NET3:     IP/IP protocol decoder.
   3  *
   4  *      Authors:
   5  *              Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
   6  *
   7  *      Fixes:
   8  *              Alan Cox        :       Merged and made usable non modular (its so tiny its silly as
   9  *                                      a module taking up 2 pages).
  10  *              Alan Cox        :       Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
  11  *                                      to keep ip_forward happy.
  12  *              Alan Cox        :       More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
  13  *              Kai Schulte     :       Fixed #defines for IP_FIREWALL->FIREWALL
  14  *              David Woodhouse :       Perform some basic ICMP handling.
  15  *                                      IPIP Routing without decapsulation.
  16  *              Carlos Picoto   :       GRE over IP support
  17  *              Alexey Kuznetsov:       Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
  18  *                                      I do not want to merge them together.
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  *
  25  */
  26
  27 /* tunnel.c: an IP tunnel driver
  28
  29         The purpose of this driver is to provide an IP tunnel through
  30         which you can tunnel network traffic transparently across subnets.
  31
  32         This was written by looking at Nick Holloway's dummy driver
  33         Thanks for the great code!
  34
  35                 -Sam Lantinga   (slouken@cs.ucdavis.edu)  02/01/95
  36
  37         Minor tweaks:
  38                 Cleaned up the code a little and added some pre-1.3.0 tweaks.
  39                 dev->hard_header/hard_header_len changed to use no headers.
  40                 Comments/bracketing tweaked.
  41                 Made the tunnels use dev->name not tunnel: when error reporting.
  42                 Added tx_dropped stat
  43
  44                 -Alan Cox       (alan@lxorguk.ukuu.org.uk) 21 March 95
  45
  46         Reworked:
  47                 Changed to tunnel to destination gateway in addition to the
  48                         tunnel's pointopoint address
  49                 Almost completely rewritten
  50                 Note:  There is currently no firewall or ICMP handling done.
  51
  52                 -Sam Lantinga   (slouken@cs.ucdavis.edu) 02/13/96
  53
  54 */
  55
  56 /* Things I wish I had known when writing the tunnel driver:
  57
  58         When the tunnel_xmit() function is called, the skb contains the
  59         packet to be sent (plus a great deal of extra info), and dev
  60         contains the tunnel device that _we_ are.
  61
  62         When we are passed a packet, we are expected to fill in the
  63         source address with our source IP address.
  64
  65         What is the proper way to allocate, copy and free a buffer?
  66         After you allocate it, it is a "0 length" chunk of memory
  67         starting at zero.  If you want to add headers to the buffer
  68         later, you'll have to call "skb_reserve(skb, amount)" with
  69         the amount of memory you want reserved.  Then, you call
  70         "skb_put(skb, amount)" with the amount of space you want in
  71         the buffer.  skb_put() returns a pointer to the top (#0) of
  72         that buffer.  skb->len is set to the amount of space you have
  73         "allocated" with skb_put().  You can then write up to skb->len
  74         bytes to that buffer.  If you need more, you can call skb_put()
  75         again with the additional amount of space you need.  You can
  76         find out how much more space you can allocate by calling
  77         "skb_tailroom(skb)".
  78         Now, to add header space, call "skb_push(skb, header_len)".
  79         This creates space at the beginning of the buffer and returns
  80         a pointer to this new space.  If later you need to strip a
  81         header from a buffer, call "skb_pull(skb, header_len)".
  82         skb_headroom() will return how much space is left at the top
  83         of the buffer (before the main data).  Remember, this headroom
  84         space must be reserved before the skb_put() function is called.
  85         */
  86
  87 /*
  88    This version of net/ipv4/ipip.c is a clone of net/ipv4/ip_gre.c
  89
  90    For comments look at net/ipv4/ip_gre.c --ANK
  91  */
  92
  93
  94 #include <linux/capability.h>
  95 #include <linux/module.h>
  96 #include <linux/types.h>
  97 #include <linux/kernel.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <linux/in.h>
 103 #include <linux/tcp.h>
 104 #include <linux/udp.h>
 105 #include <linux/if_arp.h>
 106 #include <linux/init.h>
 107 #include <linux/netfilter_ipv4.h>
 108 #include <linux/if_ether.h>
 109 #include <linux/inetdevice.h>
 110 #include <linux/rculist.h>
 111
 112 #include <net/sock.h>
 113 #include <net/ip.h>
 114 #include <net/icmp.h>
 115 #include <net/ip_tunnels.h>
 116 #include <net/inet_ecn.h>
 117 #include <net/xfrm.h>
 118 #include <net/net_namespace.h>
 119 #include <net/netns/generic.h>
 120 #include <net/dst_metadata.h>
 121
 122 static bool log_ecn_error = true;
 123 module_param(log_ecn_error, bool, 0644);
 124 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 125
 126 static unsigned int ipip_net_id __read_mostly;
 127
 128 static int ipip_tunnel_init(struct net_device *dev);
 129 static struct rtnl_link_ops ipip_link_ops __read_mostly;
 130
 131 static int ipip_err(struct sk_buff *skb, u32 info)
 132 {
 133         /* All the routers (except for Linux) return only
 134          * 8 bytes of packet payload. It means, that precise relaying of
 135          * ICMP in the real Internet is absolutely infeasible.
 136          */
 137         struct net *net = dev_net(skb->dev);
 138         struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
 139         const struct iphdr *iph = (const struct iphdr *)skb->data;
 140         const int type = icmp_hdr(skb)->type;
 141         const int code = icmp_hdr(skb)->code;
 142         struct ip_tunnel *t;
 143         int err = 0;
 144
 145         switch (type) {
 146         case ICMP_DEST_UNREACH:
 147                 switch (code) {
 148                 case ICMP_SR_FAILED:
 149                         /* Impossible event. */
 150                         goto out;
 151                 default:
 152                         /* All others are translated to HOST_UNREACH.
 153                          * rfc2003 contains "deep thoughts" about NET_UNREACH,
 154                          * I believe they are just ether pollution. --ANK
 155                          */
 156                         break;
 157                 }
 158                 break;
 159
 160         case ICMP_TIME_EXCEEDED:
 161                 if (code != ICMP_EXC_TTL)
 162                         goto out;
 163                 break;
 164
 165         case ICMP_REDIRECT:
 166                 break;
 167
 168         default:
 169                 goto out;
 170         }
 171
 172         t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 173                              iph->daddr, iph->saddr, 0);
 174         if (!t) {
 175                 err = -ENOENT;
 176                 goto out;
 177         }
 178
 179         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 180                 ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
 181                                  iph->protocol, 0);
 182                 goto out;
 183         }
 184
 185         if (type == ICMP_REDIRECT) {
 186                 ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
 187                 goto out;
 188         }
 189
 190         if (t->parms.iph.daddr == 0) {
 191                 err = -ENOENT;
 192                 goto out;
 193         }
 194
 195         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 196                 goto out;
 197
 198         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 199                 t->err_count++;
 200         else
 201                 t->err_count = 1;
 202         t->err_time = jiffies;
 203
 204 out:
 205         return err;
 206 }
 207
 208 static const struct tnl_ptk_info ipip_tpi = {
 209         /* no tunnel info required for ipip. */
 210         .proto = htons(ETH_P_IP),
 211 };
 212
 213 #if IS_ENABLED(CONFIG_MPLS)
 214 static const struct tnl_ptk_info mplsip_tpi = {
 215         /* no tunnel info required for mplsip. */
 216         .proto = htons(ETH_P_MPLS_UC),
 217 };
 218 #endif
 219
 220 static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 221 {
 222         struct net *net = dev_net(skb->dev);
 223         struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
 224         struct metadata_dst *tun_dst = NULL;
 225         struct ip_tunnel *tunnel;
 226         const struct iphdr *iph;
 227
 228         iph = ip_hdr(skb);
 229         tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 230                         iph->saddr, iph->daddr, 0);
 231         if (tunnel) {
 232                 const struct tnl_ptk_info *tpi;
 233
 234                 if (tunnel->parms.iph.protocol != ipproto &&
 235                     tunnel->parms.iph.protocol != 0)
 236                         goto drop;
 237
 238                 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 239                         goto drop;
 240 #if IS_ENABLED(CONFIG_MPLS)
 241                 if (ipproto == IPPROTO_MPLS)
 242                         tpi = &mplsip_tpi;
 243                 else
 244 #endif
 245                         tpi = &ipip_tpi;
 246                 if (iptunnel_pull_header(skb, 0, tpi->proto, false))
 247                         goto drop;
 248                 if (tunnel->collect_md) {
 249                         tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
 250                         if (!tun_dst)
 251                                 return 0;
 252                 }
 253                 return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 254         }
 255
 256         return -1;
 257
 258 drop:
 259         kfree_skb(skb);
 260         return 0;
 261 }
 262
 263 static int ipip_rcv(struct sk_buff *skb)
 264 {
 265         return ipip_tunnel_rcv(skb, IPPROTO_IPIP);
 266 }
 267
 268 #if IS_ENABLED(CONFIG_MPLS)
 269 static int mplsip_rcv(struct sk_buff *skb)
 270 {
 271         return ipip_tunnel_rcv(skb, IPPROTO_MPLS);
 272 }
 273 #endif
 274
 275 static struct ip_fan_map *ipip_fan_find_map(struct ip_tunnel *t, __be32 daddr)
 276 {
 277         struct ip_fan_map *fan_map;
 278
 279         rcu_read_lock();
 280         list_for_each_entry_rcu(fan_map, &t->fan.fan_maps, list) {
 281                 if (fan_map->overlay ==
 282                     (daddr & inet_make_mask(fan_map->overlay_prefix))) {
 283                         rcu_read_unlock();
 284                         return fan_map;
 285                 }
 286         }
 287         rcu_read_unlock();
 288
 289         return NULL;
 290 }
 291
 292 /* Determine fan tunnel endpoint to send packet to, based on the inner IP
 293  * address.
 294  *
 295  * Given a /8 overlay and /16 underlay, for an overlay (inner) address
 296  * Y.A.B.C, the transformation is F.G.A.B, where "F" and "G" are the first
 297  * two octets of the underlay network (the network portion of a /16), "A"
 298  * and "B" are the low order two octets of the underlay network host (the
 299  * host portion of a /16), and "Y" is a configured first octet of the
 300  * overlay network.
 301  *
 302  * E.g., underlay host 10.88.3.4/16 with an overlay of 99.0.0.0/8 would
 303  * host overlay subnet 99.3.4.0/24.  An overlay network datagram from
 304  * 99.3.4.5 to 99.6.7.8, would be directed to underlay host 10.88.6.7,
 305  * which hosts overlay network subnet 99.6.7.0/24.  This transformation is
 306  * described in detail further below.
 307  *
 308  * Using netmasks for the overlay and underlay other than /8 and /16, as
 309  * shown above, can yield larger (or smaller) overlay subnets, with the
 310  * trade-off of allowing fewer (or more) underlay hosts to participate.
 311  *
 312  * The size of each overlay network subnet is defined by the total of the
 313  * network mask of the overlay plus the size of host portion of the
 314  * underlay network. In the above example, /8 + /16 = /24.
 315  *
 316  * E.g., consider underlay host 10.99.238.5/20 and overlay 99.0.0.0/8. In
 317  * this case, the network portion of the underlay is 10.99.224.0/20, and
 318  * the host portion is 0.0.14.5 (12 bits).  To determine the overlay
 319  * network subnet, the 12 bits of host portion are left shifted 12 bits
 320  * (/20 - /8) and ORed with the overlay subnet prefix.  This yields an
 321  * overlay subnet of 99.224.80.0/20, composed of 8 bits overlay, followed by
 322  * 12 bits underlay.  This yields 12 bits in the overlay network portion,
 323  * allowing for 4094 addresses in each overlay network subnet.  The
 324  * trade-off is that fewer hosts may participate in the underlay network,
 325  * as its host address size has shrunk from 16 bits (65534 addresses) in
 326  * the first example to 12 bits (4094 addresses) here.
 327  *
 328  * For fewer hosts per overlay subnet (permitting a larger number of
 329  * underlay hosts to participate), the underlay netmask may be made
 330  * smaller.
 331  *
 332  * E.g., underlay host 10.111.1.2/12 (network 10.96.0.0/12, host portion
 333  * is 0.15.1.2, 20 bits) with an overlay of 33.0.0.0/8 would left shift
 334  * the 20 bits of host by 4 (so that its highest order bit is adjacent to
 335  * the lowest order bit of the /8 overlay).  This yields an overlay subnet
 336  * of 33.240.16.32/28 (8 bits overlay, 20 bits from the host portion of
 337  * the underlay).  This provides more addresses for the underlay network
 338  * (approximately 2^20), but each host's segment of the overlay provides
 339  * only 4 bits of addresses (14 usable).
 340  *
 341  * It is also possible to adjust the overlay subnet.
 342  *
 343  * For an overlay of 240.0.0.0/5 and underlay of 10.88.0.0/20, consider
 344  * underlay host 10.88.129.2; the 12 bits of host, 0.0.1.2, are left
 345  * shifted 15 bits (/20 - /5), yielding an overlay network of
 346  * 240.129.0.0/17.  An underlay host of 10.88.244.215 would yield an
 347  * overlay network of 242.107.128.0/17.
 348  *
 349  * For an overlay of 100.64.0.0/10 and underlay of 10.224.220.0/24, for
 350  * underlay host 10.224.220.10, the underlay host portion (.10) is left
 351  * shifted 14 bits, yielding an overlay network subnet of 100.66.128.0/18.
 352  * This would permit 254 addresses on the underlay, with each overlay
 353  * segment providing approximately 2^14 - 2 addresses (16382).
 354  *
 355  * For packets being encapsulated, the overlay network destination IP
 356  * address is deconstructed into its overlay and underlay-derived
 357  * portions.  The underlay portion (determined by the overlay mask and
 358  * overlay subnet mask) is right shifted according to the size of the
 359  * underlay network mask.  This value is then ORed with the network
 360  * portion of the underlay network to produce the underlay network
 361  * destination for the encapsulated datagram.
 362  *
 363  * For example, using the initial example of underlay 10.88.3.4/16 and
 364  * overlay 99.0.0.0/8, with underlay host 10.88.3.4/16 providing overlay
 365  * subnet 99.3.4.0/24 with specific host 99.3.4.5.  A datagram from
 366  * 99.3.4.5 to 99.6.7.8 would first have the underlay host derived portion
 367  * of the address extracted.  This is a number of bits equal to underlay
 368  * network host portion.  In the destination address, the highest order of
 369  * these bits is one bit lower than the lowest order bit from the overlay
 370  * network mask.
 371  *
 372  * Using the sample value, 99.6.7.8, the overlay mask is /8, and the
 373  * underlay mask is /16 (leaving 16 bits for the host portion).  The bits
 374  * to be shifted are the middle two octets, 0.6.7.0, as this is 99.6.7.8
 375  * ANDed with the mask 0x00ffff00 (which is 16 bits, the highest order of
 376  * which is 1 bit lower than the lowest order overlay address bit).
 377  *
 378  * These octets, 0.6.7.0, are then right shifted 8 bits, yielding 0.0.6.7.
 379  * This value is then ORed with the underlay network portion,
 380  * 10.88.0.0/16, providing 10.88.6.7 as the final underlay destination for
 381  * the encapsulated datagram.
 382  *
 383  * Another transform using the final example: overlay 100.64.0.0/10 and
 384  * underlay 10.224.220.0/24.  Consider overlay address 100.66.128.1
 385  * sending a datagram to 100.66.200.5.  In this case, 8 bits (the host
 386  * portion size of 10.224.220.0/24) beginning after the 100.64/10 overlay
 387  * prefix are masked off, yielding 0.2.192.0.  This is right shifted 14
 388  * (32 - 10 - (32 - 24), i.e., the number of bits between the overlay
 389  * network portion and the underlay host portion) bits, yielding 0.0.0.11.
 390  * This is ORed with the underlay network portion, 10.224.220.0/24, giving
 391  * the underlay destination of 10.224.220.11 for overlay destination
 392  * 100.66.200.5.
 393  */
 394 static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph)
 395 {
 396         struct ip_fan_map *f_map;
 397         u32 daddr, underlay;
 398
 399         f_map = ipip_fan_find_map(tunnel, ip_hdr(skb)->daddr);
 400         if (!f_map)
 401                 return -ENOENT;
 402
 403         daddr = ntohl(ip_hdr(skb)->daddr);
 404         underlay = ntohl(f_map->underlay);
 405         if (!underlay)
 406                 return -EINVAL;
 407
 408         *iph = tunnel->parms.iph;
 409         iph->daddr = htonl(underlay |
 410                            ((daddr & ~f_map->overlay_mask) >>
 411                             (32 - f_map->overlay_prefix -
 412                              (32 - f_map->underlay_prefix))));
 413         return 0;
 414 }
 415
 416 /*
 417  *      This function assumes it is being called from dev_queue_xmit()
 418  *      and that skb is filled properly by that function.
 419  */
 420 static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 421                                     struct net_device *dev)
 422 {
 423         struct ip_tunnel *tunnel = netdev_priv(dev);
 424         const struct iphdr  *tiph = &tunnel->parms.iph;
 425         u8 ipproto;
 426         struct iphdr fiph;
 427
 428         if (!pskb_inet_may_pull(skb))
 429                 goto tx_error;
 430
 431         switch (skb->protocol) {
 432         case htons(ETH_P_IP):
 433                 ipproto = IPPROTO_IPIP;
 434                 break;
 435 #if IS_ENABLED(CONFIG_MPLS)
 436         case htons(ETH_P_MPLS_UC):
 437                 ipproto = IPPROTO_MPLS;
 438                 break;
 439 #endif
 440         default:
 441                 goto tx_error;
 442         }
 443
 444         if (tiph->protocol != ipproto && tiph->protocol != 0)
 445                 goto tx_error;
 446
 447         if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
 448                 goto tx_error;
 449
 450         if (fan_has_map(&tunnel->fan)) {
 451                 if (ipip_build_fan_iphdr(tunnel, skb, &fiph))
 452                         goto tx_error;
 453                 tiph = &fiph;
 454         } else {
 455                 tiph = &tunnel->parms.iph;
 456         }
 457
 458         skb_set_inner_ipproto(skb, ipproto);
 459
 460         if (tunnel->collect_md)
 461                 ip_md_tunnel_xmit(skb, dev, ipproto);
 462         else
 463                 ip_tunnel_xmit(skb, dev, tiph, ipproto);
 464         return NETDEV_TX_OK;
 465
 466 tx_error:
 467         kfree_skb(skb);
 468
 469         dev->stats.tx_errors++;
 470         return NETDEV_TX_OK;
 471 }
 472
 473 static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto)
 474 {
 475         switch (ipproto) {
 476         case 0:
 477         case IPPROTO_IPIP:
 478 #if IS_ENABLED(CONFIG_MPLS)
 479         case IPPROTO_MPLS:
 480 #endif
 481                 return true;
 482         }
 483
 484         return false;
 485 }
 486
 487 static int
 488 ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 489 {
 490         int err = 0;
 491         struct ip_tunnel_parm p;
 492
 493         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 494                 return -EFAULT;
 495
 496         if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 497                 if (p.iph.version != 4 ||
 498                     !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) ||
 499                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
 500                         return -EINVAL;
 501         }
 502
 503         p.i_key = p.o_key = 0;
 504         p.i_flags = p.o_flags = 0;
 505         err = ip_tunnel_ioctl(dev, &p, cmd);
 506         if (err)
 507                 return err;
 508
 509         if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 510                 return -EFAULT;
 511
 512         return 0;
 513 }
 514
 515 static const struct net_device_ops ipip_netdev_ops = {
 516         .ndo_init       = ipip_tunnel_init,
 517         .ndo_uninit     = ip_tunnel_uninit,
 518         .ndo_start_xmit = ipip_tunnel_xmit,
 519         .ndo_do_ioctl   = ipip_tunnel_ioctl,
 520         .ndo_change_mtu = ip_tunnel_change_mtu,
 521         .ndo_get_stats64 = ip_tunnel_get_stats64,
 522         .ndo_get_iflink = ip_tunnel_get_iflink,
 523 };
 524
 525 #define IPIP_FEATURES (NETIF_F_SG |             \
 526                        NETIF_F_FRAGLIST |       \
 527                        NETIF_F_HIGHDMA |        \
 528                        NETIF_F_GSO_SOFTWARE |   \
 529                        NETIF_F_HW_CSUM)
 530
 531 static void ipip_tunnel_setup(struct net_device *dev)
 532 {
 533         struct ip_tunnel *t = netdev_priv(dev);
 534
 535         dev->netdev_ops         = &ipip_netdev_ops;
 536
 537         dev->type               = ARPHRD_TUNNEL;
 538         dev->flags              = IFF_NOARP;
 539         dev->addr_len           = 4;
 540         dev->features           |= NETIF_F_LLTX;
 541         netif_keep_dst(dev);
 542
 543         dev->features           |= IPIP_FEATURES;
 544         dev->hw_features        |= IPIP_FEATURES;
 545         ip_tunnel_setup(dev, ipip_net_id);
 546         INIT_LIST_HEAD(&t->fan.fan_maps);
 547 }
 548
 549 static int ipip_tunnel_init(struct net_device *dev)
 550 {
 551         struct ip_tunnel *tunnel = netdev_priv(dev);
 552
 553         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
 554         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 555
 556         tunnel->tun_hlen = 0;
 557         tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
 558         return ip_tunnel_init(dev);
 559 }
 560
 561 static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
 562                                 struct netlink_ext_ack *extack)
 563 {
 564         u8 proto;
 565
 566         if (!data || !data[IFLA_IPTUN_PROTO])
 567                 return 0;
 568
 569         proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
 570         if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0)
 571                 return -EINVAL;
 572
 573         return 0;
 574 }
 575
 576 static void ipip_netlink_parms(struct nlattr *data[],
 577                                struct ip_tunnel_parm *parms, bool *collect_md,
 578                                __u32 *fwmark)
 579 {
 580         memset(parms, 0, sizeof(*parms));
 581
 582         parms->iph.version = 4;
 583         parms->iph.protocol = IPPROTO_IPIP;
 584         parms->iph.ihl = 5;
 585         *collect_md = false;
 586
 587         if (!data)
 588                 return;
 589
 590         if (data[IFLA_IPTUN_LINK])
 591                 parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
 592
 593         if (data[IFLA_IPTUN_LOCAL])
 594                 parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
 595
 596         if (data[IFLA_IPTUN_REMOTE])
 597                 parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);
 598
 599         if (data[IFLA_IPTUN_TTL]) {
 600                 parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
 601                 if (parms->iph.ttl)
 602                         parms->iph.frag_off = htons(IP_DF);
 603         }
 604
 605         if (data[IFLA_IPTUN_TOS])
 606                 parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
 607
 608         if (data[IFLA_IPTUN_PROTO])
 609                 parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
 610
 611         if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
 612                 parms->iph.frag_off = htons(IP_DF);
 613
 614         if (data[IFLA_IPTUN_COLLECT_METADATA])
 615                 *collect_md = true;
 616
 617         if (data[IFLA_IPTUN_FWMARK])
 618                 *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
 619 }
 620
 621 /* This function returns true when ENCAP attributes are present in the nl msg */
 622 static bool ipip_netlink_encap_parms(struct nlattr *data[],
 623                                      struct ip_tunnel_encap *ipencap)
 624 {
 625         bool ret = false;
 626
 627         memset(ipencap, 0, sizeof(*ipencap));
 628
 629         if (!data)
 630                 return ret;
 631
 632         if (data[IFLA_IPTUN_ENCAP_TYPE]) {
 633                 ret = true;
 634                 ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
 635         }
 636
 637         if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
 638                 ret = true;
 639                 ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
 640         }
 641
 642         if (data[IFLA_IPTUN_ENCAP_SPORT]) {
 643                 ret = true;
 644                 ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
 645         }
 646
 647         if (data[IFLA_IPTUN_ENCAP_DPORT]) {
 648                 ret = true;
 649                 ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
 650         }
 651
 652         return ret;
 653 }
 654
 655 static void ipip_fan_flush_map(struct ip_tunnel *t)
 656 {
 657         struct ip_fan_map *fan_map;
 658
 659         list_for_each_entry_rcu(fan_map, &t->fan.fan_maps, list) {
 660                 list_del_rcu(&fan_map->list);
 661                 kfree_rcu(fan_map, rcu);
 662         }
 663 }
 664
 665 static int ipip_fan_del_map(struct ip_tunnel *t, __be32 overlay)
 666 {
 667         struct ip_fan_map *fan_map;
 668
 669         fan_map = ipip_fan_find_map(t, overlay);
 670         if (!fan_map)
 671                 return -ENOENT;
 672
 673         list_del_rcu(&fan_map->list);
 674         kfree_rcu(fan_map, rcu);
 675
 676         return 0;
 677 }
 678
 679 static int ipip_fan_add_map(struct ip_tunnel *t, struct ifla_fan_map *map)
 680 {
 681         __be32 overlay_mask, underlay_mask;
 682         struct ip_fan_map *fan_map;
 683
 684         overlay_mask = inet_make_mask(map->overlay_prefix);
 685         underlay_mask = inet_make_mask(map->underlay_prefix);
 686
 687         if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask))
 688                 return -EINVAL;
 689
 690         if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask))
 691                 return -EINVAL;
 692
 693         /* Special case: overlay 0 and underlay 0: flush all mappings */
 694         if (!map->overlay && !map->underlay) {
 695                 ipip_fan_flush_map(t);
 696                 return 0;
 697         }
 698
 699         /* Special case: overlay set and underlay 0: clear map for overlay */
 700         if (!map->underlay)
 701                 return ipip_fan_del_map(t, map->overlay);
 702
 703         if (ipip_fan_find_map(t, map->overlay))
 704                 return -EEXIST;
 705
 706         fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL);
 707         fan_map->underlay = map->underlay;
 708         fan_map->overlay = map->overlay;
 709         fan_map->underlay_prefix = map->underlay_prefix;
 710         fan_map->overlay_mask = ntohl(overlay_mask);
 711         fan_map->overlay_prefix = map->overlay_prefix;
 712
 713         list_add_tail_rcu(&fan_map->list, &t->fan.fan_maps);
 714
 715         return 0;
 716 }
 717
 718
 719 static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t,
 720                             struct ip_tunnel_parm *parms)
 721 {
 722         struct ifla_fan_map *map;
 723         struct nlattr *attr;
 724         int rem, rv;
 725
 726         if (data == NULL || !data[IFLA_IPTUN_FAN_MAP])
 727                 return 0;
 728
 729         if (parms->iph.daddr)
 730                 return -EINVAL;
 731
 732         nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) {
 733                 map = nla_data(attr);
 734                 rv = ipip_fan_add_map(t, map);
 735                 if (rv)
 736                         return rv;
 737         }
 738
 739         return 0;
 740 }
 741
 742 static int ipip_newlink(struct net *src_net, struct net_device *dev,
 743                         struct nlattr *tb[], struct nlattr *data[],
 744                         struct netlink_ext_ack *extack)
 745 {
 746         struct ip_tunnel *t = netdev_priv(dev);
 747         struct ip_tunnel_parm p;
 748         struct ip_tunnel_encap ipencap;
 749         __u32 fwmark = 0;
 750         int err;
 751
 752         if (ipip_netlink_encap_parms(data, &ipencap)) {
 753                 err = ip_tunnel_encap_setup(t, &ipencap);
 754
 755                 if (err < 0)
 756                         return err;
 757         }
 758
 759         ipip_netlink_parms(data, &p, &t->collect_md, &fwmark);
 760         err = ipip_netlink_fan(data, t, &p);
 761         if (err < 0)
 762                 return err;
 763         return ip_tunnel_newlink(dev, tb, &p, fwmark);
 764 }
 765
 766 static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 767                            struct nlattr *data[],
 768                            struct netlink_ext_ack *extack)
 769 {
 770         struct ip_tunnel *t = netdev_priv(dev);
 771         struct ip_tunnel_parm p;
 772         struct ip_tunnel_encap ipencap;
 773         bool collect_md;
 774         __u32 fwmark = t->fwmark;
 775         int err;
 776
 777         if (ipip_netlink_encap_parms(data, &ipencap)) {
 778                 err = ip_tunnel_encap_setup(t, &ipencap);
 779
 780                 if (err < 0)
 781                         return err;
 782         }
 783
 784         ipip_netlink_parms(data, &p, &collect_md, &fwmark);
 785         if (collect_md)
 786                 return -EINVAL;
 787         err = ipip_netlink_fan(data, t, &p);
 788         if (err < 0)
 789                 return err;
 790
 791         if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
 792             (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
 793                 return -EINVAL;
 794
 795         return ip_tunnel_changelink(dev, tb, &p, fwmark);
 796 }
 797
 798 static size_t ipip_get_size(const struct net_device *dev)
 799 {
 800         return
 801                 /* IFLA_IPTUN_LINK */
 802                 nla_total_size(4) +
 803                 /* IFLA_IPTUN_LOCAL */
 804                 nla_total_size(4) +
 805                 /* IFLA_IPTUN_REMOTE */
 806                 nla_total_size(4) +
 807                 /* IFLA_IPTUN_TTL */
 808                 nla_total_size(1) +
 809                 /* IFLA_IPTUN_TOS */
 810                 nla_total_size(1) +
 811                 /* IFLA_IPTUN_PROTO */
 812                 nla_total_size(1) +
 813                 /* IFLA_IPTUN_PMTUDISC */
 814                 nla_total_size(1) +
 815                 /* IFLA_IPTUN_ENCAP_TYPE */
 816                 nla_total_size(2) +
 817                 /* IFLA_IPTUN_ENCAP_FLAGS */
 818                 nla_total_size(2) +
 819                 /* IFLA_IPTUN_ENCAP_SPORT */
 820                 nla_total_size(2) +
 821                 /* IFLA_IPTUN_ENCAP_DPORT */
 822                 nla_total_size(2) +
 823                 /* IFLA_IPTUN_COLLECT_METADATA */
 824                 nla_total_size(0) +
 825                 /* IFLA_IPTUN_FWMARK */
 826                 nla_total_size(4) +
 827                 /* IFLA_IPTUN_FAN_MAP */
 828                 nla_total_size(sizeof(struct ifla_fan_map)) * 256 +
 829                 0;
 830 }
 831
 832 static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 833 {
 834         struct ip_tunnel *tunnel = netdev_priv(dev);
 835         struct ip_tunnel_parm *parm = &tunnel->parms;
 836
 837         if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
 838             nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
 839             nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
 840             nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
 841             nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
 842             nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
 843             nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
 844                        !!(parm->iph.frag_off & htons(IP_DF))) ||
 845             nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
 846                 goto nla_put_failure;
 847
 848         if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
 849                         tunnel->encap.type) ||
 850             nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
 851                          tunnel->encap.sport) ||
 852             nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
 853                          tunnel->encap.dport) ||
 854             nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
 855                         tunnel->encap.flags))
 856                 goto nla_put_failure;
 857
 858         if (tunnel->collect_md)
 859                 if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
 860                         goto nla_put_failure;
 861         if (fan_has_map(&tunnel->fan)) {
 862                 struct nlattr *fan_nest;
 863                 struct ip_fan_map *fan_map;
 864
 865                 fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP);
 866                 if (!fan_nest)
 867                         goto nla_put_failure;
 868                 list_for_each_entry_rcu(fan_map, &tunnel->fan.fan_maps, list) {
 869                         struct ifla_fan_map map;
 870
 871                         map.underlay = fan_map->underlay;
 872                         map.underlay_prefix = fan_map->underlay_prefix;
 873                         map.overlay = fan_map->overlay;
 874                         map.overlay_prefix = fan_map->overlay_prefix;
 875                         if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map))
 876                                 goto nla_put_failure;
 877                 }
 878                 nla_nest_end(skb, fan_nest);
 879         }
 880
 881         return 0;
 882
 883 nla_put_failure:
 884         return -EMSGSIZE;
 885 }
 886
/*
 * Netlink attribute policy for the IFLA_IPTUN_* attributes of "ipip"
 * links; referenced from ipip_link_ops.policy.
 */
static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
        [IFLA_IPTUN_LINK]               = { .type = NLA_U32 },
        [IFLA_IPTUN_LOCAL]              = { .type = NLA_U32 },
        [IFLA_IPTUN_REMOTE]             = { .type = NLA_U32 },
        [IFLA_IPTUN_TTL]                = { .type = NLA_U8 },
        [IFLA_IPTUN_TOS]                = { .type = NLA_U8 },
        [IFLA_IPTUN_PROTO]              = { .type = NLA_U8 },
        [IFLA_IPTUN_PMTUDISC]           = { .type = NLA_U8 },
        [IFLA_IPTUN_ENCAP_TYPE]         = { .type = NLA_U16 },
        [IFLA_IPTUN_ENCAP_FLAGS]        = { .type = NLA_U16 },
        [IFLA_IPTUN_ENCAP_SPORT]        = { .type = NLA_U16 },
        [IFLA_IPTUN_ENCAP_DPORT]        = { .type = NLA_U16 },
        [IFLA_IPTUN_COLLECT_METADATA]   = { .type = NLA_FLAG },
        [IFLA_IPTUN_FWMARK]             = { .type = NLA_U32 },

        /*
         * Default for the whole vendor range first, then the specific
         * entry.  Keep this order: where two designated initializers name
         * the same index, the later one wins.
         * NOTE(review): presumably IFLA_IPTUN_FAN_MAP lies inside the
         * vendor range - confirm against the uapi header.
         */
        [__IFLA_IPTUN_VENDOR_BREAK ... IFLA_IPTUN_MAX]  = { .type = NLA_BINARY },
        [IFLA_IPTUN_FAN_MAP]            = { .type = NLA_NESTED },
};
 905
 906 static struct rtnl_link_ops ipip_link_ops __read_mostly = {
 907         .kind           = "ipip",
 908         .maxtype        = IFLA_IPTUN_MAX,
 909         .policy         = ipip_policy,
 910         .priv_size      = sizeof(struct ip_tunnel),
 911         .setup          = ipip_tunnel_setup,
 912         .validate       = ipip_tunnel_validate,
 913         .newlink        = ipip_newlink,
 914         .changelink     = ipip_changelink,
 915         .dellink        = ip_tunnel_dellink,
 916         .get_size       = ipip_get_size,
 917         .fill_info      = ipip_fill_info,
 918         .get_link_net   = ip_tunnel_get_link_net,
 919 };
 920
 921 static struct xfrm_tunnel ipip_handler __read_mostly = {
 922         .handler        =       ipip_rcv,
 923         .err_handler    =       ipip_err,
 924         .priority       =       1,
 925 };
 926
 927 #if IS_ENABLED(CONFIG_MPLS)
 928 static struct xfrm_tunnel mplsip_handler __read_mostly = {
 929         .handler        =       mplsip_rcv,
 930         .err_handler    =       ipip_err,
 931         .priority       =       1,
 932 };
 933 #endif
 934
/*
 * Per-namespace init hook (ipip_net_ops.init): delegates to
 * ip_tunnel_init_net() with this driver's net id, link ops and the device
 * name "tunl0", and returns its result.
 */
static int __net_init ipip_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
 939
/*
 * Batched per-namespace exit hook (ipip_net_ops.exit_batch): passes the
 * list of namespaces to ip_tunnel_delete_nets() together with this
 * driver's net id and link ops.
 */
static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
{
        ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
}
 944
 945 static struct pernet_operations ipip_net_ops = {
 946         .init = ipip_init_net,
 947         .exit_batch = ipip_exit_batch_net,
 948         .id   = &ipip_net_id,
 949         .size = sizeof(struct ip_tunnel_net),
 950 };
 951
#ifdef CONFIG_SYSCTL
/* Handle returned by register_net_sysctl() in ipip_init(); released in ipip_fini(). */
static struct ctl_table_header *ipip_fan_header;
/* Value exposed through the read-only "version" sysctl below. */
static unsigned int ipip_fan_version = 3;

/* Sysctl table registered under "net/fan" by ipip_init(). */
static struct ctl_table ipip_fan_sysctls[] = {
        {
                .procname       = "version",
                .data           = &ipip_fan_version,
                .maxlen         = sizeof(ipip_fan_version),
                /* 0444: readable by everyone, writable by no one */
                .mode           = 0444,
                .proc_handler   = proc_dointvec,
        },
        /* empty entry terminates the table */
        {},
};

#endif /* CONFIG_SYSCTL */
 968
 969 static int __init ipip_init(void)
 970 {
 971         int err;
 972
 973         pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n");
 974
 975         err = register_pernet_device(&ipip_net_ops);
 976         if (err < 0)
 977                 return err;
 978         err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
 979         if (err < 0) {
 980                 pr_info("%s: can't register tunnel\n", __func__);
 981                 goto xfrm_tunnel_ipip_failed;
 982         }
 983 #if IS_ENABLED(CONFIG_MPLS)
 984         err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS);
 985         if (err < 0) {
 986                 pr_info("%s: can't register tunnel\n", __func__);
 987                 goto xfrm_tunnel_mplsip_failed;
 988         }
 989 #endif
 990         err = rtnl_link_register(&ipip_link_ops);
 991         if (err < 0)
 992                 goto rtnl_link_failed;
 993
 994 #ifdef CONFIG_SYSCTL
 995         ipip_fan_header = register_net_sysctl(&init_net, "net/fan",
 996                                               ipip_fan_sysctls);
 997         if (!ipip_fan_header) {
 998                 err = -ENOMEM;
 999                 goto sysctl_failed;
1000         }
1001 #endif /* CONFIG_SYSCTL */
1002
1003 out:
1004         return err;
1005
1006 #ifdef CONFIG_SYSCTL
1007 sysctl_failed:
1008         rtnl_link_unregister(&ipip_link_ops);
1009 #endif /* CONFIG_SYSCTL */
1010 rtnl_link_failed:
1011 #if IS_ENABLED(CONFIG_MPLS)
1012         xfrm4_tunnel_deregister(&mplsip_handler, AF_INET);
1013 xfrm_tunnel_mplsip_failed:
1014
1015 #endif
1016         xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
1017 xfrm_tunnel_ipip_failed:
1018         unregister_pernet_device(&ipip_net_ops);
1019         goto out;
1020 }
1021
1022 static void __exit ipip_fini(void)
1023 {
1024 #ifdef CONFIG_SYSCTL
1025         unregister_net_sysctl_table(ipip_fan_header);
1026 #endif /* CONFIG_SYSCTL */
1027         rtnl_link_unregister(&ipip_link_ops);
1028         if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
1029                 pr_info("%s: can't deregister tunnel\n", __func__);
1030 #if IS_ENABLED(CONFIG_MPLS)
1031         if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS))
1032                 pr_info("%s: can't deregister tunnel\n", __func__);
1033 #endif
1034         unregister_pernet_device(&ipip_net_ops);
1035 }
1036
/* Module entry and exit points. */
module_init(ipip_init);
module_exit(ipip_fini);
MODULE_LICENSE("GPL");
/* Module aliases for the "ipip" link kind and the "tunl0" device name. */
MODULE_ALIAS_RTNL_LINK("ipip");
MODULE_ALIAS_NETDEV("tunl0");