git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - drivers/net/tun.c
tun: reserve extra headroom only when XDP is set
[mirror_ubuntu-bionic-kernel.git] / drivers / net / tun.c
index 32ad87345f5798498584d8dcfbda2f9ac993e619..80ac18f8b55f20ee0a8601dec03b4f0a2d07576d 100644 (file)
@@ -73,6 +73,8 @@
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 #include <linux/uaccess.h>
 
@@ -105,6 +107,9 @@ do {                                                                \
 } while (0)
 #endif
 
+#define TUN_HEADROOM 256
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
 /* TUN device flags */
 
 /* IFF_ATTACH_QUEUE is never stored in device flags,
@@ -199,7 +204,7 @@ struct tun_struct {
        struct net_device       *dev;
        netdev_features_t       set_features;
 #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
-                         NETIF_F_TSO6|NETIF_F_UFO)
+                         NETIF_F_TSO6)
 
        int                     align;
        int                     vnet_hdr_sz;
@@ -221,6 +226,7 @@ struct tun_struct {
        u32 flow_count;
        u32 rx_batched;
        struct tun_pcpu_stats __percpu *pcpu_stats;
+       struct bpf_prog __rcu *xdp_prog;
 };
 
 #ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -585,6 +591,7 @@ static void tun_detach(struct tun_file *tfile, bool clean)
 static void tun_detach_all(struct net_device *dev)
 {
        struct tun_struct *tun = netdev_priv(dev);
+       struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
        struct tun_file *tfile, *tmp;
        int i, n = tun->numqueues;
 
@@ -617,6 +624,9 @@ static void tun_detach_all(struct net_device *dev)
        }
        BUG_ON(tun->numdisabled != 0);
 
+       if (xdp_prog)
+               bpf_prog_put(xdp_prog);
+
        if (tun->flags & IFF_PERSIST)
                module_put(THIS_MODULE);
 }
@@ -892,7 +902,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
            sk_filter(tfile->socket.sk, skb))
                goto drop;
 
-       if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+       if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                goto drop;
 
        skb_tx_timestamp(skb);
@@ -1003,6 +1013,46 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
        stats->tx_dropped = tx_dropped;
 }
 
+static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+                      struct netlink_ext_ack *extack)
+{
+       struct tun_struct *tun = netdev_priv(dev);
+       struct bpf_prog *old_prog;
+
+       old_prog = rtnl_dereference(tun->xdp_prog);
+       rcu_assign_pointer(tun->xdp_prog, prog);
+       if (old_prog)
+               bpf_prog_put(old_prog);
+
+       return 0;
+}
+
+static u32 tun_xdp_query(struct net_device *dev)
+{
+       struct tun_struct *tun = netdev_priv(dev);
+       const struct bpf_prog *xdp_prog;
+
+       xdp_prog = rtnl_dereference(tun->xdp_prog);
+       if (xdp_prog)
+               return xdp_prog->aux->id;
+
+       return 0;
+}
+
+static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+       switch (xdp->command) {
+       case XDP_SETUP_PROG:
+               return tun_xdp_set(dev, xdp->prog, xdp->extack);
+       case XDP_QUERY_PROG:
+               xdp->prog_id = tun_xdp_query(dev);
+               xdp->prog_attached = !!xdp->prog_id;
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
+
 static const struct net_device_ops tun_netdev_ops = {
        .ndo_uninit             = tun_net_uninit,
        .ndo_open               = tun_net_open,
@@ -1033,6 +1083,7 @@ static const struct net_device_ops tap_netdev_ops = {
        .ndo_features_check     = passthru_features_check,
        .ndo_set_rx_headroom    = tun_set_headroom,
        .ndo_get_stats64        = tun_net_get_stats64,
+       .ndo_xdp                = tun_xdp,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -1190,6 +1241,138 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
        }
 }
 
+static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
+                             int len, int noblock, bool zerocopy)
+{
+       if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+               return false;
+
+       if (tfile->socket.sk->sk_sndbuf != INT_MAX)
+               return false;
+
+       if (!noblock)
+               return false;
+
+       if (zerocopy)
+               return false;
+
+       if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+           SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+               return false;
+
+       return true;
+}
+
+static struct sk_buff *tun_build_skb(struct tun_struct *tun,
+                                    struct tun_file *tfile,
+                                    struct iov_iter *from,
+                                    struct virtio_net_hdr *hdr,
+                                    int len, int *generic_xdp)
+{
+       struct page_frag *alloc_frag = &current->task_frag;
+       struct sk_buff *skb;
+       struct bpf_prog *xdp_prog;
+       int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       unsigned int delta = 0;
+       char *buf;
+       size_t copied;
+       bool xdp_xmit = false;
+       int err, pad = TUN_RX_PAD;
+
+       rcu_read_lock();
+       xdp_prog = rcu_dereference(tun->xdp_prog);
+       if (xdp_prog)
+               pad += TUN_HEADROOM;
+       buflen += SKB_DATA_ALIGN(len + pad);
+       rcu_read_unlock();
+
+       if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+               return ERR_PTR(-ENOMEM);
+
+       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+       copied = copy_page_from_iter(alloc_frag->page,
+                                    alloc_frag->offset + pad,
+                                    len, from);
+       if (copied != len)
+               return ERR_PTR(-EFAULT);
+
+       /* There's a small window that XDP may be set after the check
+        * of xdp_prog above, this should be rare and for simplicity
+        * we do XDP on skb in case the headroom is not enough.
+        */
+       if (hdr->gso_type || !xdp_prog)
+               *generic_xdp = 1;
+       else
+               *generic_xdp = 0;
+
+       rcu_read_lock();
+       xdp_prog = rcu_dereference(tun->xdp_prog);
+       if (xdp_prog && !*generic_xdp) {
+               struct xdp_buff xdp;
+               void *orig_data;
+               u32 act;
+
+               xdp.data_hard_start = buf;
+               xdp.data = buf + pad;
+               xdp.data_end = xdp.data + len;
+               orig_data = xdp.data;
+               act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+               switch (act) {
+               case XDP_REDIRECT:
+                       get_page(alloc_frag->page);
+                       alloc_frag->offset += buflen;
+                       err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+                       if (err)
+                               goto err_redirect;
+                       return NULL;
+               case XDP_TX:
+                       xdp_xmit = true;
+                       /* fall through */
+               case XDP_PASS:
+                       delta = orig_data - xdp.data;
+                       break;
+               default:
+                       bpf_warn_invalid_xdp_action(act);
+                       /* fall through */
+               case XDP_ABORTED:
+                       trace_xdp_exception(tun->dev, xdp_prog, act);
+                       /* fall through */
+               case XDP_DROP:
+                       goto err_xdp;
+               }
+       }
+
+       skb = build_skb(buf, buflen);
+       if (!skb) {
+               rcu_read_unlock();
+               return ERR_PTR(-ENOMEM);
+       }
+
+       skb_reserve(skb, pad - delta);
+       skb_put(skb, len + delta);
+       get_page(alloc_frag->page);
+       alloc_frag->offset += buflen;
+
+       if (xdp_xmit) {
+               skb->dev = tun->dev;
+               generic_xdp_tx(skb, xdp_prog);
+               rcu_read_unlock();
+               return NULL;
+       }
+
+       rcu_read_unlock();
+
+       return skb;
+
+err_redirect:
+       put_page(alloc_frag->page);
+err_xdp:
+       rcu_read_unlock();
+       this_cpu_inc(tun->pcpu_stats->rx_dropped);
+       return NULL;
+}
+
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                            void *msg_control, struct iov_iter *from,
@@ -1206,6 +1389,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
        bool zerocopy = false;
        int err;
        u32 rxhash;
+       int generic_xdp = 1;
 
        if (!(tun->dev->flags & IFF_UP))
                return -EIO;
@@ -1263,30 +1447,40 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                        zerocopy = true;
        }
 
-       if (!zerocopy) {
-               copylen = len;
-               if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
-                       linear = good_linear;
-               else
-                       linear = tun16_to_cpu(tun, gso.hdr_len);
-       }
-
-       skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
-       if (IS_ERR(skb)) {
-               if (PTR_ERR(skb) != -EAGAIN)
+       if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+               skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
+               if (IS_ERR(skb)) {
                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
-               return PTR_ERR(skb);
-       }
+                       return PTR_ERR(skb);
+               }
+               if (!skb)
+                       return total_len;
+       } else {
+               if (!zerocopy) {
+                       copylen = len;
+                       if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
+                               linear = good_linear;
+                       else
+                               linear = tun16_to_cpu(tun, gso.hdr_len);
+               }
 
-       if (zerocopy)
-               err = zerocopy_sg_from_iter(skb, from);
-       else
-               err = skb_copy_datagram_from_iter(skb, 0, from, len);
+               skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+               if (IS_ERR(skb)) {
+                       if (PTR_ERR(skb) != -EAGAIN)
+                               this_cpu_inc(tun->pcpu_stats->rx_dropped);
+                       return PTR_ERR(skb);
+               }
 
-       if (err) {
-               this_cpu_inc(tun->pcpu_stats->rx_dropped);
-               kfree_skb(skb);
-               return -EFAULT;
+               if (zerocopy)
+                       err = zerocopy_sg_from_iter(skb, from);
+               else
+                       err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+               if (err) {
+                       this_cpu_inc(tun->pcpu_stats->rx_dropped);
+                       kfree_skb(skb);
+                       return -EFAULT;
+               }
        }
 
        if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
@@ -1334,6 +1528,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
        skb_reset_network_header(skb);
        skb_probe_transport_header(skb, 0);
 
+       if (generic_xdp) {
+               struct bpf_prog *xdp_prog;
+               int ret;
+
+               rcu_read_lock();
+               xdp_prog = rcu_dereference(tun->xdp_prog);
+               if (xdp_prog) {
+                       ret = do_xdp_generic(xdp_prog, skb);
+                       if (ret != XDP_PASS) {
+                               rcu_read_unlock();
+                               return total_len;
+                       }
+               }
+               rcu_read_unlock();
+       }
+
        rxhash = __skb_get_hash_symmetric(skb);
 #ifndef CONFIG_4KSTACKS
        tun_rx_batched(tun, tfile, skb, more);
@@ -1879,6 +2089,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 err_detach:
        tun_detach_all(dev);
+       /* register_netdevice() already called tun_free_netdev() */
+       goto err_free_dev;
+
 err_free_flow:
        tun_flow_uninit(tun);
        security_tun_dev_free_security(tun->security);
@@ -1921,11 +2134,6 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
                                features |= NETIF_F_TSO6;
                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
                }
-
-               if (arg & TUN_F_UFO) {
-                       features |= NETIF_F_UFO;
-                       arg &= ~TUN_F_UFO;
-               }
        }
 
        /* This gives the user a way to test for new features in future by
@@ -2537,7 +2745,7 @@ static int tun_queue_resize(struct tun_struct *tun)
        int n = tun->numqueues + tun->numdisabled;
        int ret, i;
 
-       arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
+       arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
        if (!arrays)
                return -ENOMEM;