git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - drivers/net/tun.c
tun: add mutex_unlock() call and napi.skb clearing in tun_get_user()
[mirror_ubuntu-bionic-kernel.git] / drivers / net / tun.c
index 4f4a842a1c9cb8ac3397b329854a0fc7bd2f6aa3..83d267ded3465b9ce10e0eb2ba1ed9befd0670bd 100644 (file)
@@ -176,6 +176,7 @@ struct tun_file {
        };
        struct napi_struct napi;
        bool napi_enabled;
+       bool napi_frags_enabled;
        struct mutex napi_mutex;        /* Protects access to the above napi */
        struct list_head next;
        struct tun_struct *detached;
@@ -275,32 +276,32 @@ static int tun_napi_poll(struct napi_struct *napi, int budget)
 }
 
 static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
-                         bool napi_en)
+                         bool napi_en, bool napi_frags)
 {
        tfile->napi_enabled = napi_en;
+       tfile->napi_frags_enabled = napi_en && napi_frags;
        if (napi_en) {
-               netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
-                              NAPI_POLL_WEIGHT);
+               netif_tx_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
+                                 NAPI_POLL_WEIGHT);
                napi_enable(&tfile->napi);
-               mutex_init(&tfile->napi_mutex);
        }
 }
 
-static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile)
+static void tun_napi_disable(struct tun_file *tfile)
 {
        if (tfile->napi_enabled)
                napi_disable(&tfile->napi);
 }
 
-static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
+static void tun_napi_del(struct tun_file *tfile)
 {
        if (tfile->napi_enabled)
                netif_napi_del(&tfile->napi);
 }
 
-static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+static bool tun_napi_frags_enabled(const struct tun_file *tfile)
 {
-       return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+       return tfile->napi_frags_enabled;
 }
 
 #ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -619,8 +620,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
        tun = rtnl_dereference(tfile->tun);
 
        if (tun && clean) {
-               tun_napi_disable(tun, tfile);
-               tun_napi_del(tun, tfile);
+               tun_napi_disable(tfile);
+               tun_napi_del(tfile);
        }
 
        if (tun && !tfile->detached) {
@@ -657,8 +658,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
                            tun->dev->reg_state == NETREG_REGISTERED)
                                unregister_netdevice(tun->dev);
                }
-               if (tun)
-                       skb_array_cleanup(&tfile->tx_array);
+               skb_array_cleanup(&tfile->tx_array);
                sock_put(&tfile->sk);
        }
 }
@@ -680,7 +680,7 @@ static void tun_detach_all(struct net_device *dev)
        for (i = 0; i < n; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
                BUG_ON(!tfile);
-               tun_napi_disable(tun, tfile);
+               tun_napi_disable(tfile);
                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
                RCU_INIT_POINTER(tfile->tun, NULL);
@@ -696,7 +696,7 @@ static void tun_detach_all(struct net_device *dev)
        synchronize_net();
        for (i = 0; i < n; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
-               tun_napi_del(tun, tfile);
+               tun_napi_del(tfile);
                /* Drop read queue */
                tun_queue_purge(tfile);
                sock_put(&tfile->sk);
@@ -716,7 +716,8 @@ static void tun_detach_all(struct net_device *dev)
 }
 
 static int tun_attach(struct tun_struct *tun, struct file *file,
-                     bool skip_filter, bool napi)
+                     bool skip_filter, bool napi, bool napi_frags,
+                     bool publish_tun)
 {
        struct tun_file *tfile = file->private_data;
        struct net_device *dev = tun->dev;
@@ -751,30 +752,33 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
        }
 
        if (!tfile->detached &&
-           skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+           skb_array_resize(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
                err = -ENOMEM;
                goto out;
        }
 
        tfile->queue_index = tun->numqueues;
        tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
-       rcu_assign_pointer(tfile->tun, tun);
-       rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
-       tun->numqueues++;
-
        if (tfile->detached) {
                tun_enable_queue(tfile);
        } else {
                sock_hold(&tfile->sk);
-               tun_napi_init(tun, tfile, napi);
+               tun_napi_init(tun, tfile, napi, napi_frags);
        }
 
-       tun_set_real_num_queues(tun);
-
        /* device is allowed to go away first, so no need to hold extra
         * refcnt.
         */
 
+       /* Publish tfile->tun and tun->tfiles only after we've fully
+        * initialized tfile; otherwise we risk using a half-initialized
+        * object.
+        */
+       if (publish_tun)
+               rcu_assign_pointer(tfile->tun, tun);
+       rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
+       tun->numqueues++;
+       tun_set_real_num_queues(tun);
 out:
        return err;
 }
@@ -914,18 +918,8 @@ static void tun_net_uninit(struct net_device *dev)
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
-       struct tun_struct *tun = netdev_priv(dev);
-       int i;
-
        netif_tx_start_all_queues(dev);
 
-       for (i = 0; i < tun->numqueues; i++) {
-               struct tun_file *tfile;
-
-               tfile = rtnl_dereference(tun->tfiles[i]);
-               tfile->socket.sk->sk_write_space(tfile->socket.sk);
-       }
-
        return 0;
 }
 
@@ -1054,13 +1048,11 @@ static void tun_poll_controller(struct net_device *dev)
                struct tun_file *tfile;
                int i;
 
-               if (tun_napi_frags_enabled(tun))
-                       return;
-
                rcu_read_lock();
                for (i = 0; i < tun->numqueues; i++) {
                        tfile = rcu_dereference(tun->tfiles[i]);
-                       if (tfile->napi_enabled)
+                       if (!tun_napi_frags_enabled(tfile) &&
+                           tfile->napi_enabled)
                                napi_schedule(&tfile->napi);
                }
                rcu_read_unlock();
@@ -1155,6 +1147,21 @@ static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
        }
 }
 
+static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
+{
+       if (new_carrier) {
+               struct tun_struct *tun = netdev_priv(dev);
+
+               if (!tun->numqueues)
+                       return -EPERM;
+
+               netif_carrier_on(dev);
+       } else {
+               netif_carrier_off(dev);
+       }
+       return 0;
+}
+
 static const struct net_device_ops tun_netdev_ops = {
        .ndo_uninit             = tun_net_uninit,
        .ndo_open               = tun_net_open,
@@ -1167,6 +1174,7 @@ static const struct net_device_ops tun_netdev_ops = {
 #endif
        .ndo_set_rx_headroom    = tun_set_headroom,
        .ndo_get_stats64        = tun_net_get_stats64,
+       .ndo_change_carrier     = tun_net_change_carrier,
 };
 
 static const struct net_device_ops tap_netdev_ops = {
@@ -1186,6 +1194,7 @@ static const struct net_device_ops tap_netdev_ops = {
        .ndo_set_rx_headroom    = tun_set_headroom,
        .ndo_get_stats64        = tun_net_get_stats64,
        .ndo_bpf                = tun_xdp,
+       .ndo_change_carrier     = tun_net_change_carrier,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -1245,6 +1254,13 @@ static void tun_net_init(struct net_device *dev)
        dev->max_mtu = MAX_MTU - dev->hard_header_len;
 }
 
+static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
+{
+       struct sock *sk = tfile->socket.sk;
+
+       return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
+}
+
 /* Character device part */
 
 /* Poll */
@@ -1267,10 +1283,14 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
        if (!skb_array_empty(&tfile->tx_array))
                mask |= POLLIN | POLLRDNORM;
 
-       if (tun->dev->flags & IFF_UP &&
-           (sock_writeable(sk) ||
-            (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
-             sock_writeable(sk))))
+       /* Make sure SOCKWQ_ASYNC_NOSPACE is set when not writable, so
+        * that EPOLLOUT is guaranteed to be raised either here or by
+        * tun_sock_write_space(). The process can then get a notification
+        * after it writes to a down device and hits -EIO.
+        */
+       if (tun_sock_writeable(tun, tfile) ||
+           (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+            tun_sock_writeable(tun, tfile)))
                mask |= POLLOUT | POLLWRNORM;
 
        if (tun->dev->reg_state != NETREG_REGISTERED)
@@ -1308,27 +1328,23 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
        skb->truesize += skb->data_len;
 
        for (i = 1; i < it->nr_segs; i++) {
+               struct page_frag *pfrag = &current->task_frag;
                size_t fragsz = it->iov[i].iov_len;
-               unsigned long offset;
-               struct page *page;
-               void *data;
 
                if (fragsz == 0 || fragsz > PAGE_SIZE) {
                        err = -EINVAL;
                        goto free;
                }
 
-               local_bh_disable();
-               data = napi_alloc_frag(fragsz);
-               local_bh_enable();
-               if (!data) {
+               if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) {
                        err = -ENOMEM;
                        goto free;
                }
 
-               page = virt_to_head_page(data);
-               offset = data - page_address(page);
-               skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+               skb_fill_page_desc(skb, i - 1, pfrag->page,
+                                  pfrag->offset, fragsz);
+               page_ref_inc(pfrag->page);
+               pfrag->offset += fragsz;
        }
 
        return skb;
@@ -1375,6 +1391,7 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
 
        if (!rx_batched || (!more && skb_queue_empty(queue))) {
                local_bh_disable();
+               skb_record_rx_queue(skb, tfile->queue_index);
                netif_receive_skb(skb);
                local_bh_enable();
                return;
@@ -1394,8 +1411,11 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
                struct sk_buff *nskb;
 
                local_bh_disable();
-               while ((nskb = __skb_dequeue(&process_queue)))
+               while ((nskb = __skb_dequeue(&process_queue))) {
+                       skb_record_rx_queue(nskb, tfile->queue_index);
                        netif_receive_skb(nskb);
+               }
+               skb_record_rx_queue(skb, tfile->queue_index);
                netif_receive_skb(skb);
                local_bh_enable();
        }
@@ -1466,6 +1486,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
        else
                *skb_xdp = 0;
 
+       local_bh_disable();
        rcu_read_lock();
        xdp_prog = rcu_dereference(tun->xdp_prog);
        if (xdp_prog && !*skb_xdp) {
@@ -1485,9 +1506,11 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
                        get_page(alloc_frag->page);
                        alloc_frag->offset += buflen;
                        err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+                       xdp_do_flush_map();
                        if (err)
                                goto err_redirect;
                        rcu_read_unlock();
+                       local_bh_enable();
                        return NULL;
                case XDP_TX:
                        xdp_xmit = true;
@@ -1509,11 +1532,13 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
        skb = build_skb(buf, buflen);
        if (!skb) {
                rcu_read_unlock();
+               local_bh_enable();
                return ERR_PTR(-ENOMEM);
        }
 
        skb_reserve(skb, pad - delta);
        skb_put(skb, len + delta);
+       skb_set_owner_w(skb, tfile->socket.sk);
        get_page(alloc_frag->page);
        alloc_frag->offset += buflen;
 
@@ -1521,10 +1546,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
                skb->dev = tun->dev;
                generic_xdp_tx(skb, xdp_prog);
                rcu_read_unlock();
+               local_bh_enable();
                return NULL;
        }
 
        rcu_read_unlock();
+       local_bh_enable();
 
        return skb;
 
@@ -1532,6 +1559,7 @@ err_redirect:
        put_page(alloc_frag->page);
 err_xdp:
        rcu_read_unlock();
+       local_bh_enable();
        this_cpu_inc(tun->pcpu_stats->rx_dropped);
        return NULL;
 }
@@ -1553,10 +1581,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
        int err;
        u32 rxhash;
        int skb_xdp = 1;
-       bool frags = tun_napi_frags_enabled(tun);
-
-       if (!(tun->dev->flags & IFF_UP))
-               return -EIO;
+       bool frags = tun_napi_frags_enabled(tfile);
 
        if (!(tun->flags & IFF_NO_PI)) {
                if (len < sizeof(pi))
@@ -1659,6 +1684,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                        err = skb_copy_datagram_from_iter(skb, 0, from, len);
 
                if (err) {
+                       err = -EFAULT;
+drop:
                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
                        kfree_skb(skb);
                        if (frags) {
@@ -1666,7 +1693,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                                mutex_unlock(&tfile->napi_mutex);
                        }
 
-                       return -EFAULT;
+                       return err;
                }
        }
 
@@ -1727,20 +1754,34 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                struct bpf_prog *xdp_prog;
                int ret;
 
+               local_bh_disable();
                rcu_read_lock();
                xdp_prog = rcu_dereference(tun->xdp_prog);
                if (xdp_prog) {
                        ret = do_xdp_generic(xdp_prog, skb);
                        if (ret != XDP_PASS) {
                                rcu_read_unlock();
+                               local_bh_enable();
+                               if (frags) {
+                                       tfile->napi.skb = NULL;
+                                       mutex_unlock(&tfile->napi_mutex);
+                               }
                                return total_len;
                        }
                }
                rcu_read_unlock();
+               local_bh_enable();
        }
 
        rxhash = __skb_get_hash_symmetric(skb);
 
+       rcu_read_lock();
+       if (unlikely(!(tun->dev->flags & IFF_UP))) {
+               err = -EIO;
+               rcu_read_unlock();
+               goto drop;
+       }
+
        if (frags) {
                /* Exercise flow dissector code path. */
                u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
@@ -1748,6 +1789,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                if (unlikely(headlen > skb_headlen(skb))) {
                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
                        napi_free_frags(&tfile->napi);
+                       rcu_read_unlock();
                        mutex_unlock(&tfile->napi_mutex);
                        WARN_ON(1);
                        return -ENOMEM;
@@ -1775,6 +1817,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
        } else {
                netif_rx_ni(skb);
        }
+       rcu_read_unlock();
 
        stats = get_cpu_ptr(tun->pcpu_stats);
        u64_stats_update_begin(&stats->syncp);
@@ -1846,7 +1889,8 @@ static ssize_t tun_put_user(struct tun_struct *tun,
                        return -EINVAL;
 
                if (virtio_net_hdr_from_skb(skb, &gso,
-                                           tun_is_little_endian(tun), true)) {
+                                           tun_is_little_endian(tun), true,
+                                           vlan_hlen)) {
                        struct skb_shared_info *sinfo = skb_shinfo(skb);
                        pr_err("unexpected GSO type: "
                               "0x%x, gso_size %d, hdr_len %d\n",
@@ -1917,9 +1961,9 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
        }
 
        add_wait_queue(&tfile->wq.wait, &wait);
-       current->state = TASK_INTERRUPTIBLE;
 
        while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
                skb = skb_array_consume(&tfile->tx_array);
                if (skb)
                        break;
@@ -1935,7 +1979,7 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
                schedule();
        }
 
-       current->state = TASK_RUNNING;
+       __set_current_state(TASK_RUNNING);
        remove_wait_queue(&tfile->wq.wait, &wait);
 
 out:
@@ -2021,7 +2065,9 @@ static void tun_setup(struct net_device *dev)
 static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
                        struct netlink_ext_ack *extack)
 {
-       return -EINVAL;
+       NL_SET_ERR_MSG(extack,
+                      "tun/tap creation via rtnetlink is not supported.");
+       return -EOPNOTSUPP;
 }
 
 static struct rtnl_link_ops tun_link_ops __read_mostly = {
@@ -2223,7 +2269,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
                        return err;
 
                err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
-                                ifr->ifr_flags & IFF_NAPI);
+                                ifr->ifr_flags & IFF_NAPI,
+                                ifr->ifr_flags & IFF_NAPI_FRAGS, true);
                if (err < 0)
                        return err;
 
@@ -2312,13 +2359,18 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
                                       NETIF_F_HW_VLAN_STAG_TX);
 
                INIT_LIST_HEAD(&tun->disabled);
-               err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI);
+               err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
+                                ifr->ifr_flags & IFF_NAPI_FRAGS, false);
                if (err < 0)
                        goto err_free_flow;
 
                err = register_netdevice(tun->dev);
                if (err < 0)
                        goto err_detach;
+               /* free_netdev() won't check refcnt, to avoid race
+                * with dev_put() we need to publish tun after registration.
+                */
+               rcu_assign_pointer(tfile->tun, tun);
        }
 
        netif_carrier_on(tun->dev);
@@ -2464,7 +2516,8 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
                ret = security_tun_dev_attach_queue(tun->security);
                if (ret < 0)
                        goto unlock;
-               ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI);
+               ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
+                                tun->flags & IFF_NAPI_FRAGS, true);
        } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
                tun = rtnl_dereference(tfile->tun);
                if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
@@ -2485,12 +2538,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun;
        void __user* argp = (void __user*)arg;
+       unsigned int ifindex, carrier;
        struct ifreq ifr;
        kuid_t owner;
        kgid_t group;
        int sndbuf;
        int vnet_hdr_sz;
-       unsigned int ifindex;
        int le;
        int ret;
 
@@ -2755,6 +2808,14 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                ret = 0;
                break;
 
+       case TUNSETCARRIER:
+               ret = -EFAULT;
+               if (copy_from_user(&carrier, argp, sizeof(carrier)))
+                       goto unlock;
+
+               ret = tun_net_change_carrier(tun->dev, (bool)carrier);
+               break;
+
        default:
                ret = -EINVAL;
                break;
@@ -2831,6 +2892,12 @@ static int tun_chr_open(struct inode *inode, struct file * file)
                                            &tun_proto, 0);
        if (!tfile)
                return -ENOMEM;
+       if (skb_array_init(&tfile->tx_array, 0, GFP_KERNEL)) {
+               sk_free(&tfile->sk);
+               return -ENOMEM;
+       }
+
+       mutex_init(&tfile->napi_mutex);
        RCU_INIT_POINTER(tfile->tun, NULL);
        tfile->flags = 0;
        tfile->ifindex = 0;
@@ -3025,6 +3092,7 @@ static int tun_device_event(struct notifier_block *unused,
 {
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct tun_struct *tun = netdev_priv(dev);
+       int i;
 
        if (dev->rtnl_link_ops != &tun_link_ops)
                return NOTIFY_DONE;
@@ -3034,6 +3102,14 @@ static int tun_device_event(struct notifier_block *unused,
                if (tun_queue_resize(tun))
                        return NOTIFY_BAD;
                break;
+       case NETDEV_UP:
+               for (i = 0; i < tun->numqueues; i++) {
+                       struct tun_file *tfile;
+
+                       tfile = rtnl_dereference(tun->tfiles[i]);
+                       tfile->socket.sk->sk_write_space(tfile->socket.sk);
+               }
+               break;
        default:
                break;
        }