};
struct napi_struct napi;
bool napi_enabled;
+ bool napi_frags_enabled;
struct mutex napi_mutex; /* Protects access to the above napi */
struct list_head next;
struct tun_struct *detached;
}
static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
- bool napi_en)
+ bool napi_en, bool napi_frags)
{
tfile->napi_enabled = napi_en;
+ tfile->napi_frags_enabled = napi_en && napi_frags;
if (napi_en) {
- netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
- NAPI_POLL_WEIGHT);
+ netif_tx_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
+ NAPI_POLL_WEIGHT);
napi_enable(&tfile->napi);
- mutex_init(&tfile->napi_mutex);
}
}
-static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile)
+static void tun_napi_disable(struct tun_file *tfile)
{
if (tfile->napi_enabled)
napi_disable(&tfile->napi);
}
-static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
+static void tun_napi_del(struct tun_file *tfile)
{
if (tfile->napi_enabled)
netif_napi_del(&tfile->napi);
}
-static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+static bool tun_napi_frags_enabled(const struct tun_file *tfile)
{
- return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+ return tfile->napi_frags_enabled;
}
#ifdef CONFIG_TUN_VNET_CROSS_LE
skb_queue_purge(&tfile->sk.sk_error_queue);
}
-static void tun_cleanup_tx_array(struct tun_file *tfile)
-{
- if (tfile->tx_array.ring.queue) {
- skb_array_cleanup(&tfile->tx_array);
- memset(&tfile->tx_array, 0, sizeof(tfile->tx_array));
- }
-}
-
static void __tun_detach(struct tun_file *tfile, bool clean)
{
struct tun_file *ntfile;
tun = rtnl_dereference(tfile->tun);
if (tun && clean) {
- tun_napi_disable(tun, tfile);
- tun_napi_del(tun, tfile);
+ tun_napi_disable(tfile);
+ tun_napi_del(tfile);
}
if (tun && !tfile->detached) {
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
- tun_cleanup_tx_array(tfile);
+ skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk);
}
}
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
BUG_ON(!tfile);
- tun_napi_disable(tun, tfile);
+ tun_napi_disable(tfile);
tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
RCU_INIT_POINTER(tfile->tun, NULL);
synchronize_net();
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
- tun_napi_del(tun, tfile);
+ tun_napi_del(tfile);
/* Drop read queue */
tun_queue_purge(tfile);
sock_put(&tfile->sk);
- tun_cleanup_tx_array(tfile);
}
list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
tun_enable_queue(tfile);
tun_queue_purge(tfile);
sock_put(&tfile->sk);
- tun_cleanup_tx_array(tfile);
}
BUG_ON(tun->numdisabled != 0);
}
static int tun_attach(struct tun_struct *tun, struct file *file,
- bool skip_filter, bool napi)
+ bool skip_filter, bool napi, bool napi_frags,
+ bool publish_tun)
{
struct tun_file *tfile = file->private_data;
struct net_device *dev = tun->dev;
}
if (!tfile->detached &&
- skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+ skb_array_resize(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
err = -ENOMEM;
goto out;
}
tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
- rcu_assign_pointer(tfile->tun, tun);
- rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
- tun->numqueues++;
-
if (tfile->detached) {
tun_enable_queue(tfile);
} else {
sock_hold(&tfile->sk);
- tun_napi_init(tun, tfile, napi);
+ tun_napi_init(tun, tfile, napi, napi_frags);
}
- tun_set_real_num_queues(tun);
-
/* device is allowed to go away first, so no need to hold extra
* refcnt.
*/
+ /* Publish tfile->tun and tun->tfiles only after we've fully
+ * initialized tfile; otherwise we risk using half-initialized
+ * object.
+ */
+ if (publish_tun)
+ rcu_assign_pointer(tfile->tun, tun);
+ rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
+ tun->numqueues++;
+ tun_set_real_num_queues(tun);
out:
return err;
}
/* Net device open. */
static int tun_net_open(struct net_device *dev)
{
- struct tun_struct *tun = netdev_priv(dev);
- int i;
-
netif_tx_start_all_queues(dev);
- for (i = 0; i < tun->numqueues; i++) {
- struct tun_file *tfile;
-
- tfile = rtnl_dereference(tun->tfiles[i]);
- tfile->socket.sk->sk_write_space(tfile->socket.sk);
- }
-
return 0;
}
struct tun_file *tfile;
int i;
- if (tun_napi_frags_enabled(tun))
- return;
-
rcu_read_lock();
for (i = 0; i < tun->numqueues; i++) {
tfile = rcu_dereference(tun->tfiles[i]);
- if (tfile->napi_enabled)
+ if (!tun_napi_frags_enabled(tfile) &&
+ tfile->napi_enabled)
napi_schedule(&tfile->napi);
}
rcu_read_unlock();
}
}
+static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ if (new_carrier) {
+ struct tun_struct *tun = netdev_priv(dev);
+
+ if (!tun->numqueues)
+ return -EPERM;
+
+ netif_carrier_on(dev);
+ } else {
+ netif_carrier_off(dev);
+ }
+ return 0;
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
#endif
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
+ .ndo_change_carrier = tun_net_change_carrier,
};
static const struct net_device_ops tap_netdev_ops = {
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
.ndo_bpf = tun_xdp,
+ .ndo_change_carrier = tun_net_change_carrier,
};
static void tun_flow_init(struct tun_struct *tun)
dev->max_mtu = MAX_MTU - dev->hard_header_len;
}
+static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
+{
+ struct sock *sk = tfile->socket.sk;
+
+ return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
+}
+
/* Character device part */
/* Poll */
if (!skb_array_empty(&tfile->tx_array))
mask |= POLLIN | POLLRDNORM;
- if (tun->dev->flags & IFF_UP &&
- (sock_writeable(sk) ||
- (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
- sock_writeable(sk))))
+ /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to
+ * guarantee EPOLLOUT to be raised by either here or
+ * tun_sock_write_space(). Then process could get notification
+ * after it writes to a down device and meets -EIO.
+ */
+ if (tun_sock_writeable(tun, tfile) ||
+ (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+ tun_sock_writeable(tun, tfile)))
mask |= POLLOUT | POLLWRNORM;
if (tun->dev->reg_state != NETREG_REGISTERED)
skb->truesize += skb->data_len;
for (i = 1; i < it->nr_segs; i++) {
+ struct page_frag *pfrag = ¤t->task_frag;
size_t fragsz = it->iov[i].iov_len;
- unsigned long offset;
- struct page *page;
- void *data;
if (fragsz == 0 || fragsz > PAGE_SIZE) {
err = -EINVAL;
goto free;
}
- local_bh_disable();
- data = napi_alloc_frag(fragsz);
- local_bh_enable();
- if (!data) {
+ if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) {
err = -ENOMEM;
goto free;
}
- page = virt_to_head_page(data);
- offset = data - page_address(page);
- skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+ skb_fill_page_desc(skb, i - 1, pfrag->page,
+ pfrag->offset, fragsz);
+ page_ref_inc(pfrag->page);
+ pfrag->offset += fragsz;
}
return skb;
if (!rx_batched || (!more && skb_queue_empty(queue))) {
local_bh_disable();
+ skb_record_rx_queue(skb, tfile->queue_index);
netif_receive_skb(skb);
local_bh_enable();
return;
struct sk_buff *nskb;
local_bh_disable();
- while ((nskb = __skb_dequeue(&process_queue)))
+ while ((nskb = __skb_dequeue(&process_queue))) {
+ skb_record_rx_queue(nskb, tfile->queue_index);
netif_receive_skb(nskb);
+ }
+ skb_record_rx_queue(skb, tfile->queue_index);
netif_receive_skb(skb);
local_bh_enable();
}
else
*skb_xdp = 0;
+ local_bh_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog && !*skb_xdp) {
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+ xdp_do_flush_map();
if (err)
goto err_redirect;
rcu_read_unlock();
+ local_bh_enable();
return NULL;
case XDP_TX:
xdp_xmit = true;
skb = build_skb(buf, buflen);
if (!skb) {
rcu_read_unlock();
+ local_bh_enable();
return ERR_PTR(-ENOMEM);
}
skb_reserve(skb, pad - delta);
skb_put(skb, len + delta);
+ skb_set_owner_w(skb, tfile->socket.sk);
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
skb->dev = tun->dev;
generic_xdp_tx(skb, xdp_prog);
rcu_read_unlock();
+ local_bh_enable();
return NULL;
}
rcu_read_unlock();
+ local_bh_enable();
return skb;
put_page(alloc_frag->page);
err_xdp:
rcu_read_unlock();
+ local_bh_enable();
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return NULL;
}
int err;
u32 rxhash;
int skb_xdp = 1;
- bool frags = tun_napi_frags_enabled(tun);
-
- if (!(tun->dev->flags & IFF_UP))
- return -EIO;
+ bool frags = tun_napi_frags_enabled(tfile);
if (!(tun->flags & IFF_NO_PI)) {
if (len < sizeof(pi))
err = skb_copy_datagram_from_iter(skb, 0, from, len);
if (err) {
+ err = -EFAULT;
+drop:
this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
if (frags) {
mutex_unlock(&tfile->napi_mutex);
}
- return -EFAULT;
+ return err;
}
}
struct bpf_prog *xdp_prog;
int ret;
+ local_bh_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog) {
ret = do_xdp_generic(xdp_prog, skb);
if (ret != XDP_PASS) {
rcu_read_unlock();
+ local_bh_enable();
+ if (frags) {
+ tfile->napi.skb = NULL;
+ mutex_unlock(&tfile->napi_mutex);
+ }
return total_len;
}
}
rcu_read_unlock();
+ local_bh_enable();
}
rxhash = __skb_get_hash_symmetric(skb);
+ rcu_read_lock();
+ if (unlikely(!(tun->dev->flags & IFF_UP))) {
+ err = -EIO;
+ rcu_read_unlock();
+ goto drop;
+ }
+
if (frags) {
/* Exercise flow dissector code path. */
u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
if (unlikely(headlen > skb_headlen(skb))) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
napi_free_frags(&tfile->napi);
+ rcu_read_unlock();
mutex_unlock(&tfile->napi_mutex);
WARN_ON(1);
return -ENOMEM;
} else {
netif_rx_ni(skb);
}
+ rcu_read_unlock();
stats = get_cpu_ptr(tun->pcpu_stats);
u64_stats_update_begin(&stats->syncp);
return -EINVAL;
if (virtio_net_hdr_from_skb(skb, &gso,
- tun_is_little_endian(tun), true)) {
+ tun_is_little_endian(tun), true,
+ vlan_hlen)) {
struct skb_shared_info *sinfo = skb_shinfo(skb);
pr_err("unexpected GSO type: "
"0x%x, gso_size %d, hdr_len %d\n",
}
add_wait_queue(&tfile->wq.wait, &wait);
- current->state = TASK_INTERRUPTIBLE;
while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
skb = skb_array_consume(&tfile->tx_array);
if (skb)
break;
schedule();
}
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
remove_wait_queue(&tfile->wq.wait, &wait);
out:
static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- return -EINVAL;
+ NL_SET_ERR_MSG(extack,
+ "tun/tap creation via rtnetlink is not supported.");
+ return -EOPNOTSUPP;
}
static struct rtnl_link_ops tun_link_ops __read_mostly = {
return err;
err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
- ifr->ifr_flags & IFF_NAPI);
+ ifr->ifr_flags & IFF_NAPI,
+ ifr->ifr_flags & IFF_NAPI_FRAGS, true);
if (err < 0)
return err;
NETIF_F_HW_VLAN_STAG_TX);
INIT_LIST_HEAD(&tun->disabled);
- err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI);
+ err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
+ ifr->ifr_flags & IFF_NAPI_FRAGS, false);
if (err < 0)
goto err_free_flow;
err = register_netdevice(tun->dev);
if (err < 0)
goto err_detach;
+ /* free_netdev() won't check refcnt, to aovid race
+ * with dev_put() we need publish tun after registration.
+ */
+ rcu_assign_pointer(tfile->tun, tun);
}
netif_carrier_on(tun->dev);
ret = security_tun_dev_attach_queue(tun->security);
if (ret < 0)
goto unlock;
- ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI);
+ ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
+ tun->flags & IFF_NAPI_FRAGS, true);
} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
tun = rtnl_dereference(tfile->tun);
if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
struct tun_file *tfile = file->private_data;
struct tun_struct *tun;
void __user* argp = (void __user*)arg;
+ unsigned int ifindex, carrier;
struct ifreq ifr;
kuid_t owner;
kgid_t group;
int sndbuf;
int vnet_hdr_sz;
- unsigned int ifindex;
int le;
int ret;
ret = 0;
break;
+ case TUNSETCARRIER:
+ ret = -EFAULT;
+ if (copy_from_user(&carrier, argp, sizeof(carrier)))
+ goto unlock;
+
+ ret = tun_net_change_carrier(tun->dev, (bool)carrier);
+ break;
+
default:
ret = -EINVAL;
break;
&tun_proto, 0);
if (!tfile)
return -ENOMEM;
+ if (skb_array_init(&tfile->tx_array, 0, GFP_KERNEL)) {
+ sk_free(&tfile->sk);
+ return -ENOMEM;
+ }
+
+ mutex_init(&tfile->napi_mutex);
RCU_INIT_POINTER(tfile->tun, NULL);
tfile->flags = 0;
tfile->ifindex = 0;
sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
- memset(&tfile->tx_array, 0, sizeof(tfile->tx_array));
-
return 0;
}
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tun_struct *tun = netdev_priv(dev);
+ int i;
if (dev->rtnl_link_ops != &tun_link_ops)
return NOTIFY_DONE;
if (tun_queue_resize(tun))
return NOTIFY_BAD;
break;
+ case NETDEV_UP:
+ for (i = 0; i < tun->numqueues; i++) {
+ struct tun_file *tfile;
+
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ tfile->socket.sk->sk_write_space(tfile->socket.sk);
+ }
+ break;
default:
break;
}