[mirror_ubuntu-bionic-kernel.git] / drivers / vhost / net.c

/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>

#include <net/sock.h>

#include "vhost.h"

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others.
 * handle_tx() and handle_rx() compare their running byte total against
 * this value (0x80000 = 512 KiB) and, once it is reached, requeue
 * themselves with vhost_poll_queue() and leave their loop. */
#define VHOST_NET_WEIGHT 0x80000

/* Indices of the two virtqueues; used to index both vhost_net.vqs[]
 * and vhost_net.poll[]. */
enum {
	VHOST_NET_VQ_RX = 0,	/* receive queue, serviced by handle_rx() */
	VHOST_NET_VQ_TX = 1,	/* transmit queue, serviced by handle_tx() */
	VHOST_NET_VQ_MAX = 2,	/* number of queues; sizes the arrays */
};

/* State of TX socket polling, stored in vhost_net.tx_poll_state. */
enum vhost_net_poll_state {
	/* Set at open time and whenever the TX vq backend is disabled;
	 * tx_poll_start() and tx_poll_stop() are no-ops in this state. */
	VHOST_NET_POLL_DISABLED = 0,
	/* Set by tx_poll_start() after vhost_poll_start(). */
	VHOST_NET_POLL_STARTED = 1,
	/* Set by tx_poll_stop() and by vhost_net_enable_vq() just before
	 * it starts polling. */
	VHOST_NET_POLL_STOPPED = 2,
};

/* Per-open-file state of the vhost-net device; allocated in
 * vhost_net_open() and freed in vhost_net_release(). */
struct vhost_net {
	/* Generic vhost device; handlers recover the vhost_net from it
	 * with container_of(). */
	struct vhost_dev dev;
	/* RX and TX virtqueues, indexed by VHOST_NET_VQ_*. */
	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	/* Pollers on the backend socket (POLLIN for RX, POLLOUT for TX),
	 * indexed by VHOST_NET_VQ_*. */
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Tells us whether we are polling a socket for TX.
	 * We only do this when socket buffer fills up.
	 * Protected by tx vq lock. */
	enum vhost_net_poll_state tx_poll_state;
};

/* Strip the leading len bytes from the iovec array "from".
 *
 * Each stripped piece is recorded in the matching slot of "to" (base and
 * length), while the corresponding entry of "from" is advanced past it.
 * At most iov_count entries are examined.
 *
 * Returns the number of entries of "from" that were touched, which is
 * also the number of entries written to "to". */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
			  size_t len, int iov_count)
{
	int used;

	for (used = 0; len && used < iov_count; ++used, ++from, ++to) {
		size_t chunk = min(from->iov_len, len);

		/* Describe the stripped piece in the destination ... */
		to->iov_base = from->iov_base;
		to->iov_len = chunk;
		/* ... and make the source entry start right after it. */
		from->iov_base += chunk;
		from->iov_len -= chunk;
		len -= chunk;
	}
	return used;
}

/* Stop polling the backend socket for TX, if polling is currently
 * started; otherwise do nothing.
 * Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
{
	if (unlikely(net->tx_poll_state == VHOST_NET_POLL_STARTED)) {
		vhost_poll_stop(&net->poll[VHOST_NET_VQ_TX]);
		net->tx_poll_state = VHOST_NET_POLL_STOPPED;
	}
}

/* Start polling sock's file for TX, but only when polling is in the
 * STOPPED state; a DISABLED or already STARTED poller is left alone.
 * Caller must have TX VQ lock */
static void tx_poll_start(struct vhost_net *net, struct socket *sock)
{
	if (likely(net->tx_poll_state == VHOST_NET_POLL_STOPPED)) {
		vhost_poll_start(&net->poll[VHOST_NET_VQ_TX], sock->file);
		net->tx_poll_state = VHOST_NET_POLL_STARTED;
	}
}

/* Move packets from the TX virtqueue to the backend socket.
 *
 * Pops descriptors from the TX ring, strips the first hdr_size bytes and
 * hands the rest to the socket's sendmsg().  The loop ends when the ring
 * is empty, on a malformed descriptor, on a send error, or once
 * VHOST_NET_WEIGHT bytes have been sent (the job is then requeued).
 *
 * Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
	unsigned head, out, in, s;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err, wmem;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);
	/* No backend attached: nothing to do. */
	if (!sock)
		return;

	/* Send buffer already full: do not even look at the ring. */
	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	if (wmem >= sock->sk->sk_sndbuf)
		return;

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);

	/* Stop polling the socket only once at least half of the send
	 * buffer is free.  Comparing against sk_sndbuf * 2 would always be
	 * true here, since wmem < sk_sndbuf is guaranteed by the early
	 * return above, and polling would be stopped unconditionally. */
	if (wmem < sock->sk->sk_sndbuf / 2)
		tx_poll_stop(net);
	hdr_size = vq->hdr_size;

	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 NULL, NULL);
		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
			/* Socket is getting full: rely on the socket poll
			 * to wake us up rather than on guest notification. */
			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
				tx_poll_start(net, sock);
				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
				break;
			}
			/* A nonzero return is treated as "more work showed
			 * up while notifications were being enabled": turn
			 * them off again and retry. */
			if (unlikely(vhost_enable_notify(vq))) {
				vhost_disable_notify(vq);
				continue;
			}
			break;
		}
		/* TX descriptors must not contain device-writable buffers. */
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, int %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
		msg.msg_iovlen = out;
		len = iov_length(vq->iov, out);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(NULL, sock, &msg, len);
		if (unlikely(err < 0)) {
			/* Give the descriptor back and retry once the
			 * socket signals that it is writable. */
			vhost_discard_vq_desc(vq);
			tx_poll_start(net, sock);
			break;
		}
		if (err != len)
			pr_err("Truncated TX packet: "
			       " len %d != %zd\n", err, len);
		vhost_add_used_and_signal(&net->dev, vq, head, 0);
		total_len += len;
		/* Yield to other work after VHOST_NET_WEIGHT bytes. */
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

/* Move packets from the backend socket into the RX virtqueue.
 *
 * For each available RX descriptor: strip the first hdr_size bytes of the
 * buffer (remembered in vq->hdr), receive one packet into the remainder
 * with recvmsg(), write a default virtio_net_hdr into the stripped part
 * and mark the descriptor used.  The loop ends when the ring is empty, on
 * a malformed descriptor, on a receive error, or once VHOST_NET_WEIGHT
 * bytes have been handled (the job is then requeued).
 *
 * Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
	unsigned head, out, in, log, s;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};

	/* Header written in front of every received packet: no flags and
	 * no GSO. */
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};

	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);
	/* No backend attached, or nothing queued on the socket. */
	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
		return;

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);
	hdr_size = vq->hdr_size;

	/* Only collect log entries when the VHOST_F_LOG_ALL feature is on. */
	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;

	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 vq_log, &log);
		/* OK, now we need to know about added descriptors. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(vq);
				continue;
			}
			/* Nothing new?  Wait for eventfd to tell us
			 * they refilled. */
			break;
		}
		/* RX descriptors must not contain device-readable buffers. */
		if (out) {
			vq_err(vq, "Unexpected descriptor format for RX: "
			       "out %d, int %d\n",
			       out, in);
			break;
		}
		/* Skip header. TODO: support TSO/mergeable rx buffers. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
		msg.msg_iovlen = in;
		len = iov_length(vq->iov, in);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for RX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		/* MSG_TRUNC is passed so that a result larger than len can
		 * be detected below. */
		err = sock->ops->recvmsg(NULL, sock, &msg,
					 len, MSG_DONTWAIT | MSG_TRUNC);
		/* TODO: Check specific error and bomb out unless EAGAIN? */
		if (err < 0) {
			vhost_discard_vq_desc(vq);
			break;
		}
		/* TODO: Should check and handle checksum. */
		/* Packet did not fit in the buffer: discard the descriptor
		 * and move on to the next packet. */
		if (err > len) {
			pr_err("Discarded truncated rx packet: "
			       " len %d > %zd\n", err, len);
			vhost_discard_vq_desc(vq);
			continue;
		}
		len = err;
		/* Fill in the header area that was stripped above. */
		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
		if (err) {
			/* NOTE(review): the address logged is vq->iov's
			 * base, not vq->hdr's, although the copy targets
			 * vq->hdr - confirm which one is intended. */
			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
			       vq->iov->iov_base, err);
			break;
		}
		/* Used length reported for the descriptor covers header
		 * plus packet. */
		len += hdr_size;
		vhost_add_used_and_signal(&net->dev, vq, head, len);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, len);
		total_len += len;
		/* Yield to other work after VHOST_NET_WEIGHT bytes. */
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

/* Work handler installed as the TX virtqueue's handle_kick: maps the
 * work item back to its vhost_net and runs the TX path. */
static void handle_tx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);

	handle_tx(container_of(vq->dev, struct vhost_net, dev));
}

/* Work handler installed as the RX virtqueue's handle_kick: maps the
 * work item back to its vhost_net and runs the RX path. */
static void handle_rx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);

	handle_rx(container_of(vq->dev, struct vhost_net, dev));
}

/* Work handler of the TX socket poller (registered for POLLOUT in
 * vhost_net_open()): runs the TX path. */
static void handle_tx_net(struct work_struct *work)
{
	handle_tx(container_of(work, struct vhost_net,
			       poll[VHOST_NET_VQ_TX].work));
}

/* Work handler of the RX socket poller (registered for POLLIN in
 * vhost_net_open()): runs the RX path. */
static void handle_rx_net(struct work_struct *work)
{
	handle_rx(container_of(work, struct vhost_net,
			       poll[VHOST_NET_VQ_RX].work));
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
	int r;
	if (!n)
		return -ENOMEM;
	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
	if (r < 0) {
		kfree(n);
		return r;
	}

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

	f->private_data = n;

	return 0;
}

/* Stop socket polling for vq.  Does nothing when no backend is attached.
 * For the TX queue the poll state is additionally set to DISABLED. */
static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	if (!vq->private_data)
		return;

	if (vq != &n->vqs[VHOST_NET_VQ_TX]) {
		vhost_poll_stop(&n->poll[VHOST_NET_VQ_RX]);
		return;
	}

	tx_poll_stop(n);
	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
}

/* Start socket polling for vq on its currently attached backend.
 * Does nothing when no backend is attached. */
static void vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct socket *sock = vq->private_data;

	if (!sock)
		return;

	if (vq != &n->vqs[VHOST_NET_VQ_TX]) {
		vhost_poll_start(&n->poll[VHOST_NET_VQ_RX], sock->file);
		return;
	}

	/* tx_poll_start() only acts in the STOPPED state, so leave
	 * DISABLED first. */
	n->tx_poll_state = VHOST_NET_POLL_STOPPED;
	tx_poll_start(n, sock);
}

/* Detach the backend socket from vq under the vq mutex.
 *
 * Polling is stopped before the pointer is cleared, because
 * vhost_net_disable_vq() looks at vq->private_data.
 *
 * Returns the previously attached socket (possibly NULL); this function
 * does not release its file reference. */
static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *old;

	mutex_lock(&vq->mutex);
	old = vq->private_data;
	vhost_net_disable_vq(n, vq);
	rcu_assign_pointer(vq->private_data, NULL);
	mutex_unlock(&vq->mutex);

	return old;
}

/* Detach the backends of both queues (TX first, then RX) and hand the
 * old sockets back through *tx_sock and *rx_sock. */
static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX]);
	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX]);
}

/* Flush both pollers that belong to queue "index": first the socket
 * poller, then the virtqueue's own poller. */
static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(&n->poll[index]);
	vhost_poll_flush(&n->dev.vqs[index].poll);
}

/* Flush the pollers of both queues, TX first and then RX. */
static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

/* release() handler: tear down the vhost_net attached to the file.
 * Always returns 0. */
static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	/* Detach both backends, then flush the pollers before the device
	 * is cleaned up. */
	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_cleanup(&n->dev);
	/* Drop the file references of the sockets that were attached. */
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n);
	return 0;
}

/* Look up fd as a socket and accept it only if it is a SOCK_RAW socket
 * whose local address family is AF_PACKET.
 *
 * Returns the socket (with the reference taken by sockfd_lookup()) on
 * success.  On failure returns ERR_PTR(): -ENOTSOCK if the lookup fails,
 * -ESOCKTNOSUPPORT for a non-raw socket, the getname() error, or
 * -EPFNOSUPPORT for a non-packet family; the reference is dropped. */
static struct socket *get_raw_socket(int fd)
{
	struct {
		struct sockaddr_ll sa;
		char  buf[MAX_ADDR_LEN];
	} addr;
	int addr_len = sizeof addr;
	int rc;
	struct socket *sock;

	sock = sockfd_lookup(fd, &rc);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		rc = -ESOCKTNOSUPPORT;
	} else {
		rc = sock->ops->getname(sock, (struct sockaddr *)&addr.sa,
					&addr_len, 0);
		if (!rc && addr.sa.sll_family != AF_PACKET)
			rc = -EPFNOSUPPORT;
	}

	if (!rc)
		return sock;

	fput(sock->file);
	return ERR_PTR(rc);
}

static struct socket *get_tun_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;
	if (!file)
		return ERR_PTR(-EBADF);
	sock = tun_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	return sock;
}

/* Turn a backend fd into a socket.
 *
 * fd == -1 is the special "disable backend" value and yields NULL.
 * Otherwise the fd is tried as a raw packet socket first and then
 * through get_tun_socket(); if both fail, ERR_PTR(-ENOTSOCK) is returned
 * regardless of the underlying errors. */
static struct socket *get_socket(int fd)
{
	struct socket *sock;

	/* special case to disable backend */
	if (fd == -1)
		return NULL;

	sock = get_raw_socket(fd);
	if (IS_ERR(sock))
		sock = get_tun_socket(fd);
	if (IS_ERR(sock))
		return ERR_PTR(-ENOTSOCK);

	return sock;
}

/* VHOST_NET_SET_BACKEND: attach the socket behind fd to virtqueue
 * "index", replacing any previous backend (fd == -1 detaches).
 *
 * Returns 0 on success, the vhost_dev_check_owner() error, -ENOBUFS for
 * an out-of-range index, -EFAULT if the ring is not set up correctly, or
 * the get_socket() error.
 *
 * Locking: dev.mutex is held throughout; vq->mutex is held while the
 * backend pointer is inspected and swapped, and is released on every
 * path, including the error paths and the "same socket" path. */
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = n->vqs + index;
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* start polling new socket */
	oldsock = vq->private_data;
	if (sock != oldsock) {
		vhost_net_disable_vq(n, vq);
		rcu_assign_pointer(vq->private_data, sock);
		vhost_net_enable_vq(n, vq);
	}
	mutex_unlock(&vq->mutex);

	/* Wait for in-flight work that may still use the old socket, then
	 * drop one file reference.  When sock == oldsock this drops the
	 * extra reference get_socket() has just taken. */
	if (oldsock) {
		vhost_net_flush_vq(n, index);
		fput(oldsock->file);
	}

	mutex_unlock(&n->dev.mutex);
	return 0;

err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

/* VHOST_RESET_OWNER: detach both backends, flush pending work and reset
 * the device owner.
 *
 * Returns the vhost_dev_check_owner() error if the check fails (nothing
 * is changed in that case), otherwise the result of
 * vhost_dev_reset_owner().  Socket file references are dropped after
 * dev.mutex has been released. */
static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *tx_sock = NULL;
	struct socket *rx_sock = NULL;
	long err;

	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (!err) {
		vhost_net_stop(n, &tx_sock, &rx_sock);
		vhost_net_flush(n);
		err = vhost_dev_reset_owner(&n->dev);
	}
	mutex_unlock(&n->dev.mutex);

	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	return err;
}

/* VHOST_SET_FEATURES: record the acknowledged feature bits and set the
 * per-queue header size that goes with them.
 *
 * hdr_size is sizeof(struct virtio_net_hdr) when
 * VHOST_NET_F_VIRTIO_NET_HDR is requested and 0 otherwise.
 *
 * Returns 0 on success, or -EFAULT when VHOST_F_LOG_ALL is requested but
 * vhost_log_access_ok() fails. */
static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	size_t hdr_size = 0;
	int i;

	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR))
		hdr_size = sizeof(struct virtio_net_hdr);

	mutex_lock(&n->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev)) {
		mutex_unlock(&n->dev.mutex);
		return -EFAULT;
	}

	n->dev.acked_features = features;
	/* Write barrier between the feature bits and the per-queue header
	 * size updates below. */
	smp_wmb();
	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		struct vhost_virtqueue *vq = &n->vqs[i];

		mutex_lock(&vq->mutex);
		vq->hdr_size = hdr_size;
		mutex_unlock(&vq->mutex);
	}
	vhost_net_flush(n);
	mutex_unlock(&n->dev.mutex);
	return 0;
}

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;
	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		r = copy_from_user(&backend, argp, sizeof backend);
		if (r < 0)
			return r;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_FEATURES;
		return copy_to_user(featurep, &features, sizeof features);
	case VHOST_SET_FEATURES:
		r = copy_from_user(&features, featurep, sizeof features);
		if (r < 0)
			return r;
		if (features & ~VHOST_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
		vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}

#ifdef CONFIG_COMPAT
/* compat ioctl() entry point: converts the argument with compat_ptr()
 * and forwards to vhost_net_ioctl(). */
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vhost_net_ioctl(f, ioctl, native_arg);
}
#endif

/* File operations of the vhost-net character device. */
static const struct file_operations vhost_net_fops = {
	.owner          = THIS_MODULE,
	.open           = vhost_net_open,
	.release        = vhost_net_release,
	.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = vhost_net_compat_ioctl,
#endif
};

static struct miscdevice vhost_net_misc = {
	VHOST_NET_MINOR,
	"vhost-net",
	&vhost_net_fops,
};

/* Module load: call vhost_init(), then register the misc device.
 * If registration fails, vhost_cleanup() is called before returning.
 * Returns 0 on success or the error of the step that failed. */
int vhost_net_init(void)
{
	int r;

	r = vhost_init();
	if (r)
		return r;

	r = misc_register(&vhost_net_misc);
	if (r) {
		vhost_cleanup();
		return r;
	}

	return 0;
}
module_init(vhost_net_init);

/* Module unload: deregister the misc device, then call vhost_cleanup(),
 * i.e. the reverse of the order used in vhost_net_init(). */
void vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
	vhost_cleanup();
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
Commit	Line	Data
3a4d5c94 MT	1	/* Copyright (C) 2009 Red Hat, Inc.
	2	* Author: Michael S. Tsirkin <mst@redhat.com>
	3	*
	4	* This work is licensed under the terms of the GNU GPL, version 2.
	5	*
	6	* virtio-net server in host kernel.
	7	*/
	8
	9	#include <linux/compat.h>
	10	#include <linux/eventfd.h>
	11	#include <linux/vhost.h>
	12	#include <linux/virtio_net.h>
	13	#include <linux/mmu_context.h>
	14	#include <linux/miscdevice.h>
	15	#include <linux/module.h>
	16	#include <linux/mutex.h>
	17	#include <linux/workqueue.h>
	18	#include <linux/rcupdate.h>
	19	#include <linux/file.h>
	20
	21	#include <linux/net.h>
	22	#include <linux/if_packet.h>
	23	#include <linux/if_arp.h>
	24	#include <linux/if_tun.h>
	25
	26	#include <net/sock.h>
	27
	28	#include "vhost.h"
	29
	30	/* Max number of bytes transferred before requeueing the job.
	31	* Using this limit prevents one virtqueue from starving others. */
	32	#define VHOST_NET_WEIGHT 0x80000
	33
	34	enum {
	35	VHOST_NET_VQ_RX = 0,
	36	VHOST_NET_VQ_TX = 1,
	37	VHOST_NET_VQ_MAX = 2,
	38	};
	39
	40	enum vhost_net_poll_state {
	41	VHOST_NET_POLL_DISABLED = 0,
	42	VHOST_NET_POLL_STARTED = 1,
	43	VHOST_NET_POLL_STOPPED = 2,
	44	};
	45
	46	struct vhost_net {
	47	struct vhost_dev dev;
	48	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	49	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	50	/* Tells us whether we are polling a socket for TX.
	51	* We only do this when socket buffer fills up.
	52	* Protected by tx vq lock. */
	53	enum vhost_net_poll_state tx_poll_state;
	54	};
	55
	56	/* Pop first len bytes from iovec. Return number of segments used. */
	57	static int move_iovec_hdr(struct iovec from, struct iovec to,
	58	size_t len, int iov_count)
	59	{
	60	int seg = 0;
	61	size_t size;
	62	while (len && seg < iov_count) {
	63	size = min(from->iov_len, len);
	64	to->iov_base = from->iov_base;
65	to->iov_len = size;
66	from->iov_len -= size;
67	from->iov_base += size;
68	len -= size;
69	++from;
70	++to;
71	++seg;
72	}
73	return seg;
74	}
75
76	/* Caller must have TX VQ lock */
77	static void tx_poll_stop(struct vhost_net *net)
78	{
79	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
80	return;
81	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
82	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
83	}
84
85	/* Caller must have TX VQ lock */
86	static void tx_poll_start(struct vhost_net net, struct socket sock)
87	{
88	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
89	return;
90	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
91	net->tx_poll_state = VHOST_NET_POLL_STARTED;
92	}
93
94	/* Expects to be always run from workqueue - which acts as
95	* read-size critical section for our kind of RCU. */
96	static void handle_tx(struct vhost_net *net)
97	{
98	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
99	unsigned head, out, in, s;
100	struct msghdr msg = {
101	.msg_name = NULL,
102	.msg_namelen = 0,
103	.msg_control = NULL,
104	.msg_controllen = 0,
105	.msg_iov = vq->iov,
106	.msg_flags = MSG_DONTWAIT,
107	};
108	size_t len, total_len = 0;
109	int err, wmem;
110	size_t hdr_size;
111	struct socket *sock = rcu_dereference(vq->private_data);
112	if (!sock)
113	return;
114
115	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
116	if (wmem >= sock->sk->sk_sndbuf)
117	return;
118
119	use_mm(net->dev.mm);
120	mutex_lock(&vq->mutex);
121	vhost_disable_notify(vq);
122
123	if (wmem < sock->sk->sk_sndbuf * 2)
124	tx_poll_stop(net);
125	hdr_size = vq->hdr_size;
126
127	for (;;) {
128	head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
129	ARRAY_SIZE(vq->iov),
130	&out, &in,
131	NULL, NULL);
132	/* Nothing new? Wait for eventfd to tell us they refilled. */
133	if (head == vq->num) {
134	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
135	if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
136	tx_poll_start(net, sock);
137	set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
138	break;
139	}
140	if (unlikely(vhost_enable_notify(vq))) {
141	vhost_disable_notify(vq);
142	continue;
143	}
144	break;
145	}
146	if (in) {
147	vq_err(vq, "Unexpected descriptor format for TX: "
148	"out %d, int %d\n", out, in);
149	break;
150	}
151	/* Skip header. TODO: support TSO. */
152	s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
153	msg.msg_iovlen = out;
154	len = iov_length(vq->iov, out);
155	/* Sanity check */
156	if (!len) {
157	vq_err(vq, "Unexpected header len for TX: "
158	"%zd expected %zd\n",
159	iov_length(vq->hdr, s), hdr_size);
160	break;
161	}
162	/* TODO: Check specific error and bomb out unless ENOBUFS? */
163	err = sock->ops->sendmsg(NULL, sock, &msg, len);
164	if (unlikely(err < 0)) {
165	vhost_discard_vq_desc(vq);
166	tx_poll_start(net, sock);
167	break;
168	}
169	if (err != len)
170	pr_err("Truncated TX packet: "
171	" len %d != %zd\n", err, len);
172	vhost_add_used_and_signal(&net->dev, vq, head, 0);
173	total_len += len;
174	if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
175	vhost_poll_queue(&vq->poll);
176	break;
177	}
178	}
179
180	mutex_unlock(&vq->mutex);
181	unuse_mm(net->dev.mm);
182	}
183
184	/* Expects to be always run from workqueue - which acts as
185	* read-size critical section for our kind of RCU. */
186	static void handle_rx(struct vhost_net *net)
187	{
188	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
189	unsigned head, out, in, log, s;
190	struct vhost_log *vq_log;
191	struct msghdr msg = {
192	.msg_name = NULL,
193	.msg_namelen = 0,
194	.msg_control = NULL, /* FIXME: get and handle RX aux data. */
195	.msg_controllen = 0,
196	.msg_iov = vq->iov,
197	.msg_flags = MSG_DONTWAIT,
198	};
199
200	struct virtio_net_hdr hdr = {
201	.flags = 0,
202	.gso_type = VIRTIO_NET_HDR_GSO_NONE
203	};
204
205	size_t len, total_len = 0;
206	int err;
207	size_t hdr_size;
208	struct socket *sock = rcu_dereference(vq->private_data);
209	if (!sock \|\| skb_queue_empty(&sock->sk->sk_receive_queue))
210	return;
211
212	use_mm(net->dev.mm);
213	mutex_lock(&vq->mutex);
214	vhost_disable_notify(vq);
215	hdr_size = vq->hdr_size;
216
217	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
218	vq->log : NULL;
219
220	for (;;) {
221	head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
222	ARRAY_SIZE(vq->iov),
223	&out, &in,
224	vq_log, &log);
225	/* OK, now we need to know about added descriptors. */
226	if (head == vq->num) {
227	if (unlikely(vhost_enable_notify(vq))) {
228	/* They have slipped one in as we were
229	* doing that: check again. */
230	vhost_disable_notify(vq);
231	continue;
232	}
233	/* Nothing new? Wait for eventfd to tell us
234	* they refilled. */
235	break;
236	}
237	/* We don't need to be notified again. */
238	if (out) {
239	vq_err(vq, "Unexpected descriptor format for RX: "
240	"out %d, int %d\n",
241	out, in);
242	break;
243	}
244	/* Skip header. TODO: support TSO/mergeable rx buffers. */
245	s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
246	msg.msg_iovlen = in;
247	len = iov_length(vq->iov, in);
248	/* Sanity check */
249	if (!len) {
250	vq_err(vq, "Unexpected header len for RX: "
251	"%zd expected %zd\n",
252	iov_length(vq->hdr, s), hdr_size);
253	break;
254	}
255	err = sock->ops->recvmsg(NULL, sock, &msg,
256	len, MSG_DONTWAIT \| MSG_TRUNC);
257	/* TODO: Check specific error and bomb out unless EAGAIN? */
258	if (err < 0) {
259	vhost_discard_vq_desc(vq);
260	break;
261	}
262	/* TODO: Should check and handle checksum. */
263	if (err > len) {
264	pr_err("Discarded truncated rx packet: "
265	" len %d > %zd\n", err, len);
266	vhost_discard_vq_desc(vq);
267	continue;
268	}
269	len = err;
270	err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
271	if (err) {
272	vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
273	vq->iov->iov_base, err);
274	break;
275	}
276	len += hdr_size;
277	vhost_add_used_and_signal(&net->dev, vq, head, len);
278	if (unlikely(vq_log))
279	vhost_log_write(vq, vq_log, log, len);
280	total_len += len;
281	if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
282	vhost_poll_queue(&vq->poll);
283	break;
284	}
285	}
286
287	mutex_unlock(&vq->mutex);
288	unuse_mm(net->dev.mm);
289	}
290
291	static void handle_tx_kick(struct work_struct *work)
292	{
293	struct vhost_virtqueue *vq;
294	struct vhost_net *net;
295	vq = container_of(work, struct vhost_virtqueue, poll.work);
296	net = container_of(vq->dev, struct vhost_net, dev);
297	handle_tx(net);
298	}
299
300	static void handle_rx_kick(struct work_struct *work)
301	{
302	struct vhost_virtqueue *vq;
303	struct vhost_net *net;
304	vq = container_of(work, struct vhost_virtqueue, poll.work);
305	net = container_of(vq->dev, struct vhost_net, dev);
306	handle_rx(net);
307	}
308
309	static void handle_tx_net(struct work_struct *work)
310	{
311	struct vhost_net *net;
312	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
313	handle_tx(net);
314	}
315
316	static void handle_rx_net(struct work_struct *work)
317	{
318	struct vhost_net *net;
319	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
320	handle_rx(net);
321	}
322
323	static int vhost_net_open(struct inode inode, struct file f)
324	{
325	struct vhost_net n = kmalloc(sizeof n, GFP_KERNEL);
326	int r;
327	if (!n)
328	return -ENOMEM;
329	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
330	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
331	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
332	if (r < 0) {
333	kfree(n);
334	return r;
335	}
336
337	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
338	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
339	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
340
341	f->private_data = n;
342
343	return 0;
344	}
345
346	static void vhost_net_disable_vq(struct vhost_net *n,
347	struct vhost_virtqueue *vq)
348	{
349	if (!vq->private_data)
350	return;
351	if (vq == n->vqs + VHOST_NET_VQ_TX) {
352	tx_poll_stop(n);
353	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
354	} else
355	vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
356	}
357
358	static void vhost_net_enable_vq(struct vhost_net *n,
359	struct vhost_virtqueue *vq)
360	{
361	struct socket *sock = vq->private_data;
362	if (!sock)
363	return;
364	if (vq == n->vqs + VHOST_NET_VQ_TX) {
365	n->tx_poll_state = VHOST_NET_POLL_STOPPED;
366	tx_poll_start(n, sock);
367	} else
368	vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
369	}
370
371	static struct socket vhost_net_stop_vq(struct vhost_net n,
372	struct vhost_virtqueue *vq)
373	{
374	struct socket *sock;
375
376	mutex_lock(&vq->mutex);
377	sock = vq->private_data;
378	vhost_net_disable_vq(n, vq);
379	rcu_assign_pointer(vq->private_data, NULL);
380	mutex_unlock(&vq->mutex);
381	return sock;
382	}
383
384	static void vhost_net_stop(struct vhost_net n, struct socket *tx_sock,
385	struct socket **rx_sock)
386	{
387	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
388	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
389	}
390
391	static void vhost_net_flush_vq(struct vhost_net *n, int index)
392	{
393	vhost_poll_flush(n->poll + index);
394	vhost_poll_flush(&n->dev.vqs[index].poll);
395	}
396
397	static void vhost_net_flush(struct vhost_net *n)
398	{
399	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
400	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
401	}
402
403	static int vhost_net_release(struct inode inode, struct file f)
404	{
405	struct vhost_net *n = f->private_data;
406	struct socket *tx_sock;
407	struct socket *rx_sock;
408
409	vhost_net_stop(n, &tx_sock, &rx_sock);
410	vhost_net_flush(n);
411	vhost_dev_cleanup(&n->dev);
412	if (tx_sock)
413	fput(tx_sock->file);
414	if (rx_sock)
415	fput(rx_sock->file);
416	/* We do an extra flush before freeing memory,
417	* since jobs can re-queue themselves. */
418	vhost_net_flush(n);
419	kfree(n);
420	return 0;
421	}
422
423	static struct socket *get_raw_socket(int fd)
424	{
425	struct {
426	struct sockaddr_ll sa;
427	char buf[MAX_ADDR_LEN];
428	} uaddr;
429	int uaddr_len = sizeof uaddr, r;
430	struct socket *sock = sockfd_lookup(fd, &r);
431	if (!sock)
432	return ERR_PTR(-ENOTSOCK);
433
434	/* Parameter checking */
435	if (sock->sk->sk_type != SOCK_RAW) {
436	r = -ESOCKTNOSUPPORT;
437	goto err;
438	}
439
440	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
441	&uaddr_len, 0);
442	if (r)
443	goto err;
444
445	if (uaddr.sa.sll_family != AF_PACKET) {
446	r = -EPFNOSUPPORT;
447	goto err;
448	}
449	return sock;
450	err:
451	fput(sock->file);
452	return ERR_PTR(r);
453	}
454
455	static struct socket *get_tun_socket(int fd)
456	{
457	struct file *file = fget(fd);
458	struct socket *sock;
459	if (!file)
460	return ERR_PTR(-EBADF);
461	sock = tun_get_socket(file);
462	if (IS_ERR(sock))
463	fput(file);
464	return sock;
465	}
466
467	static struct socket *get_socket(int fd)
468	{
469	struct socket *sock;
470	/* special case to disable backend */
471	if (fd == -1)
472	return NULL;
473	sock = get_raw_socket(fd);
474	if (!IS_ERR(sock))
475	return sock;
476	sock = get_tun_socket(fd);
477	if (!IS_ERR(sock))
478	return sock;
479	return ERR_PTR(-ENOTSOCK);
480	}
481
482	static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
483	{
484	struct socket sock, oldsock;
485	struct vhost_virtqueue *vq;
486	int r;
487
488	mutex_lock(&n->dev.mutex);
489	r = vhost_dev_check_owner(&n->dev);
490	if (r)
491	goto err;
492
493	if (index >= VHOST_NET_VQ_MAX) {
494	r = -ENOBUFS;
495	goto err;
496	}
497	vq = n->vqs + index;
498	mutex_lock(&vq->mutex);
499
500	/* Verify that ring has been setup correctly. */
501	if (!vhost_vq_access_ok(vq)) {
502	r = -EFAULT;
503	goto err;
504	}
505	sock = get_socket(fd);
506	if (IS_ERR(sock)) {
507	r = PTR_ERR(sock);
508	goto err;
509	}
510
511	/* start polling new socket */
512	oldsock = vq->private_data;
513	if (sock == oldsock)
514	goto done;
515
516	vhost_net_disable_vq(n, vq);
517	rcu_assign_pointer(vq->private_data, sock);
518	vhost_net_enable_vq(n, vq);
519	mutex_unlock(&vq->mutex);
520	done:
521	if (oldsock) {
522	vhost_net_flush_vq(n, index);
523	fput(oldsock->file);
524	}
525	err:
526	mutex_unlock(&n->dev.mutex);
527	return r;
528	}
529
530	static long vhost_net_reset_owner(struct vhost_net *n)
531	{
532	struct socket *tx_sock = NULL;
533	struct socket *rx_sock = NULL;
534	long err;
535	mutex_lock(&n->dev.mutex);
536	err = vhost_dev_check_owner(&n->dev);
537	if (err)
538	goto done;
539	vhost_net_stop(n, &tx_sock, &rx_sock);
540	vhost_net_flush(n);
541	err = vhost_dev_reset_owner(&n->dev);
542	done:
543	mutex_unlock(&n->dev.mutex);
544	if (tx_sock)
545	fput(tx_sock->file);
546	if (rx_sock)
547	fput(rx_sock->file);
548	return err;
549	}
550
551	static int vhost_net_set_features(struct vhost_net *n, u64 features)
552	{
553	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
554	sizeof(struct virtio_net_hdr) : 0;
555	int i;
556	mutex_lock(&n->dev.mutex);
557	if ((features & (1 << VHOST_F_LOG_ALL)) &&
558	!vhost_log_access_ok(&n->dev)) {
559	mutex_unlock(&n->dev.mutex);
560	return -EFAULT;
561	}
562	n->dev.acked_features = features;
563	smp_wmb();
564	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
565	mutex_lock(&n->vqs[i].mutex);
566	n->vqs[i].hdr_size = hdr_size;
567	mutex_unlock(&n->vqs[i].mutex);
568	}
569	vhost_net_flush(n);
570	mutex_unlock(&n->dev.mutex);
571	return 0;
572	}
573
574	static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
575	unsigned long arg)
576	{
577	struct vhost_net *n = f->private_data;
578	void __user argp = (void __user )arg;
579	u64 __user *featurep = argp;
580	struct vhost_vring_file backend;
581	u64 features;
582	int r;
583	switch (ioctl) {
584	case VHOST_NET_SET_BACKEND:
585	r = copy_from_user(&backend, argp, sizeof backend);
586	if (r < 0)
587	return r;
588	return vhost_net_set_backend(n, backend.index, backend.fd);
589	case VHOST_GET_FEATURES:
590	features = VHOST_FEATURES;
591	return copy_to_user(featurep, &features, sizeof features);
592	case VHOST_SET_FEATURES:
593	r = copy_from_user(&features, featurep, sizeof features);
594	if (r < 0)
595	return r;
596	if (features & ~VHOST_FEATURES)
597	return -EOPNOTSUPP;
598	return vhost_net_set_features(n, features);
599	case VHOST_RESET_OWNER:
600	return vhost_net_reset_owner(n);
601	default:
602	mutex_lock(&n->dev.mutex);
603	r = vhost_dev_ioctl(&n->dev, ioctl, arg);
604	vhost_net_flush(n);
605	mutex_unlock(&n->dev.mutex);
606	return r;
607	}
608	}
609
610	#ifdef CONFIG_COMPAT
611	static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
612	unsigned long arg)
613	{
614	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
615	}
616	#endif
617
618	const static struct file_operations vhost_net_fops = {
619	.owner = THIS_MODULE,
620	.release = vhost_net_release,
621	.unlocked_ioctl = vhost_net_ioctl,
622	#ifdef CONFIG_COMPAT
623	.compat_ioctl = vhost_net_compat_ioctl,
624	#endif
625	.open = vhost_net_open,
626	};
627
628	static struct miscdevice vhost_net_misc = {
629	VHOST_NET_MINOR,
630	"vhost-net",
631	&vhost_net_fops,
632	};
633
634	int vhost_net_init(void)
635	{
636	int r = vhost_init();
637	if (r)
638	goto err_init;
639	r = misc_register(&vhost_net_misc);
640	if (r)
641	goto err_reg;
642	return 0;
643	err_reg:
644	vhost_cleanup();
645	err_init:
646	return r;
647
648	}
649	module_init(vhost_net_init);
650
651	void vhost_net_exit(void)
652	{
653	misc_deregister(&vhost_net_misc);
654	vhost_cleanup();
655	}
656	module_exit(vhost_net_exit);
657
658	MODULE_VERSION("0.0.1");
659	MODULE_LICENSE("GPL v2");
660	MODULE_AUTHOR("Michael S. Tsirkin");
661	MODULE_DESCRIPTION("Host kernel accelerator for virtio net");