/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/skbuff.h>

#include <net/sock.h>
#include <net/xdp.h>

#include "vhost.h"

static int experimental_zcopytx = 1;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
		 " 1 -Enable; 0 - Disable");

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256

/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256

/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)

#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)

enum {
	VHOST_NET_FEATURES = VHOST_FEATURES |
			     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
			     (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			     (1ULL << VIRTIO_F_IOMMU_PLATFORM)
};

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

struct vhost_net_ubuf_ref {
	/* refcount follows semantics similar to kref:
	 *  0: object is released
	 *  1: no outstanding ubufs
	 * >1: outstanding ubufs
	 */
	atomic_t refcount;
	wait_queue_head_t wait;
	struct vhost_virtqueue *vq;
};

#define VHOST_RX_BATCH 64
struct vhost_net_buf {
	void **queue;
	int tail;
	int head;
};

struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;
	size_t vhost_hlen;
	size_t sock_hlen;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* For TX, first used idx for DMA done zerocopy buffers
	 * For RX, number of batched heads
	 */
	int done_idx;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex. */
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
};

struct vhost_net {
	struct vhost_dev dev;
	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Number of TX recently submitted.
	 * Protected by tx vq lock. */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock. */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
};

static unsigned vhost_net_zcopy_mask __read_mostly;

static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
{
	if (rxq->tail != rxq->head)
		return rxq->queue[rxq->head];
	else
		return NULL;
}

static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
{
	return rxq->tail - rxq->head;
}

static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
{
	return rxq->tail == rxq->head;
}

static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
{
	void *ret = vhost_net_buf_get_ptr(rxq);
	++rxq->head;
	return ret;
}

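/* Refill the local rx batch queue from the backend ptr_ring.
 * Returns the number of pointers fetched (the new tail), 0 if the ring
 * was empty.
 */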
static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	rxq->head = 0;
	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
					     VHOST_RX_BATCH);
	return rxq->tail;
}

static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
				   vhost_net_buf_get_size(rxq),
				   tun_ptr_free);
		rxq->head = rxq->tail = 0;
	}
}

static int vhost_net_buf_peek_len(void *ptr)
{
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		return xdpf->len;
	}

	return __skb_array_len_with_tag(ptr);
}

static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (!vhost_net_buf_is_empty(rxq))
		goto out;

	if (!vhost_net_buf_produce(nvq))
		return 0;

out:
	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
}

static void vhost_net_buf_init(struct vhost_net_buf *rxq)
{
	rxq->head = rxq->tail = 0;
}

static void vhost_net_enable_zcopy(int vq)
{
	vhost_net_zcopy_mask |= 0x1 << vq;
}

static struct vhost_net_ubuf_ref *
vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
{
	struct vhost_net_ubuf_ref *ubufs;
	/* No zero copy backend? Nothing to count. */
	if (!zcopy)
		return NULL;
	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
	if (!ubufs)
		return ERR_PTR(-ENOMEM);
	atomic_set(&ubufs->refcount, 1);
	init_waitqueue_head(&ubufs->wait);
	ubufs->vq = vq;
	return ubufs;
}

static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
{
	int r = atomic_sub_return(1, &ubufs->refcount);
	if (unlikely(!r))
		wake_up(&ubufs->wait);
	return r;
}

static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put(ubufs);
	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
}

static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put_and_wait(ubufs);
	kfree(ubufs);
}

static void vhost_net_clear_ubuf_info(struct vhost_net *n)
{
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		kfree(n->vqs[i].ubuf_info);
		n->vqs[i].ubuf_info = NULL;
	}
}

static int vhost_net_set_ubuf_info(struct vhost_net *n)
{
	bool zcopy;
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		zcopy = vhost_net_zcopy_mask & (0x1 << i);
		if (!zcopy)
			continue;
		n->vqs[i].ubuf_info =
			kmalloc_array(UIO_MAXIOV,
				      sizeof(*n->vqs[i].ubuf_info),
				      GFP_KERNEL);
		if (!n->vqs[i].ubuf_info)
			goto err;
	}
	return 0;

err:
	vhost_net_clear_ubuf_info(n);
	return -ENOMEM;
}

static void vhost_net_vq_reset(struct vhost_net *n)
{
	int i;

	vhost_net_clear_ubuf_info(n);

	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].done_idx = 0;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].ubufs = NULL;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}

}

static void vhost_net_tx_packet(struct vhost_net *net)
{
	++net->tx_packets;
	if (net->tx_packets < 1024)
		return;
	net->tx_packets = 0;
	net->tx_zcopy_err = 0;
}

static void vhost_net_tx_err(struct vhost_net *net)
{
	++net->tx_zcopy_err;
}

static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
	/* TX flush waits for outstanding DMAs to be done.
	 * Don't start new DMAs.
	 */
	return !net->tx_flush &&
		net->tx_packets / 64 >= net->tx_zcopy_err;
}

static bool vhost_sock_zcopy(struct socket *sock)
{
	return unlikely(experimental_zcopytx) &&
		sock_flag(sock->sk, SOCK_ZEROCOPY);
}

/* In case of DMA done not in order in lower device driver for some reason.
 * upend_idx is used to track end of used idx, done_idx is used to track head
 * of used idx. Once lower device DMA done contiguously, we will signal KVM
 * guest used idx.
 */
static void vhost_zerocopy_signal_used(struct vhost_net *net,
				       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	int i, add;
	int j = 0;

	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			++j;
		} else
			break;
	}
	while (j) {
		add = min(UIO_MAXIOV - nvq->done_idx, j);
		vhost_add_used_and_signal_n(vq->dev, vq,
					    &vq->heads[nvq->done_idx], add);
		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
		j -= add;
	}
}

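/* Zerocopy completion callback from the lower device: record DMA
 * done/failed status for the descriptor and drop the ubuf reference.
 */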
static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
	int cnt;

	rcu_read_lock_bh();

	/* set len to mark this desc buffers done DMA */
	vq->heads[ubuf->desc].len = success ?
		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
	cnt = vhost_net_ubuf_put(ubufs);

	/*
	 * Trigger polling thread if guest stopped submitting new buffers:
	 * in this case, the refcount after decrement will eventually reach 1.
	 * We also trigger polling periodically after each 16 packets
	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
	 * less than 10% of times).
	 */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);

	rcu_read_unlock_bh();
}

static inline unsigned long busy_clock(void)
{
	return local_clock() >> 10;
}

static bool vhost_can_busy_poll(struct vhost_dev *dev,
				unsigned long endtime)
{
	return likely(!need_resched()) &&
	       likely(!time_after(busy_clock(), endtime)) &&
	       likely(!signal_pending(current)) &&
	       !vhost_has_work(dev);
}

static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vq->private_data)
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
			       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vq->private_data;
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

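/* Fetch the next TX descriptor. If the ring is empty and busy polling is
 * enabled, spin for up to vq->busyloop_timeout before retrying once.
 */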
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
				    struct vhost_virtqueue *vq,
				    struct iovec iov[], unsigned int iov_size,
				    unsigned int *out_num, unsigned int *in_num)
{
	unsigned long uninitialized_var(endtime);
	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
				  out_num, in_num, NULL, NULL);

	if (r == vq->num && vq->busyloop_timeout) {
		preempt_disable();
		endtime = busy_clock() + vq->busyloop_timeout;
		while (vhost_can_busy_poll(vq->dev, endtime) &&
		       vhost_vq_avail_empty(vq->dev, vq))
			cpu_relax();
		preempt_enable();
		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
				      out_num, in_num, NULL, NULL);
	}

	return r;
}

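/* Limit outstanding zerocopy DMAs to a quarter of the ring size,
 * capped at VHOST_MAX_PEND.
 */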
static bool vhost_exceeds_maxpend(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;

	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}

/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
	bool zcopy, zcopy_used;
	int sent_pkts = 0;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_iotlb_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	hdr_size = nvq->vhost_hlen;
	zcopy = nvq->ubufs;

	for (;;) {
		/* Release DMAs done buffers first */
		if (zcopy)
			vhost_zerocopy_signal_used(net, vq);


		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
						ARRAY_SIZE(vq->iov),
						&out, &in);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, int %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		len = iov_length(vq->iov, out);
		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
		iov_iter_advance(&msg.msg_iter, hdr_size);
		/* Sanity check */
		if (!msg_data_left(&msg)) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       len, hdr_size);
			break;
		}
		len = msg_data_left(&msg);

		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
				   && !vhost_exceeds_maxpend(net)
				   && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			struct ubuf_info *ubuf;
			ubuf = nvq->ubuf_info + nvq->upend_idx;

			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->callback = vhost_zerocopy_callback;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			refcount_set(&ubuf->refcnt, 1);
			msg.msg_control = ubuf;
			msg.msg_controllen = sizeof(ubuf);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}

		total_len += len;
		if (total_len < VHOST_NET_WEIGHT &&
		    !vhost_vq_avail_empty(&net->dev, vq) &&
		    likely(!vhost_exceeds_maxpend(net))) {
			msg.msg_flags |= MSG_MORE;
		} else {
			msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				vhost_net_ubuf_put(ubufs);
				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
					% UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		vhost_net_tx_packet(net);
		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
		    unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
out:
	mutex_unlock(&vq->mutex);
}

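/* Peek the length of the next pending rx packet: from the batched
 * ptr_ring when the backend provides one, otherwise from the socket
 * receive queue (accounting for a possible VLAN tag).
 */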
static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	if (rvq->rx_ring)
		return vhost_net_buf_peek(rvq);

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

static int sk_has_rx_data(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	return skb_queue_empty(&sk->sk_receive_queue);
}

static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_dev *dev = vq->dev;

	if (!nvq->done_idx)
		return;

	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
	nvq->done_idx = 0;
}

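/* Like peek_head_len(), but busy polls the tx virtqueue and the rx socket
 * for up to vq->busyloop_timeout when no rx data is pending yet.
 */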
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned long uninitialized_var(endtime);
	int len = peek_head_len(rvq, sk);

	if (!len && vq->busyloop_timeout) {
		/* Flush batched heads first */
		vhost_rx_signal_used(rvq);
		/* Both tx vq and rx socket were polled here */
		mutex_lock_nested(&vq->mutex, 1);
		vhost_disable_notify(&net->dev, vq);

		preempt_disable();
		endtime = busy_clock() + vq->busyloop_timeout;

		while (vhost_can_busy_poll(&net->dev, endtime) &&
		       !sk_has_rx_data(sk) &&
		       vhost_vq_avail_empty(&net->dev, vq))
			cpu_relax();

		preempt_enable();

		if (!vhost_vq_avail_empty(&net->dev, vq))
			vhost_poll_queue(&vq->poll);
		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
			vhost_disable_notify(&net->dev, vq);
			vhost_poll_queue(&vq->poll);
		}

		mutex_unlock(&vq->mutex);

		len = peek_head_len(rvq, sk);
	}

	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 * vq has read descriptors only.
 * @vq - the relevant virtqueue
 * @datalen - data length we'll be reading
 * @iovcount - returned count of io vectors we fill
 * @log - vhost log
 * @log_num - log offset
 * @quota - headcount quota, 1 for big buffer
 * returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 uninitialized_var(len);

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
			       "out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = cpu_to_vhost32(vq, d);
		len = iov_length(vq->iov + seg, in);
		heads[headcount].len = cpu_to_vhost32(vq, len);
		datalen -= len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}

/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned uninitialized_var(in), log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	struct socket *sock;
	struct iov_iter fixup;
	__virtio16 num_buffers;
	int recv_pkts = 0;

	mutex_lock_nested(&vq->mutex, 0);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_iotlb_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	vhost_hlen = nvq->vhost_hlen;
	sock_hlen = nvq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
					vhost_len, &in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			goto out;
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new? Wait for eventfd to tell us
			 * they refilled. */
			goto out;
		}
		if (nvq->rx_ring)
			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
		/* On overrun, truncate and discard */
		if (unlikely(headcount > UIO_MAXIOV)) {
			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
			err = sock->ops->recvmsg(sock, &msg,
						 1, MSG_DONTWAIT | MSG_TRUNC);
			pr_debug("Discarded rx packet: len %zd\n", sock_len);
			continue;
		}
		/* We don't need to be notified again. */
		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
		fixup = msg.msg_iter;
		if (unlikely((vhost_hlen))) {
			/* We will supply the header ourselves
			 * TODO: support TSO.
			 */
			iov_iter_advance(&msg.msg_iter, vhost_hlen);
		}
		err = sock->ops->recvmsg(sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
		if (unlikely(vhost_hlen)) {
			if (copy_to_iter(&hdr, sizeof(hdr),
					 &fixup) != sizeof(hdr)) {
				vq_err(vq, "Unable to write vnet_hdr "
				       "at addr %p\n", vq->iov->iov_base);
				goto out;
			}
		} else {
			/* Header came from socket; we'll need to patch
			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
			 */
			iov_iter_advance(&fixup, sizeof(hdr));
		}
		/* TODO: Should check and handle checksum. */

		num_buffers = cpu_to_vhost16(vq, headcount);
		if (likely(mergeable) &&
		    copy_to_iter(&num_buffers, sizeof num_buffers,
				 &fixup) != sizeof num_buffers) {
			vq_err(vq, "Failed num_buffers write");
			vhost_discard_vq_desc(vq, headcount);
			goto out;
		}
		nvq->done_idx += headcount;
		if (nvq->done_idx > VHOST_RX_BATCH)
			vhost_rx_signal_used(nvq);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, vhost_len);
		total_len += vhost_len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
		    unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			goto out;
		}
	}
	vhost_net_enable_vq(net, vq);
out:
	vhost_rx_signal_used(nvq);
	mutex_unlock(&vq->mutex);
}

static void handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

static void handle_tx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

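/* Open of /dev/vhost-net: allocate the vhost_net device, its two
 * virtqueues and the rx batch queue, then register the poll handlers.
 */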
static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	void **queue;
	int i;

	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!n)
		return -ENOMEM;
	vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		kvfree(n);
		return -ENOMEM;
	}

	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
			      GFP_KERNEL);
	if (!queue) {
		kfree(vqs);
		kvfree(n);
		return -ENOMEM;
	}
	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;

	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].ubufs = NULL;
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		n->vqs[i].rx_ring = NULL;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}
	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);

	f->private_data = n;

	return 0;
}

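/* Detach and return the socket backing @vq, stopping polling and
 * returning any batched rx buffers to the underlying ring.
 */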
static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	vhost_net_disable_vq(n, vq);
	vq->private_data = NULL;
	vhost_net_buf_unproduce(nvq);
	nvq->rx_ring = NULL;
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->vqs[index].vq.poll);
}

static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = true;
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		/* Wait for all lower device DMAs done. */
		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = false;
		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
	}
}

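/* Release of /dev/vhost-net: detach both backends, wait for outstanding
 * work and zerocopy DMAs, then free the device.
 */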
static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_cleanup(&n->dev);
	vhost_net_vq_reset(n);
	if (tx_sock)
		sockfd_put(tx_sock);
	if (rx_sock)
		sockfd_put(rx_sock);
	/* Make sure no callbacks are outstanding */
	synchronize_rcu_bh();
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
	kfree(n->dev.vqs);
	kvfree(n);
	return 0;
}

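/* Validate that @fd refers to an AF_PACKET SOCK_RAW socket and return it. */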
static struct socket *get_raw_socket(int fd)
{
	struct {
		struct sockaddr_ll sa;
		char buf[MAX_ADDR_LEN];
	} uaddr;
	int r;
	struct socket *sock = sockfd_lookup(fd, &r);

	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		r = -ESOCKTNOSUPPORT;
		goto err;
	}

	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 0);
	if (r < 0)
		goto err;

	if (uaddr.sa.sll_family != AF_PACKET) {
		r = -EPFNOSUPPORT;
		goto err;
	}
	return sock;
err:
	sockfd_put(sock);
	return ERR_PTR(r);
}

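/* Return the ptr_ring backing a tun or tap file, or NULL if @fd is
 * neither.
 */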
static struct ptr_ring *get_tap_ptr_ring(int fd)
{
	struct ptr_ring *ring;
	struct file *file = fget(fd);

	if (!file)
		return NULL;
	ring = tun_get_tx_ring(file);
	if (!IS_ERR(ring))
		goto out;
	ring = tap_get_ptr_ring(file);
	if (!IS_ERR(ring))
		goto out;
	ring = NULL;
out:
	fput(file);
	return ring;
}

static struct socket *get_tap_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;

	if (!file)
		return ERR_PTR(-EBADF);
	sock = tun_get_socket(file);
	if (!IS_ERR(sock))
		return sock;
	sock = tap_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	return sock;
}

static struct socket *get_socket(int fd)
{
	struct socket *sock;

	/* special case to disable backend */
	if (fd == -1)
		return NULL;
	sock = get_raw_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	sock = get_tap_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	return ERR_PTR(-ENOTSOCK);
}

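/* VHOST_NET_SET_BACKEND: attach the socket behind @fd (tun/tap or raw
 * packet socket) to virtqueue @index, tearing down any previous backend.
 */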
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	struct vhost_net_virtqueue *nvq;
	struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = &n->vqs[index].vq;
	nvq = &n->vqs[index];
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* start polling new socket */
	oldsock = vq->private_data;
	if (sock != oldsock) {
		ubufs = vhost_net_ubuf_alloc(vq,
					     sock && vhost_sock_zcopy(sock));
		if (IS_ERR(ubufs)) {
			r = PTR_ERR(ubufs);
			goto err_ubufs;
		}

		vhost_net_disable_vq(n, vq);
		vq->private_data = sock;
		vhost_net_buf_unproduce(nvq);
		r = vhost_vq_init_access(vq);
		if (r)
			goto err_used;
		r = vhost_net_enable_vq(n, vq);
		if (r)
			goto err_used;
		if (index == VHOST_NET_VQ_RX)
			nvq->rx_ring = get_tap_ptr_ring(fd);

		oldubufs = nvq->ubufs;
		nvq->ubufs = ubufs;

		n->tx_packets = 0;
		n->tx_zcopy_err = 0;
		n->tx_flush = false;
	}

	mutex_unlock(&vq->mutex);

	if (oldubufs) {
		vhost_net_ubuf_put_wait_and_free(oldubufs);
		mutex_lock(&vq->mutex);
		vhost_zerocopy_signal_used(n, vq);
		mutex_unlock(&vq->mutex);
	}

	if (oldsock) {
		vhost_net_flush_vq(n, index);
		sockfd_put(oldsock);
	}

	mutex_unlock(&n->dev.mutex);
	return 0;

err_used:
	vq->private_data = oldsock;
	vhost_net_enable_vq(n, vq);
	if (ubufs)
		vhost_net_ubuf_put_wait_and_free(ubufs);
err_ubufs:
	sockfd_put(sock);
err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

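/* VHOST_RESET_OWNER: drop both backends and return the device to its
 * pre-owner state.
 */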
static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *tx_sock = NULL;
	struct socket *rx_sock = NULL;
	long err;
	struct vhost_umem *umem;

	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (err)
		goto done;
	umem = vhost_dev_reset_owner_prepare();
	if (!umem) {
		err = -ENOMEM;
		goto done;
	}
	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_reset_owner(&n->dev, umem);
	vhost_net_vq_reset(n);
done:
	mutex_unlock(&n->dev.mutex);
	if (tx_sock)
		sockfd_put(tx_sock);
	if (rx_sock)
		sockfd_put(rx_sock);
	return err;
}

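/* VHOST_SET_FEATURES: recompute the vnet header layout (who supplies the
 * virtio_net_hdr and how large it is) and propagate the acked features to
 * both virtqueues.
 */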
static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	size_t vhost_hlen, sock_hlen, hdr_len;
	int i;

	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			       (1ULL << VIRTIO_F_VERSION_1))) ?
			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
			sizeof(struct virtio_net_hdr);
	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
		/* vhost provides vnet_hdr */
		vhost_hlen = hdr_len;
		sock_hlen = 0;
	} else {
		/* socket provides vnet_hdr */
		vhost_hlen = 0;
		sock_hlen = hdr_len;
	}
	mutex_lock(&n->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev))
		goto out_unlock;

	if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
		if (vhost_init_device_iotlb(&n->dev, true))
			goto out_unlock;
	}

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		mutex_lock(&n->vqs[i].vq.mutex);
		n->vqs[i].vq.acked_features = features;
		n->vqs[i].vhost_hlen = vhost_hlen;
		n->vqs[i].sock_hlen = sock_hlen;
		mutex_unlock(&n->vqs[i].vq.mutex);
	}
	mutex_unlock(&n->dev.mutex);
	return 0;

out_unlock:
	mutex_unlock(&n->dev.mutex);
	return -EFAULT;
}

static long vhost_net_set_owner(struct vhost_net *n)
{
	int r;

	mutex_lock(&n->dev.mutex);
	if (vhost_dev_has_owner(&n->dev)) {
		r = -EBUSY;
		goto out;
	}
	r = vhost_net_set_ubuf_info(n);
	if (r)
		goto out;
	r = vhost_dev_set_owner(&n->dev);
	if (r)
		vhost_net_clear_ubuf_info(n);
	vhost_net_flush(n);
out:
	mutex_unlock(&n->dev.mutex);
	return r;
}

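/* Ioctl dispatcher for /dev/vhost-net: net-specific requests are handled
 * here, everything else is forwarded to the generic vhost layer.
 */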
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;

	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		if (copy_from_user(&backend, argp, sizeof backend))
			return -EFAULT;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_NET_FEATURES;
		if (copy_to_user(featurep, &features, sizeof features))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, featurep, sizeof features))
			return -EFAULT;
		if (features & ~VHOST_NET_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	case VHOST_SET_OWNER:
		return vhost_net_set_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
		else
			vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}

#ifdef CONFIG_COMPAT
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
}
#endif

static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;
	int noblock = file->f_flags & O_NONBLOCK;

	return vhost_chr_read_iter(dev, to, noblock);
}

static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;

	return vhost_chr_write_iter(dev, from);
}

static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait)
{
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;

	return vhost_chr_poll(file, dev, wait);
}

static const struct file_operations vhost_net_fops = {
	.owner = THIS_MODULE,
	.release = vhost_net_release,
	.read_iter = vhost_net_chr_read_iter,
	.write_iter = vhost_net_chr_write_iter,
	.poll = vhost_net_chr_poll,
	.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vhost_net_compat_ioctl,
#endif
	.open = vhost_net_open,
	.llseek = noop_llseek,
};

static struct miscdevice vhost_net_misc = {
	.minor = VHOST_NET_MINOR,
	.name = "vhost-net",
	.fops = &vhost_net_fops,
};

static int vhost_net_init(void)
{
	if (experimental_zcopytx)
		vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
	return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");