1/*
2 * Vhost User library
3 *
4 * Copyright IBM, Corp. 2007
5 * Copyright (c) 2016 Red Hat, Inc.
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 * Marc-André Lureau <mlureau@redhat.com>
10 * Victor Kaplansky <victork@redhat.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2 or
13 * later. See the COPYING file in the top-level directory.
14 */
15
16/* this code avoids GLib dependency */
17#include <stdlib.h>
18#include <stdio.h>
19#include <unistd.h>
20#include <stdarg.h>
21#include <errno.h>
22#include <string.h>
23#include <assert.h>
24#include <inttypes.h>
25#include <sys/types.h>
26#include <sys/socket.h>
27#include <sys/eventfd.h>
28#include <sys/mman.h>
29#include "qemu/compiler.h"
30
31#if defined(__linux__)
32#include <sys/syscall.h>
33#include <fcntl.h>
34#include <sys/ioctl.h>
35#include <linux/vhost.h>
36
37#ifdef __NR_userfaultfd
38#include <linux/userfaultfd.h>
39#endif
40
41#endif
42
43#include "qemu/atomic.h"
44
45#include "libvhost-user.h"
46
47/* usually provided by GLib */
48#ifndef MIN
49#define MIN(x, y) ({ \
50 typeof(x) _min1 = (x); \
51 typeof(y) _min2 = (y); \
52 (void) (&_min1 == &_min2); \
53 _min1 < _min2 ? _min1 : _min2; })
54#endif
55
56#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
57
58/* The version of the protocol we support */
59#define VHOST_USER_VERSION 1
60#define LIBVHOST_USER_DEBUG 0
61
62#define DPRINT(...) \
63 do { \
64 if (LIBVHOST_USER_DEBUG) { \
65 fprintf(stderr, __VA_ARGS__); \
66 } \
67 } while (0)
68
69static const char *
70vu_request_to_string(unsigned int req)
71{
72#define REQ(req) [req] = #req
73 static const char *vu_request_str[] = {
74 REQ(VHOST_USER_NONE),
75 REQ(VHOST_USER_GET_FEATURES),
76 REQ(VHOST_USER_SET_FEATURES),
77 REQ(VHOST_USER_SET_OWNER),
78 REQ(VHOST_USER_RESET_OWNER),
79 REQ(VHOST_USER_SET_MEM_TABLE),
80 REQ(VHOST_USER_SET_LOG_BASE),
81 REQ(VHOST_USER_SET_LOG_FD),
82 REQ(VHOST_USER_SET_VRING_NUM),
83 REQ(VHOST_USER_SET_VRING_ADDR),
84 REQ(VHOST_USER_SET_VRING_BASE),
85 REQ(VHOST_USER_GET_VRING_BASE),
86 REQ(VHOST_USER_SET_VRING_KICK),
87 REQ(VHOST_USER_SET_VRING_CALL),
88 REQ(VHOST_USER_SET_VRING_ERR),
89 REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
90 REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
91 REQ(VHOST_USER_GET_QUEUE_NUM),
92 REQ(VHOST_USER_SET_VRING_ENABLE),
93 REQ(VHOST_USER_SEND_RARP),
94 REQ(VHOST_USER_NET_SET_MTU),
95 REQ(VHOST_USER_SET_SLAVE_REQ_FD),
96 REQ(VHOST_USER_IOTLB_MSG),
97 REQ(VHOST_USER_SET_VRING_ENDIAN),
98 REQ(VHOST_USER_GET_CONFIG),
99 REQ(VHOST_USER_SET_CONFIG),
100 REQ(VHOST_USER_POSTCOPY_ADVISE),
101 REQ(VHOST_USER_POSTCOPY_LISTEN),
102 REQ(VHOST_USER_MAX),
103 };
104#undef REQ
105
106 if (req < VHOST_USER_MAX) {
107 return vu_request_str[req];
108 } else {
109 return "unknown";
110 }
111}
112
113static void
114vu_panic(VuDev *dev, const char *msg, ...)
115{
116 char *buf = NULL;
117 va_list ap;
118
119 va_start(ap, msg);
120 if (vasprintf(&buf, msg, ap) < 0) {
121 buf = NULL;
122 }
123 va_end(ap);
124
125 dev->broken = true;
126 dev->panic(dev, buf);
127 free(buf);
128
129 /* FIXME: find a way to call virtio_error? */
130}
131
132/* Translate guest physical address to our virtual address. */
133void *
134vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
135{
136 int i;
137
138 if (*plen == 0) {
139 return NULL;
140 }
141
142 /* Find matching memory region. */
143 for (i = 0; i < dev->nregions; i++) {
144 VuDevRegion *r = &dev->regions[i];
145
146 if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
147 if ((guest_addr + *plen) > (r->gpa + r->size)) {
148 *plen = r->gpa + r->size - guest_addr;
149 }
150 return (void *)(uintptr_t)
151 guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
152 }
153 }
154
155 return NULL;
156}
157
158/* Translate qemu virtual address to our virtual address. */
159static void *
160qva_to_va(VuDev *dev, uint64_t qemu_addr)
161{
162 int i;
163
164 /* Find matching memory region. */
165 for (i = 0; i < dev->nregions; i++) {
166 VuDevRegion *r = &dev->regions[i];
167
168 if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
169 return (void *)(uintptr_t)
170 qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
171 }
172 }
173
174 return NULL;
175}
176
177static void
178vmsg_close_fds(VhostUserMsg *vmsg)
179{
180 int i;
181
182 for (i = 0; i < vmsg->fd_num; i++) {
183 close(vmsg->fds[i]);
184 }
185}
186
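/*
 * Read one vhost-user message from conn_fd: first the fixed-size header,
 * collecting any file descriptors passed as SCM_RIGHTS ancillary data,
 * then the variable-sized payload. Returns false on error; on a payload
 * read failure any received fds are closed.
 */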
187static bool
188vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
189{
190 char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
191 struct iovec iov = {
192 .iov_base = (char *)vmsg,
193 .iov_len = VHOST_USER_HDR_SIZE,
194 };
195 struct msghdr msg = {
196 .msg_iov = &iov,
197 .msg_iovlen = 1,
198 .msg_control = control,
199 .msg_controllen = sizeof(control),
200 };
201 size_t fd_size;
202 struct cmsghdr *cmsg;
203 int rc;
204
205 do {
206 rc = recvmsg(conn_fd, &msg, 0);
207 } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
208
209 if (rc < 0) {
210 vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
211 return false;
212 }
213
214 vmsg->fd_num = 0;
215 for (cmsg = CMSG_FIRSTHDR(&msg);
216 cmsg != NULL;
217 cmsg = CMSG_NXTHDR(&msg, cmsg))
218 {
219 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
220 fd_size = cmsg->cmsg_len - CMSG_LEN(0);
221 vmsg->fd_num = fd_size / sizeof(int);
222 memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
223 break;
224 }
225 }
226
227 if (vmsg->size > sizeof(vmsg->payload)) {
228 vu_panic(dev,
229 "Error: too big message request: %d, size: vmsg->size: %u, "
230 "while sizeof(vmsg->payload) = %zu\n",
231 vmsg->request, vmsg->size, sizeof(vmsg->payload));
232 goto fail;
233 }
234
235 if (vmsg->size) {
236 do {
237 rc = read(conn_fd, &vmsg->payload, vmsg->size);
238 } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
239
240 if (rc <= 0) {
241 vu_panic(dev, "Error while reading: %s", strerror(errno));
242 goto fail;
243 }
244
245 assert(rc == vmsg->size);
246 }
247
248 return true;
249
250fail:
251 vmsg_close_fds(vmsg);
252
253 return false;
254}
255
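/*
 * Send a reply on conn_fd: the header (plus any fds as SCM_RIGHTS ancillary
 * data) goes out with sendmsg(), followed by the payload. The version and
 * reply bits are set in the flags here before sending.
 */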
256static bool
257vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
258{
259 int rc;
260 uint8_t *p = (uint8_t *)vmsg;
261 char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
262 struct iovec iov = {
263 .iov_base = (char *)vmsg,
264 .iov_len = VHOST_USER_HDR_SIZE,
265 };
266 struct msghdr msg = {
267 .msg_iov = &iov,
268 .msg_iovlen = 1,
269 .msg_control = control,
270 };
271 struct cmsghdr *cmsg;
272
273 memset(control, 0, sizeof(control));
274 assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
275 if (vmsg->fd_num > 0) {
276 size_t fdsize = vmsg->fd_num * sizeof(int);
277 msg.msg_controllen = CMSG_SPACE(fdsize);
278 cmsg = CMSG_FIRSTHDR(&msg);
279 cmsg->cmsg_len = CMSG_LEN(fdsize);
280 cmsg->cmsg_level = SOL_SOCKET;
281 cmsg->cmsg_type = SCM_RIGHTS;
282 memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
283 } else {
284 msg.msg_controllen = 0;
285 }
286
287 /* Set the version in the flags when sending the reply */
288 vmsg->flags &= ~VHOST_USER_VERSION_MASK;
289 vmsg->flags |= VHOST_USER_VERSION;
290 vmsg->flags |= VHOST_USER_REPLY_MASK;
291
292 do {
293 rc = sendmsg(conn_fd, &msg, 0);
294 } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
295
296 do {
297 if (vmsg->data) {
298 rc = write(conn_fd, vmsg->data, vmsg->size);
299 } else {
300 rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
301 }
302 } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
303
304 if (rc <= 0 && vmsg->size) { /* a zero-sized payload legitimately writes 0 bytes */
305 vu_panic(dev, "Error while writing: %s", strerror(errno));
306 return false;
307 }
308
309 return true;
310}
311
312/* Kick the log_call_fd if required. */
313static void
314vu_log_kick(VuDev *dev)
315{
316 if (dev->log_call_fd != -1) {
317 DPRINT("Kicking the QEMU's log...\n");
318 if (eventfd_write(dev->log_call_fd, 1) < 0) {
319 vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
320 }
321 }
322}
323
324static void
325vu_log_page(uint8_t *log_table, uint64_t page)
326{
327 DPRINT("Logged dirty guest page: %"PRId64"\n", page);
328 atomic_or(&log_table[page / 8], 1 << (page % 8));
329}
330
331static void
332vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
333{
334 uint64_t page;
335
336 if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
337 !dev->log_table || !length) {
338 return;
339 }
340
341 assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
342
343 page = address / VHOST_LOG_PAGE;
344 while (page * VHOST_LOG_PAGE < address + length) {
345 vu_log_page(dev->log_table, page);
346 page += 1;
347 }
348
349 vu_log_kick(dev);
350}
351
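/* Event-loop callback for a virtqueue's kick eventfd: drain the eventfd and
 * invoke the queue's handler, if one has been registered. */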
352static void
353vu_kick_cb(VuDev *dev, int condition, void *data)
354{
355 int index = (intptr_t)data;
356 VuVirtq *vq = &dev->vq[index];
357 int sock = vq->kick_fd;
358 eventfd_t kick_data;
359 ssize_t rc;
360
361 rc = eventfd_read(sock, &kick_data);
362 if (rc == -1) {
363 vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
364 dev->remove_watch(dev, dev->vq[index].kick_fd);
365 } else {
366 DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
367 kick_data, vq->handler, index);
368 if (vq->handler) {
369 vq->handler(dev, index);
370 }
371 }
372}
373
374static bool
375vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
376{
377 vmsg->payload.u64 =
378 1ULL << VHOST_F_LOG_ALL |
379 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
380
381 if (dev->iface->get_features) {
382 vmsg->payload.u64 |= dev->iface->get_features(dev);
383 }
384
385 vmsg->size = sizeof(vmsg->payload.u64);
386 vmsg->fd_num = 0;
387
388 DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
389
390 return true;
391}
392
393static void
394vu_set_enable_all_rings(VuDev *dev, bool enabled)
395{
396 int i;
397
398 for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
399 dev->vq[i].enable = enabled;
400 }
401}
402
403static bool
404vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
405{
406 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
407
408 dev->features = vmsg->payload.u64;
409
410 if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
411 vu_set_enable_all_rings(dev, true);
412 }
413
414 if (dev->iface->set_features) {
415 dev->iface->set_features(dev, dev->features);
416 }
417
418 return false;
419}
420
421static bool
422vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
423{
424 return false;
425}
426
427static void
428vu_close_log(VuDev *dev)
429{
430 if (dev->log_table) {
431 if (munmap(dev->log_table, dev->log_size) != 0) {
432 perror("close log munmap() error");
433 }
434
435 dev->log_table = NULL;
436 }
437 if (dev->log_call_fd != -1) {
438 close(dev->log_call_fd);
439 dev->log_call_fd = -1;
440 }
441}
442
443static bool
444vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
445{
446 vu_set_enable_all_rings(dev, false);
447
448 return false;
449}
450
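/*
 * Postcopy variant of SET_MEM_TABLE: mmap each region, send the mapped
 * addresses back to QEMU so it can translate userfaultfd fault addresses,
 * wait for QEMU's ack, and then register every mapping with the userfaultfd
 * obtained at POSTCOPY_ADVISE time.
 */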
451static bool
452vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
453{
454 int i;
455 VhostUserMemory *memory = &vmsg->payload.memory;
456 dev->nregions = memory->nregions;
457 /* TODO: Postcopy specific code */
458 DPRINT("Nregions: %d\n", memory->nregions);
459 for (i = 0; i < dev->nregions; i++) {
460 void *mmap_addr;
461 VhostUserMemoryRegion *msg_region = &memory->regions[i];
462 VuDevRegion *dev_region = &dev->regions[i];
463
464 DPRINT("Region %d\n", i);
465 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
466 msg_region->guest_phys_addr);
467 DPRINT(" memory_size: 0x%016"PRIx64"\n",
468 msg_region->memory_size);
469 DPRINT(" userspace_addr 0x%016"PRIx64"\n",
470 msg_region->userspace_addr);
471 DPRINT(" mmap_offset 0x%016"PRIx64"\n",
472 msg_region->mmap_offset);
473
474 dev_region->gpa = msg_region->guest_phys_addr;
475 dev_region->size = msg_region->memory_size;
476 dev_region->qva = msg_region->userspace_addr;
477 dev_region->mmap_offset = msg_region->mmap_offset;
478
479 /* We don't use offset argument of mmap() since the
480 * mapped address has to be page aligned, and we use huge
481 * pages. */
482 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
483 PROT_READ | PROT_WRITE, MAP_SHARED,
484 vmsg->fds[i], 0);
485
486 if (mmap_addr == MAP_FAILED) {
487 vu_panic(dev, "region mmap error: %s", strerror(errno));
488 } else {
489 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
490 DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
491 dev_region->mmap_addr);
492 }
493
494 /* Return the address to QEMU so that it can translate the ufd
495 * fault addresses back.
496 */
497 msg_region->userspace_addr = (uintptr_t)(mmap_addr +
498 dev_region->mmap_offset);
499 close(vmsg->fds[i]);
500 }
501
502 /* Send the message back to qemu with the addresses filled in */
503 vmsg->fd_num = 0;
504 if (!vu_message_write(dev, dev->sock, vmsg)) {
505 vu_panic(dev, "failed to respond to set-mem-table for postcopy");
506 return false;
507 }
508
509 /* Wait for QEMU to confirm that it's registered the handler for the
510 * faults.
511 */
512 if (!vu_message_read(dev, dev->sock, vmsg) ||
513 vmsg->size != sizeof(vmsg->payload.u64) ||
514 vmsg->payload.u64 != 0) {
515 vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
516 return false;
517 }
518
519 /* OK, now we can go and register the memory and generate faults */
520 for (i = 0; i < dev->nregions; i++) {
521 VuDevRegion *dev_region = &dev->regions[i];
522#ifdef UFFDIO_REGISTER
523 /* We should already have an open ufd. Mark each memory
524 * range as ufd.
525 * Note: Do we need any madvises? Well it's not been accessed
526 * yet, still probably need no THP to be safe, discard to be safe?
527 */
528 struct uffdio_register reg_struct;
529 reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
530 reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
531 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
532
533 if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
534 vu_panic(dev, "%s: Failed to userfault region %d "
535 "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
536 __func__, i,
537 dev_region->mmap_addr,
538 dev_region->size, dev_region->mmap_offset,
539 dev->postcopy_ufd, strerror(errno));
540 return false;
541 }
542 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
543 vu_panic(dev, "%s Region (%d) doesn't support COPY",
544 __func__, i);
545 return false;
546 }
547 DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
548 __func__, i, reg_struct.range.start, reg_struct.range.len);
549 /* TODO: Stash 'zero' support flags somewhere */
550#endif
551 }
552
553 return false;
554}
555
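/*
 * Handle SET_MEM_TABLE: unmap any regions from a previous table, then mmap
 * each region described in the message using the fd passed alongside it.
 * When postcopy listening is active the postcopy variant above is used.
 */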
556static bool
557vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
558{
559 int i;
560 VhostUserMemory *memory = &vmsg->payload.memory;
561
562 for (i = 0; i < dev->nregions; i++) {
563 VuDevRegion *r = &dev->regions[i];
564 void *m = (void *) (uintptr_t) r->mmap_addr;
565
566 if (m) {
567 munmap(m, r->size + r->mmap_offset);
568 }
569 }
570 dev->nregions = memory->nregions;
571
572 if (dev->postcopy_listening) {
573 return vu_set_mem_table_exec_postcopy(dev, vmsg);
574 }
575
576 DPRINT("Nregions: %d\n", memory->nregions);
577 for (i = 0; i < dev->nregions; i++) {
578 void *mmap_addr;
579 VhostUserMemoryRegion *msg_region = &memory->regions[i];
580 VuDevRegion *dev_region = &dev->regions[i];
581
582 DPRINT("Region %d\n", i);
583 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
584 msg_region->guest_phys_addr);
585 DPRINT(" memory_size: 0x%016"PRIx64"\n",
586 msg_region->memory_size);
587 DPRINT(" userspace_addr 0x%016"PRIx64"\n",
588 msg_region->userspace_addr);
589 DPRINT(" mmap_offset 0x%016"PRIx64"\n",
590 msg_region->mmap_offset);
591
592 dev_region->gpa = msg_region->guest_phys_addr;
593 dev_region->size = msg_region->memory_size;
594 dev_region->qva = msg_region->userspace_addr;
595 dev_region->mmap_offset = msg_region->mmap_offset;
596
597 /* We don't use offset argument of mmap() since the
598 * mapped address has to be page aligned, and we use huge
599 * pages. */
600 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
601 PROT_READ | PROT_WRITE, MAP_SHARED,
602 vmsg->fds[i], 0);
603
604 if (mmap_addr == MAP_FAILED) {
605 vu_panic(dev, "region mmap error: %s", strerror(errno));
606 } else {
607 dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
608 DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
609 dev_region->mmap_addr);
610 }
611
612 close(vmsg->fds[i]);
613 }
614
615 return false;
616}
617
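/* Map the dirty-log shared memory passed by QEMU, replacing any previous
 * log mapping, and reply so QEMU knows the log is in place. */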
618static bool
619vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
620{
621 int fd;
622 uint64_t log_mmap_size, log_mmap_offset;
623 void *rc;
624
625 if (vmsg->fd_num != 1 ||
626 vmsg->size != sizeof(vmsg->payload.log)) {
627 vu_panic(dev, "Invalid log_base message");
628 return true;
629 }
630
631 fd = vmsg->fds[0];
632 log_mmap_offset = vmsg->payload.log.mmap_offset;
633 log_mmap_size = vmsg->payload.log.mmap_size;
634 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
635 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
636
637 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
638 log_mmap_offset);
639 close(fd);
640 if (rc == MAP_FAILED) {
641 perror("log mmap error");
642 }
643
644 if (dev->log_table) {
645 munmap(dev->log_table, dev->log_size);
646 }
647 dev->log_table = rc;
648 dev->log_size = log_mmap_size;
649
650 vmsg->size = sizeof(vmsg->payload.u64);
651 vmsg->fd_num = 0;
652
653 return true;
654}
655
656static bool
657vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
658{
659 if (vmsg->fd_num != 1) {
660 vu_panic(dev, "Invalid log_fd message");
661 return false;
662 }
663
664 if (dev->log_call_fd != -1) {
665 close(dev->log_call_fd);
666 }
667 dev->log_call_fd = vmsg->fds[0];
668 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
669
670 return false;
671}
672
673static bool
674vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
675{
676 unsigned int index = vmsg->payload.state.index;
677 unsigned int num = vmsg->payload.state.num;
678
679 DPRINT("State.index: %d\n", index);
680 DPRINT("State.num: %d\n", num);
681 dev->vq[index].vring.num = num;
682
683 return false;
684}
685
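/* Translate the QEMU virtual addresses of the desc/used/avail rings into our
 * own mappings and record them for the selected virtqueue. */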
686static bool
687vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
688{
689 struct vhost_vring_addr *vra = &vmsg->payload.addr;
690 unsigned int index = vra->index;
691 VuVirtq *vq = &dev->vq[index];
692
693 DPRINT("vhost_vring_addr:\n");
694 DPRINT(" index: %d\n", vra->index);
695 DPRINT(" flags: %d\n", vra->flags);
696 DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
697 DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
698 DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
699 DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);
700
701 vq->vring.flags = vra->flags;
702 vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
703 vq->vring.used = qva_to_va(dev, vra->used_user_addr);
704 vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
705 vq->vring.log_guest_addr = vra->log_guest_addr;
706
707 DPRINT("Setting virtq addresses:\n");
708 DPRINT(" vring_desc at %p\n", vq->vring.desc);
709 DPRINT(" vring_used at %p\n", vq->vring.used);
710 DPRINT(" vring_avail at %p\n", vq->vring.avail);
711
712 if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
713 vu_panic(dev, "Invalid vring_addr message");
714 return false;
715 }
716
717 vq->used_idx = vq->vring.used->idx;
718
719 if (vq->last_avail_idx != vq->used_idx) {
720 bool resume = dev->iface->queue_is_processed_in_order &&
721 dev->iface->queue_is_processed_in_order(dev, index);
722
723 DPRINT("Last avail index != used index: %u != %u%s\n",
724 vq->last_avail_idx, vq->used_idx,
725 resume ? ", resuming" : "");
726
727 if (resume) {
728 vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
729 }
730 }
731
732 return false;
733}
734
735static bool
736vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
737{
738 unsigned int index = vmsg->payload.state.index;
739 unsigned int num = vmsg->payload.state.num;
740
741 DPRINT("State.index: %d\n", index);
742 DPRINT("State.num: %d\n", num);
743 dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;
744
745 return false;
746}
747
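/* Report last_avail_idx back to QEMU and stop the queue, closing its call and
 * kick fds; QEMU issues this when it stops the ring. */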
748static bool
749vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
750{
751 unsigned int index = vmsg->payload.state.index;
752
753 DPRINT("State.index: %d\n", index);
754 vmsg->payload.state.num = dev->vq[index].last_avail_idx;
755 vmsg->size = sizeof(vmsg->payload.state);
756
757 dev->vq[index].started = false;
758 if (dev->iface->queue_set_started) {
759 dev->iface->queue_set_started(dev, index, false);
760 }
761
762 if (dev->vq[index].call_fd != -1) {
763 close(dev->vq[index].call_fd);
764 dev->vq[index].call_fd = -1;
765 }
766 if (dev->vq[index].kick_fd != -1) {
767 dev->remove_watch(dev, dev->vq[index].kick_fd);
768 close(dev->vq[index].kick_fd);
769 dev->vq[index].kick_fd = -1;
770 }
771
772 return true;
773}
774
775static bool
776vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
777{
778 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
779
780 if (index >= VHOST_MAX_NR_VIRTQUEUE) {
781 vmsg_close_fds(vmsg);
782 vu_panic(dev, "Invalid queue index: %u", index);
783 return false;
784 }
785
786 if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK ||
787 vmsg->fd_num != 1) {
788 vmsg_close_fds(vmsg);
789 vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
790 return false;
791 }
792
793 return true;
794}
795
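/* Install the kick eventfd for a virtqueue, mark the queue started, and start
 * watching the fd if a handler has been registered. */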
796static bool
797vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
798{
799 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
800
801 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
802
803 if (!vu_check_queue_msg_file(dev, vmsg)) {
804 return false;
805 }
806
807 if (dev->vq[index].kick_fd != -1) {
808 dev->remove_watch(dev, dev->vq[index].kick_fd);
809 close(dev->vq[index].kick_fd);
810 dev->vq[index].kick_fd = -1;
811 }
812
813 if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
814 dev->vq[index].kick_fd = vmsg->fds[0];
815 DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
816 }
817
818 dev->vq[index].started = true;
819 if (dev->iface->queue_set_started) {
820 dev->iface->queue_set_started(dev, index, true);
821 }
822
823 if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
824 dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
825 vu_kick_cb, (void *)(long)index);
826
827 DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
828 dev->vq[index].kick_fd, index);
829 }
830
831 return false;
832}
833
834void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
835 vu_queue_handler_cb handler)
836{
837 int qidx = vq - dev->vq;
838
839 vq->handler = handler;
840 if (vq->kick_fd >= 0) {
841 if (handler) {
842 dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
843 vu_kick_cb, (void *)(long)qidx);
844 } else {
845 dev->remove_watch(dev, vq->kick_fd);
846 }
847 }
848}
849
850static bool
851vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
852{
853 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
854
855 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
856
857 if (!vu_check_queue_msg_file(dev, vmsg)) {
858 return false;
859 }
860
861 if (dev->vq[index].call_fd != -1) {
862 close(dev->vq[index].call_fd);
863 dev->vq[index].call_fd = -1;
864 }
865
866 if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
867 dev->vq[index].call_fd = vmsg->fds[0];
868 }
869
870 DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
871
872 return false;
873}
874
875static bool
876vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
877{
878 int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
879
880 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
881
882 if (!vu_check_queue_msg_file(dev, vmsg)) {
883 return false;
884 }
885
886 if (dev->vq[index].err_fd != -1) {
887 close(dev->vq[index].err_fd);
888 dev->vq[index].err_fd = -1;
889 }
890
891 if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
892 dev->vq[index].err_fd = vmsg->fds[0];
893 }
894
895 return false;
896}
897
898static bool
899vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
900{
901 uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
902 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ;
903
904 if (dev->iface->get_protocol_features) {
905 features |= dev->iface->get_protocol_features(dev);
906 }
907
908 vmsg->payload.u64 = features;
909 vmsg->size = sizeof(vmsg->payload.u64);
910 vmsg->fd_num = 0;
911
912 return true;
913}
914
915static bool
916vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
917{
918 uint64_t features = vmsg->payload.u64;
919
920 DPRINT("u64: 0x%016"PRIx64"\n", features);
921
922 dev->protocol_features = vmsg->payload.u64;
923
924 if (dev->iface->set_protocol_features) {
925 dev->iface->set_protocol_features(dev, features);
926 }
927
928 return false;
929}
930
931static bool
932vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
933{
934 DPRINT("Function %s() not implemented yet.\n", __func__);
935 return false;
936}
937
938static bool
939vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
940{
941 unsigned int index = vmsg->payload.state.index;
942 unsigned int enable = vmsg->payload.state.num;
943
944 DPRINT("State.index: %d\n", index);
945 DPRINT("State.enable: %d\n", enable);
946
947 if (index >= VHOST_MAX_NR_VIRTQUEUE) {
948 vu_panic(dev, "Invalid vring_enable index: %u", index);
949 return false;
950 }
951
952 dev->vq[index].enable = enable;
953 return false;
954}
955
956static bool
957vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
958{
959 if (vmsg->fd_num != 1) {
960 vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num);
961 return false;
962 }
963
964 if (dev->slave_fd != -1) {
965 close(dev->slave_fd);
966 }
967 dev->slave_fd = vmsg->fds[0];
968 DPRINT("Got slave_fd: %d\n", vmsg->fds[0]);
969
970 return false;
971}
972
973static bool
974vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
975{
976 int ret = -1;
977
978 if (dev->iface->get_config) {
979 ret = dev->iface->get_config(dev, vmsg->payload.config.region,
980 vmsg->payload.config.size);
981 }
982
983 if (ret) {
984 /* resize to zero to indicate an error to master */
985 vmsg->size = 0;
986 }
987
988 return true;
989}
990
991static bool
992vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
993{
994 int ret = -1;
995
996 if (dev->iface->set_config) {
997 ret = dev->iface->set_config(dev, vmsg->payload.config.region,
998 vmsg->payload.config.offset,
999 vmsg->payload.config.size,
1000 vmsg->payload.config.flags);
1001 if (ret) {
1002 vu_panic(dev, "Set virtio configuration space failed");
1003 }
1004 }
1005
1006 return false;
1007}
1008
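/* POSTCOPY_ADVISE: open a userfaultfd, negotiate the UFFD API, and return the
 * fd to QEMU so it can watch for page faults on our mappings. */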
1009static bool
1010vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
1011{
1012 dev->postcopy_ufd = -1;
1013#ifdef UFFDIO_API
1014 struct uffdio_api api_struct;
1015
1016 dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1017 vmsg->size = 0;
1018#endif
1019
1020 if (dev->postcopy_ufd == -1) {
1021 vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
1022 goto out;
1023 }
1024
1025#ifdef UFFDIO_API
1026 api_struct.api = UFFD_API;
1027 api_struct.features = 0;
1028 if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
1029 vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
1030 close(dev->postcopy_ufd);
1031 dev->postcopy_ufd = -1;
1032 goto out;
1033 }
1034 /* TODO: Stash feature flags somewhere */
1035#endif
1036
1037out:
1038 /* Return a ufd to the QEMU */
1039 vmsg->fd_num = 1;
1040 vmsg->fds[0] = dev->postcopy_ufd;
1041 return true; /* = send a reply */
1042}
1043
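/* POSTCOPY_LISTEN: must arrive before any memory regions are registered; from
 * here on SET_MEM_TABLE takes the postcopy path above. */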
1044static bool
1045vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
1046{
1047 vmsg->payload.u64 = -1;
1048 vmsg->size = sizeof(vmsg->payload.u64);
1049
1050 if (dev->nregions) {
1051 vu_panic(dev, "Regions already registered at postcopy-listen");
1052 return true;
1053 }
1054 dev->postcopy_listening = true;
1055
1056 vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
1057 vmsg->payload.u64 = 0; /* Success */
1058 return true;
1059}
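
/* Dispatch one decoded message, first to the device's process_msg hook and
 * otherwise to the generic handlers below. Returns true if a reply should be
 * sent back to the master. */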
1060static bool
1061vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
1062{
1063 int do_reply = 0;
1064
1065 /* Print out generic part of the request. */
1066 DPRINT("================ Vhost user message ================\n");
1067 DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
1068 vmsg->request);
1069 DPRINT("Flags: 0x%x\n", vmsg->flags);
1070 DPRINT("Size: %d\n", vmsg->size);
1071
1072 if (vmsg->fd_num) {
1073 int i;
1074 DPRINT("Fds:");
1075 for (i = 0; i < vmsg->fd_num; i++) {
1076 DPRINT(" %d", vmsg->fds[i]);
1077 }
1078 DPRINT("\n");
1079 }
1080
1081 if (dev->iface->process_msg &&
1082 dev->iface->process_msg(dev, vmsg, &do_reply)) {
1083 return do_reply;
1084 }
1085
1086 switch (vmsg->request) {
1087 case VHOST_USER_GET_FEATURES:
1088 return vu_get_features_exec(dev, vmsg);
1089 case VHOST_USER_SET_FEATURES:
1090 return vu_set_features_exec(dev, vmsg);
1091 case VHOST_USER_GET_PROTOCOL_FEATURES:
1092 return vu_get_protocol_features_exec(dev, vmsg);
1093 case VHOST_USER_SET_PROTOCOL_FEATURES:
1094 return vu_set_protocol_features_exec(dev, vmsg);
1095 case VHOST_USER_SET_OWNER:
1096 return vu_set_owner_exec(dev, vmsg);
1097 case VHOST_USER_RESET_OWNER:
1098 return vu_reset_device_exec(dev, vmsg);
1099 case VHOST_USER_SET_MEM_TABLE:
1100 return vu_set_mem_table_exec(dev, vmsg);
1101 case VHOST_USER_SET_LOG_BASE:
1102 return vu_set_log_base_exec(dev, vmsg);
1103 case VHOST_USER_SET_LOG_FD:
1104 return vu_set_log_fd_exec(dev, vmsg);
1105 case VHOST_USER_SET_VRING_NUM:
1106 return vu_set_vring_num_exec(dev, vmsg);
1107 case VHOST_USER_SET_VRING_ADDR:
1108 return vu_set_vring_addr_exec(dev, vmsg);
1109 case VHOST_USER_SET_VRING_BASE:
1110 return vu_set_vring_base_exec(dev, vmsg);
1111 case VHOST_USER_GET_VRING_BASE:
1112 return vu_get_vring_base_exec(dev, vmsg);
1113 case VHOST_USER_SET_VRING_KICK:
1114 return vu_set_vring_kick_exec(dev, vmsg);
1115 case VHOST_USER_SET_VRING_CALL:
1116 return vu_set_vring_call_exec(dev, vmsg);
1117 case VHOST_USER_SET_VRING_ERR:
1118 return vu_set_vring_err_exec(dev, vmsg);
1119 case VHOST_USER_GET_QUEUE_NUM:
1120 return vu_get_queue_num_exec(dev, vmsg);
1121 case VHOST_USER_SET_VRING_ENABLE:
1122 return vu_set_vring_enable_exec(dev, vmsg);
1123 case VHOST_USER_SET_SLAVE_REQ_FD:
1124 return vu_set_slave_req_fd(dev, vmsg);
1125 case VHOST_USER_GET_CONFIG:
1126 return vu_get_config(dev, vmsg);
1127 case VHOST_USER_SET_CONFIG:
1128 return vu_set_config(dev, vmsg);
1129 case VHOST_USER_NONE:
1130 break;
1131 case VHOST_USER_POSTCOPY_ADVISE:
1132 return vu_set_postcopy_advise(dev, vmsg);
1133 case VHOST_USER_POSTCOPY_LISTEN:
1134 return vu_set_postcopy_listen(dev, vmsg);
1135 default:
1136 vmsg_close_fds(vmsg);
1137 vu_panic(dev, "Unhandled request: %d", vmsg->request);
1138 }
1139
1140 return false;
1141}
1142
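/* Read and process a single message from the vhost-user socket, writing a
 * reply when the handler requests one. Returns false on error. */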
1143bool
1144vu_dispatch(VuDev *dev)
1145{
1146 VhostUserMsg vmsg = { 0, };
1147 int reply_requested;
1148 bool success = false;
1149
1150 if (!vu_message_read(dev, dev->sock, &vmsg)) {
1151 goto end;
1152 }
1153
1154 reply_requested = vu_process_message(dev, &vmsg);
1155 if (!reply_requested) {
1156 success = true;
1157 goto end;
1158 }
1159
1160 if (!vu_message_write(dev, dev->sock, &vmsg)) {
1161 goto end;
1162 }
1163
1164 success = true;
1165
1166end:
1167 free(vmsg.data);
1168 return success;
1169}
1170
1171void
1172vu_deinit(VuDev *dev)
1173{
1174 int i;
1175
1176 for (i = 0; i < dev->nregions; i++) {
1177 VuDevRegion *r = &dev->regions[i];
1178 void *m = (void *) (uintptr_t) r->mmap_addr;
1179 if (m != MAP_FAILED) {
1180 munmap(m, r->size + r->mmap_offset);
1181 }
1182 }
1183 dev->nregions = 0;
1184
1185 for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
1186 VuVirtq *vq = &dev->vq[i];
1187
1188 if (vq->call_fd != -1) {
1189 close(vq->call_fd);
1190 vq->call_fd = -1;
1191 }
1192
1193 if (vq->kick_fd != -1) {
1194 close(vq->kick_fd);
1195 vq->kick_fd = -1;
1196 }
1197
1198 if (vq->err_fd != -1) {
1199 close(vq->err_fd);
1200 vq->err_fd = -1;
1201 }
1202 }
1203
1204
1205 vu_close_log(dev);
1206 if (dev->slave_fd != -1) {
1207 close(dev->slave_fd);
1208 dev->slave_fd = -1;
1209 }
1210
1211 if (dev->sock != -1) {
1212 close(dev->sock);
1213 }
1214}
1215
1216void
1217vu_init(VuDev *dev,
1218 int socket,
1219 vu_panic_cb panic,
1220 vu_set_watch_cb set_watch,
1221 vu_remove_watch_cb remove_watch,
1222 const VuDevIface *iface)
1223{
1224 int i;
1225
1226 assert(socket >= 0);
1227 assert(set_watch);
1228 assert(remove_watch);
1229 assert(iface);
1230 assert(panic);
1231
1232 memset(dev, 0, sizeof(*dev));
1233
1234 dev->sock = socket;
1235 dev->panic = panic;
1236 dev->set_watch = set_watch;
1237 dev->remove_watch = remove_watch;
1238 dev->iface = iface;
1239 dev->log_call_fd = -1;
1240 dev->slave_fd = -1;
1241 for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
1242 dev->vq[i] = (VuVirtq) {
1243 .call_fd = -1, .kick_fd = -1, .err_fd = -1,
1244 .notification = true,
1245 };
1246 }
1247}
1248
1249VuVirtq *
1250vu_get_queue(VuDev *dev, int qidx)
1251{
1252 assert(qidx < VHOST_MAX_NR_VIRTQUEUE);
1253 return &dev->vq[qidx];
1254}
1255
1256bool
1257vu_queue_enabled(VuDev *dev, VuVirtq *vq)
1258{
1259 return vq->enable;
1260}
1261
1262bool
1263vu_queue_started(const VuDev *dev, const VuVirtq *vq)
1264{
1265 return vq->started;
1266}
1267
1268static inline uint16_t
1269vring_avail_flags(VuVirtq *vq)
1270{
1271 return vq->vring.avail->flags;
1272}
1273
1274static inline uint16_t
1275vring_avail_idx(VuVirtq *vq)
1276{
1277 vq->shadow_avail_idx = vq->vring.avail->idx;
1278
1279 return vq->shadow_avail_idx;
1280}
1281
1282static inline uint16_t
1283vring_avail_ring(VuVirtq *vq, int i)
1284{
1285 return vq->vring.avail->ring[i];
1286}
1287
1288static inline uint16_t
1289vring_get_used_event(VuVirtq *vq)
1290{
1291 return vring_avail_ring(vq, vq->vring.num);
1292}
1293
1294static int
1295virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
1296{
1297 uint16_t num_heads = vring_avail_idx(vq) - idx;
1298
1299 /* Check it isn't doing very strange things with descriptor numbers. */
1300 if (num_heads > vq->vring.num) {
1301 vu_panic(dev, "Guest moved used index from %u to %u",
1302 idx, vq->shadow_avail_idx);
1303 return -1;
1304 }
1305 if (num_heads) {
1306 /* On success, callers read a descriptor at vq->last_avail_idx.
1307 * Make sure descriptor read does not bypass avail index read. */
1308 smp_rmb();
1309 }
1310
1311 return num_heads;
1312}
1313
1314static bool
1315virtqueue_get_head(VuDev *dev, VuVirtq *vq,
1316 unsigned int idx, unsigned int *head)
1317{
1318 /* Grab the next descriptor number they're advertising, and increment
1319 * the index we've seen. */
1320 *head = vring_avail_ring(vq, idx % vq->vring.num);
1321
1322 /* If their number is silly, that's a fatal mistake. */
1323 if (*head >= vq->vring.num) {
1324 vu_panic(dev, "Guest says index %u is available", *head);
1325 return false;
1326 }
1327
1328 return true;
1329}
1330
1331static int
1332virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
1333 uint64_t addr, size_t len)
1334{
1335 struct vring_desc *ori_desc;
1336 uint64_t read_len;
1337
1338 if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
1339 return -1;
1340 }
1341
1342 if (len == 0) {
1343 return -1;
1344 }
1345
1346 while (len) {
1347 read_len = len;
1348 ori_desc = vu_gpa_to_va(dev, &read_len, addr);
1349 if (!ori_desc) {
1350 return -1;
1351 }
1352
1353 memcpy(desc, ori_desc, read_len);
1354 len -= read_len;
1355 addr += read_len;
1356 desc += read_len / sizeof(struct vring_desc);
1357 }
1358
1359 return 0;
1360}
1361
1362enum {
1363 VIRTQUEUE_READ_DESC_ERROR = -1,
1364 VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */
1365 VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */
1366};
1367
1368static int
1369virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
1370 int i, unsigned int max, unsigned int *next)
1371{
1372 /* If this descriptor says it doesn't chain, we're done. */
1373 if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
1374 return VIRTQUEUE_READ_DESC_DONE;
1375 }
1376
1377 /* Check they're not leading us off end of descriptors. */
1378 *next = desc[i].next;
1379 /* Make sure compiler knows to grab that: we don't want it changing! */
1380 smp_wmb();
1381
1382 if (*next >= max) {
1383 vu_panic(dev, "Desc next is %u", *next);
1384 return VIRTQUEUE_READ_DESC_ERROR;
1385 }
1386
1387 return VIRTQUEUE_READ_DESC_MORE;
1388}
1389
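/* Walk the available descriptor chains (including indirect tables) and add up
 * how many writable (in) and readable (out) bytes the guest has queued,
 * stopping early once both caps have been reached. */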
1390void
1391vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
1392 unsigned int *out_bytes,
1393 unsigned max_in_bytes, unsigned max_out_bytes)
1394{
1395 unsigned int idx;
1396 unsigned int total_bufs, in_total, out_total;
1397 int rc;
1398
1399 idx = vq->last_avail_idx;
1400
1401 total_bufs = in_total = out_total = 0;
1402 if (unlikely(dev->broken) ||
1403 unlikely(!vq->vring.avail)) {
1404 goto done;
1405 }
1406
1407 while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
1408 unsigned int max, desc_len, num_bufs, indirect = 0;
1409 uint64_t desc_addr, read_len;
1410 struct vring_desc *desc;
1411 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
1412 unsigned int i;
1413
1414 max = vq->vring.num;
1415 num_bufs = total_bufs;
1416 if (!virtqueue_get_head(dev, vq, idx++, &i)) {
1417 goto err;
1418 }
1419 desc = vq->vring.desc;
1420
1421 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1422 if (desc[i].len % sizeof(struct vring_desc)) {
1423 vu_panic(dev, "Invalid size for indirect buffer table");
1424 goto err;
1425 }
1426
1427 /* If we've got too many, that implies a descriptor loop. */
1428 if (num_bufs >= max) {
1429 vu_panic(dev, "Looped descriptor");
1430 goto err;
1431 }
1432
1433 /* loop over the indirect descriptor table */
1434 indirect = 1;
1435 desc_addr = desc[i].addr;
1436 desc_len = desc[i].len;
1437 max = desc_len / sizeof(struct vring_desc);
1438 read_len = desc_len;
1439 desc = vu_gpa_to_va(dev, &read_len, desc_addr);
1440 if (unlikely(desc && read_len != desc_len)) {
1441 /* Failed to use zero copy */
1442 desc = NULL;
1443 if (!virtqueue_read_indirect_desc(dev, desc_buf,
1444 desc_addr,
1445 desc_len)) {
1446 desc = desc_buf;
1447 }
1448 }
1449 if (!desc) {
1450 vu_panic(dev, "Invalid indirect buffer table");
1451 goto err;
1452 }
1453 num_bufs = i = 0;
1454 }
1455
1456 do {
1457 /* If we've got too many, that implies a descriptor loop. */
1458 if (++num_bufs > max) {
1459 vu_panic(dev, "Looped descriptor");
1460 goto err;
1461 }
1462
1463 if (desc[i].flags & VRING_DESC_F_WRITE) {
1464 in_total += desc[i].len;
1465 } else {
1466 out_total += desc[i].len;
1467 }
1468 if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
1469 goto done;
1470 }
1471 rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
1472 } while (rc == VIRTQUEUE_READ_DESC_MORE);
1473
1474 if (rc == VIRTQUEUE_READ_DESC_ERROR) {
1475 goto err;
1476 }
1477
1478 if (!indirect) {
1479 total_bufs = num_bufs;
1480 } else {
1481 total_bufs++;
1482 }
1483 }
1484 if (rc < 0) {
1485 goto err;
1486 }
1487done:
1488 if (in_bytes) {
1489 *in_bytes = in_total;
1490 }
1491 if (out_bytes) {
1492 *out_bytes = out_total;
1493 }
1494 return;
1495
1496err:
1497 in_total = out_total = 0;
1498 goto done;
1499}
1500
1501bool
1502vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
1503 unsigned int out_bytes)
1504{
1505 unsigned int in_total, out_total;
1506
1507 vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
1508 in_bytes, out_bytes);
1509
1510 return in_bytes <= in_total && out_bytes <= out_total;
1511}
1512
1513/* Fetch avail_idx from VQ memory only when we really need to know if
1514 * guest has added some buffers. */
1515bool
1516vu_queue_empty(VuDev *dev, VuVirtq *vq)
1517{
1518 if (unlikely(dev->broken) ||
1519 unlikely(!vq->vring.avail)) {
1520 return true;
1521 }
1522
1523 if (vq->shadow_avail_idx != vq->last_avail_idx) {
1524 return false;
1525 }
1526
1527 return vring_avail_idx(vq) == vq->last_avail_idx;
1528}
1529
1530static inline
1531bool has_feature(uint64_t features, unsigned int fbit)
1532{
1533 assert(fbit < 64);
1534 return !!(features & (1ULL << fbit));
1535}
1536
1537static inline
1538bool vu_has_feature(VuDev *dev,
1539 unsigned int fbit)
1540{
1541 return has_feature(dev->features, fbit);
1542}
1543
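/* Decide whether the guest needs to be notified, honouring
 * VIRTIO_F_NOTIFY_ON_EMPTY, VRING_AVAIL_F_NO_INTERRUPT and the
 * VIRTIO_RING_F_EVENT_IDX used-event machinery. */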
1544static bool
1545vring_notify(VuDev *dev, VuVirtq *vq)
1546{
1547 uint16_t old, new;
1548 bool v;
1549
1550 /* We need to expose used array entries before checking used event. */
1551 smp_mb();
1552
1553 /* Always notify when queue is empty (when feature acknowledge) */
1554 if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
1555 !vq->inuse && vu_queue_empty(dev, vq)) {
1556 return true;
1557 }
1558
1559 if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1560 return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
1561 }
1562
1563 v = vq->signalled_used_valid;
1564 vq->signalled_used_valid = true;
1565 old = vq->signalled_used;
1566 new = vq->signalled_used = vq->used_idx;
1567 return !v || vring_need_event(vring_get_used_event(vq), new, old);
1568}
1569
1570void
1571vu_queue_notify(VuDev *dev, VuVirtq *vq)
1572{
1573 if (unlikely(dev->broken) ||
1574 unlikely(!vq->vring.avail)) {
1575 return;
1576 }
1577
1578 if (!vring_notify(dev, vq)) {
1579 DPRINT("skipped notify...\n");
1580 return;
1581 }
1582
1583 if (eventfd_write(vq->call_fd, 1) < 0) {
1584 vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
1585 }
1586}
1587
1588static inline void
1589vring_used_flags_set_bit(VuVirtq *vq, int mask)
1590{
1591 uint16_t *flags;
1592
1593 flags = (uint16_t *)((char*)vq->vring.used +
1594 offsetof(struct vring_used, flags));
1595 *flags |= mask;
1596}
1597
1598static inline void
1599vring_used_flags_unset_bit(VuVirtq *vq, int mask)
1600{
1601 uint16_t *flags;
1602
1603 flags = (uint16_t *)((char*)vq->vring.used +
1604 offsetof(struct vring_used, flags));
1605 *flags &= ~mask;
1606}
1607
1608static inline void
1609vring_set_avail_event(VuVirtq *vq, uint16_t val)
1610{
1611 if (!vq->notification) {
1612 return;
1613 }
1614
1615 *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val;
1616}
1617
1618void
1619vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
1620{
1621 vq->notification = enable;
1622 if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1623 vring_set_avail_event(vq, vring_avail_idx(vq));
1624 } else if (enable) {
1625 vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
1626 } else {
1627 vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
1628 }
1629 if (enable) {
1630 /* Expose avail event/used flags before caller checks the avail idx. */
1631 smp_mb();
1632 }
1633}
1634
1635static void
1636virtqueue_map_desc(VuDev *dev,
1637 unsigned int *p_num_sg, struct iovec *iov,
1638 unsigned int max_num_sg, bool is_write,
1639 uint64_t pa, size_t sz)
1640{
1641 unsigned num_sg = *p_num_sg;
1642
1643 assert(num_sg <= max_num_sg);
1644
1645 if (!sz) {
1646 vu_panic(dev, "virtio: zero sized buffers are not allowed");
1647 return;
1648 }
1649
1650 while (sz) {
1651 uint64_t len = sz;
1652
1653 if (num_sg == max_num_sg) {
1654 vu_panic(dev, "virtio: too many descriptors in indirect table");
1655 return;
1656 }
1657
1658 iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
1659 if (iov[num_sg].iov_base == NULL) {
1660 vu_panic(dev, "virtio: invalid address for buffers");
1661 return;
1662 }
1663 iov[num_sg].iov_len = len;
1664 num_sg++;
1665 sz -= len;
1666 pa += len;
1667 }
1668
1669 *p_num_sg = num_sg;
1670}
1671
1672/* Round number down to multiple */
1673#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
1674
1675/* Round number up to multiple */
1676#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
1677
1678static void *
1679virtqueue_alloc_element(size_t sz,
1680 unsigned out_num, unsigned in_num)
1681{
1682 VuVirtqElement *elem;
1683 size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
1684 size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
1685 size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
1686
1687 assert(sz >= sizeof(VuVirtqElement));
1688 elem = malloc(out_sg_end);
1689 elem->out_num = out_num;
1690 elem->in_num = in_num;
1691 elem->in_sg = (void *)elem + in_sg_ofs;
1692 elem->out_sg = (void *)elem + out_sg_ofs;
1693 return elem;
1694}
1695
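/* Pop the next available descriptor chain, mapping each descriptor into an
 * iovec, and return it as a VuVirtqElement allocated with the caller's
 * element size (sz must be at least sizeof(VuVirtqElement)). */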
1696void *
1697vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
1698{
1699 unsigned int i, head, max, desc_len;
1700 uint64_t desc_addr, read_len;
1701 VuVirtqElement *elem;
1702 unsigned out_num, in_num;
1703 struct iovec iov[VIRTQUEUE_MAX_SIZE];
1704 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
1705 struct vring_desc *desc;
1706 int rc;
1707
1708 if (unlikely(dev->broken) ||
1709 unlikely(!vq->vring.avail)) {
1710 return NULL;
1711 }
1712
1713 if (vu_queue_empty(dev, vq)) {
1714 return NULL;
1715 }
1716 /* Needed after virtio_queue_empty(), see comment in
1717 * virtqueue_num_heads(). */
1718 smp_rmb();
1719
1720 /* When we start there are none of either input nor output. */
1721 out_num = in_num = 0;
1722
1723 max = vq->vring.num;
1724 if (vq->inuse >= vq->vring.num) {
1725 vu_panic(dev, "Virtqueue size exceeded");
1726 return NULL;
1727 }
1728
1729 if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
1730 return NULL;
1731 }
1732
1733 if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
1734 vring_set_avail_event(vq, vq->last_avail_idx);
1735 }
1736
1737 i = head;
1738 desc = vq->vring.desc;
1739 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1740 if (desc[i].len % sizeof(struct vring_desc)) {
1741 vu_panic(dev, "Invalid size for indirect buffer table");
1742 }
1743
1744 /* loop over the indirect descriptor table */
1745 desc_addr = desc[i].addr;
1746 desc_len = desc[i].len;
1747 max = desc_len / sizeof(struct vring_desc);
1748 read_len = desc_len;
1749 desc = vu_gpa_to_va(dev, &read_len, desc_addr);
1750 if (unlikely(desc && read_len != desc_len)) {
1751 /* Failed to use zero copy */
1752 desc = NULL;
1753 if (!virtqueue_read_indirect_desc(dev, desc_buf,
1754 desc_addr,
1755 desc_len)) {
1756 desc = desc_buf;
1757 }
1758 }
1759 if (!desc) {
1760 vu_panic(dev, "Invalid indirect buffer table");
1761 return NULL;
1762 }
1763 i = 0;
1764 }
1765
1766 /* Collect all the descriptors */
1767 do {
1768 if (desc[i].flags & VRING_DESC_F_WRITE) {
1769 virtqueue_map_desc(dev, &in_num, iov + out_num,
1770 VIRTQUEUE_MAX_SIZE - out_num, true,
1771 desc[i].addr, desc[i].len);
1772 } else {
1773 if (in_num) {
1774 vu_panic(dev, "Incorrect order for descriptors");
1775 return NULL;
1776 }
1777 virtqueue_map_desc(dev, &out_num, iov,
1778 VIRTQUEUE_MAX_SIZE, false,
1779 desc[i].addr, desc[i].len);
1780 }
1781
1782 /* If we've got too many, that implies a descriptor loop. */
1783 if ((in_num + out_num) > max) {
1784 vu_panic(dev, "Looped descriptor");
1785 }
1786 rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
1787 } while (rc == VIRTQUEUE_READ_DESC_MORE);
1788
1789 if (rc == VIRTQUEUE_READ_DESC_ERROR) {
1790 return NULL;
1791 }
1792
1793 /* Now copy what we have collected and mapped */
1794 elem = virtqueue_alloc_element(sz, out_num, in_num);
1795 elem->index = head;
1796 for (i = 0; i < out_num; i++) {
1797 elem->out_sg[i] = iov[i];
1798 }
1799 for (i = 0; i < in_num; i++) {
1800 elem->in_sg[i] = iov[out_num + i];
1801 }
1802
1803 vq->inuse++;
1804
1805 return elem;
1806}
1807
1808bool
1809vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
1810{
1811 if (num > vq->inuse) {
1812 return false;
1813 }
1814 vq->last_avail_idx -= num;
1815 vq->inuse -= num;
1816 return true;
1817}
1818
1819static inline
1820void vring_used_write(VuDev *dev, VuVirtq *vq,
1821 struct vring_used_elem *uelem, int i)
1822{
1823 struct vring_used *used = vq->vring.used;
1824
1825 used->ring[i] = *uelem;
1826 vu_log_write(dev, vq->vring.log_guest_addr +
1827 offsetof(struct vring_used, ring[i]),
1828 sizeof(used->ring[i]));
1829}
1830
1831
1832static void
1833vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
1834 const VuVirtqElement *elem,
1835 unsigned int len)
1836{
1837 struct vring_desc *desc = vq->vring.desc;
1838 unsigned int i, max, min, desc_len;
1839 uint64_t desc_addr, read_len;
1840 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
1841 unsigned num_bufs = 0;
1842
1843 max = vq->vring.num;
1844 i = elem->index;
1845
1846 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
1847 if (desc[i].len % sizeof(struct vring_desc)) {
1848 vu_panic(dev, "Invalid size for indirect buffer table");
1849 }
1850
1851 /* loop over the indirect descriptor table */
1852 desc_addr = desc[i].addr;
1853 desc_len = desc[i].len;
1854 max = desc_len / sizeof(struct vring_desc);
1855 read_len = desc_len;
1856 desc = vu_gpa_to_va(dev, &read_len, desc_addr);
1857 if (unlikely(desc && read_len != desc_len)) {
1858 /* Failed to use zero copy */
1859 desc = NULL;
1860 if (!virtqueue_read_indirect_desc(dev, desc_buf,
1861 desc_addr,
1862 desc_len)) {
1863 desc = desc_buf;
1864 }
1865 }
1866 if (!desc) {
1867 vu_panic(dev, "Invalid indirect buffer table");
1868 return;
1869 }
1870 i = 0;
1871 }
1872
1873 do {
1874 if (++num_bufs > max) {
1875 vu_panic(dev, "Looped descriptor");
1876 return;
1877 }
1878
1879 if (desc[i].flags & VRING_DESC_F_WRITE) {
1880 min = MIN(desc[i].len, len);
1881 vu_log_write(dev, desc[i].addr, min);
1882 len -= min;
1883 }
1884
1885 } while (len > 0 &&
1886 (virtqueue_read_next_desc(dev, desc, i, max, &i)
1887 == VIRTQUEUE_READ_DESC_MORE));
1888}
1889
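/* Publish a completed element into the used ring at offset idx without
 * bumping used->idx; callers batch with vu_queue_flush(), or use
 * vu_queue_push() for the common fill-then-flush case. */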
1890void
1891vu_queue_fill(VuDev *dev, VuVirtq *vq,
1892 const VuVirtqElement *elem,
1893 unsigned int len, unsigned int idx)
1894{
1895 struct vring_used_elem uelem;
1896
1897 if (unlikely(dev->broken) ||
1898 unlikely(!vq->vring.avail)) {
1899 return;
1900 }
1901
1902 vu_log_queue_fill(dev, vq, elem, len);
1903
1904 idx = (idx + vq->used_idx) % vq->vring.num;
1905
1906 uelem.id = elem->index;
1907 uelem.len = len;
1908 vring_used_write(dev, vq, &uelem, idx);
1909}
1910
1911static inline
1912void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
1913{
1914 vq->vring.used->idx = val;
1915 vu_log_write(dev,
1916 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
1917 sizeof(vq->vring.used->idx));
1918
1919 vq->used_idx = val;
1920}
1921
1922void
1923vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
1924{
1925 uint16_t old, new;
1926
1927 if (unlikely(dev->broken) ||
1928 unlikely(!vq->vring.avail)) {
1929 return;
1930 }
1931
1932 /* Make sure buffer is written before we update index. */
1933 smp_wmb();
1934
1935 old = vq->used_idx;
1936 new = old + count;
1937 vring_used_idx_set(dev, vq, new);
1938 vq->inuse -= count;
1939 if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
1940 vq->signalled_used_valid = false;
1941 }
1942}
1943
1944void
1945vu_queue_push(VuDev *dev, VuVirtq *vq,
1946 const VuVirtqElement *elem, unsigned int len)
1947{
1948 vu_queue_fill(dev, vq, elem, len, 0);
1949 vu_queue_flush(dev, vq, 1);
1950}