/*
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
/* this code avoids GLib dependency */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include "qemu/compiler.h"

#if defined(__linux__)
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

#ifdef __NR_userfaultfd
#include <linux/userfaultfd.h>
#endif

#endif

#include "qemu/atomic.h"

#include "libvhost-user.h"
/* usually provided by GLib */
#define MIN(x, y) ({                            \
            typeof(x) _min1 = (x);              \
            typeof(y) _min2 = (y);              \
            (void) (&_min1 == &_min2);          \
            _min1 < _min2 ? _min1 : _min2; })

#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)

/* The version of the protocol we support */
#define VHOST_USER_VERSION 1
#define LIBVHOST_USER_DEBUG 0

#define DPRINT(...)                             \
    do {                                        \
        if (LIBVHOST_USER_DEBUG) {              \
            fprintf(stderr, __VA_ARGS__);       \
        }                                       \
    } while (0)
static const char *
vu_request_to_string(unsigned int req)
{
#define REQ(req) [req] = #req
    static const char *vu_request_str[] = {
        REQ(VHOST_USER_GET_FEATURES),
        REQ(VHOST_USER_SET_FEATURES),
        REQ(VHOST_USER_SET_OWNER),
        REQ(VHOST_USER_RESET_OWNER),
        REQ(VHOST_USER_SET_MEM_TABLE),
        REQ(VHOST_USER_SET_LOG_BASE),
        REQ(VHOST_USER_SET_LOG_FD),
        REQ(VHOST_USER_SET_VRING_NUM),
        REQ(VHOST_USER_SET_VRING_ADDR),
        REQ(VHOST_USER_SET_VRING_BASE),
        REQ(VHOST_USER_GET_VRING_BASE),
        REQ(VHOST_USER_SET_VRING_KICK),
        REQ(VHOST_USER_SET_VRING_CALL),
        REQ(VHOST_USER_SET_VRING_ERR),
        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_GET_QUEUE_NUM),
        REQ(VHOST_USER_SET_VRING_ENABLE),
        REQ(VHOST_USER_SEND_RARP),
        REQ(VHOST_USER_NET_SET_MTU),
        REQ(VHOST_USER_SET_SLAVE_REQ_FD),
        REQ(VHOST_USER_IOTLB_MSG),
        REQ(VHOST_USER_SET_VRING_ENDIAN),
        REQ(VHOST_USER_GET_CONFIG),
        REQ(VHOST_USER_SET_CONFIG),
        REQ(VHOST_USER_POSTCOPY_ADVISE),
        REQ(VHOST_USER_POSTCOPY_LISTEN),
    };
#undef REQ

    if (req < VHOST_USER_MAX) {
        return vu_request_str[req];
    } else {
        return "unknown";
    }
}
static void
vu_panic(VuDev *dev, const char *msg, ...)
{
    char *buf = NULL;
    va_list ap;

    va_start(ap, msg);
    if (vasprintf(&buf, msg, ap) < 0) {
        buf = NULL;
    }
    va_end(ap);

    dev->broken = true;
    dev->panic(dev, buf);
    free(buf);

    /* FIXME: find a way to call virtio_error? */
}
/* Translate guest physical address to our virtual address.  */
void *
vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
{
    int i;

    if (*plen == 0) {
        return NULL;
    }

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
            if ((guest_addr + *plen) > (r->gpa + r->size)) {
                *plen = r->gpa + r->size - guest_addr;
            }
            return (void *)(uintptr_t)
                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}
/* Translate qemu virtual address to our virtual address.  */
static void *
qva_to_va(VuDev *dev, uint64_t qemu_addr)
{
    int i;

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
            return (void *)(uintptr_t)
                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}
static void
vmsg_close_fds(VhostUserMsg *vmsg)
{
    int i;

    for (i = 0; i < vmsg->fd_num; i++) {
        close(vmsg->fds[i]);
    }
}
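/*
 * Message I/O helpers. vhost-user messages arrive on a UNIX domain
 * socket; any file descriptors (memory region fds, kick/call/err
 * eventfds, the log fd) ride along as SCM_RIGHTS ancillary data,
 * which is why recvmsg()/sendmsg() are used here instead of plain
 * read()/write() for the message header.
 */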
static bool
vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
        .msg_controllen = sizeof(control),
    };
    size_t fd_size;
    struct cmsghdr *cmsg;
    int rc;

    do {
        rc = recvmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc < 0) {
        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
        return false;
    }

    vmsg->fd_num = 0;
    for (cmsg = CMSG_FIRSTHDR(&msg);
         cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msg, cmsg))
    {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
            vmsg->fd_num = fd_size / sizeof(int);
            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
            break;
        }
    }

    if (vmsg->size > sizeof(vmsg->payload)) {
        vu_panic(dev,
                 "Error: too big message request: %d, size: vmsg->size: %u, "
                 "while sizeof(vmsg->payload) = %zu\n",
                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
        goto fail;
    }

    if (vmsg->size) {
        do {
            rc = read(conn_fd, &vmsg->payload, vmsg->size);
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while reading: %s", strerror(errno));
            goto fail;
        }

        assert(rc == vmsg->size);
    }

    return true;

fail:
    vmsg_close_fds(vmsg);

    return false;
}
static bool
vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    int rc;
    uint8_t *p = (uint8_t *)vmsg;
    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
    };
    struct cmsghdr *cmsg;

    memset(control, 0, sizeof(control));
    assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
    if (vmsg->fd_num > 0) {
        size_t fdsize = vmsg->fd_num * sizeof(int);
        msg.msg_controllen = CMSG_SPACE(fdsize);
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_len = CMSG_LEN(fdsize);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
    } else {
        msg.msg_controllen = 0;
    }

    /* Set the version in the flags when sending the reply */
    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
    vmsg->flags |= VHOST_USER_VERSION;
    vmsg->flags |= VHOST_USER_REPLY_MASK;

    do {
        rc = sendmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (vmsg->size) {
        do {
            if (vmsg->data) {
                rc = write(conn_fd, vmsg->data, vmsg->size);
            } else {
                rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
            }
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
    }

    if (rc <= 0) {
        vu_panic(dev, "Error while writing: %s", strerror(errno));
        return false;
    }

    return true;
}
/* Kick the log_call_fd if required. */
static void
vu_log_kick(VuDev *dev)
{
    if (dev->log_call_fd != -1) {
        DPRINT("Kicking the QEMU's log...\n");
        if (eventfd_write(dev->log_call_fd, 1) < 0) {
            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
        }
    }
}
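/*
 * Dirty memory logging: the log is a bitmap shared with QEMU (mapped in
 * vu_set_log_base_exec() below), one bit per VHOST_LOG_PAGE of guest
 * memory. Bits are set with atomic_or() because QEMU reads and clears
 * the bitmap concurrently while migrating.
 */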
static void
vu_log_page(uint8_t *log_table, uint64_t page)
{
    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    atomic_or(&log_table[page / 8], 1 << (page % 8));
}
static void
vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
{
    uint64_t page;

    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
        !dev->log_table || !length) {
        return;
    }

    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));

    page = address / VHOST_LOG_PAGE;
    while (page * VHOST_LOG_PAGE < address + length) {
        vu_log_page(dev->log_table, page);
        page += 1;    /* page is a page index, not a byte offset */
    }

    vu_log_kick(dev);
}
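/*
 * Invoked from the application's event loop (via the set_watch()
 * callback registered in vu_set_vring_kick_exec()) when the guest kicks
 * a virtqueue: drain the kick eventfd, then run the handler the
 * application installed with vu_set_queue_handler().
 */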
static void
vu_kick_cb(VuDev *dev, int condition, void *data)
{
    int index = (intptr_t)data;
    VuVirtq *vq = &dev->vq[index];
    int sock = vq->kick_fd;
    eventfd_t kick_data;
    ssize_t rc;

    rc = eventfd_read(sock, &kick_data);
    if (rc == -1) {
        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
        dev->remove_watch(dev, dev->vq[index].kick_fd);
    } else {
        DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
               kick_data, vq->handler, index);
        if (vq->handler) {
            vq->handler(dev, index);
        }
    }
}
static bool
vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->payload.u64 =
        1ULL << VHOST_F_LOG_ALL |
        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

    if (dev->iface->get_features) {
        vmsg->payload.u64 |= dev->iface->get_features(dev);
    }

    vmsg->size = sizeof(vmsg->payload.u64);

    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    return true;
}
static void
vu_set_enable_all_rings(VuDev *dev, bool enabled)
{
    int i;

    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
        dev->vq[i].enable = enabled;
    }
}
static bool
vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    dev->features = vmsg->payload.u64;

    /* test the feature bit, not the raw bit number */
    if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
        vu_set_enable_all_rings(dev, true);
    }

    if (dev->iface->set_features) {
        dev->iface->set_features(dev, dev->features);
    }

    return false;
}
static bool
vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    return false;
}
static void
vu_close_log(VuDev *dev)
{
    if (dev->log_table) {
        if (munmap(dev->log_table, dev->log_size) != 0) {
            perror("close log munmap() error");
        }

        dev->log_table = NULL;
    }
    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
        dev->log_call_fd = -1;
    }
}
static bool
vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vu_set_enable_all_rings(dev, false);

    return false;
}
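/*
 * Postcopy variant of SET_MEM_TABLE. The handshake with QEMU is:
 * mmap() each region, reply with the mapped addresses so QEMU can
 * translate userfaultfd fault addresses back, wait for QEMU to ack that
 * its fault handler is registered, and only then register each mapping
 * with the userfaultfd so that missing pages generate faults.
 */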
static bool
vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
{
    int i;
    VhostUserMemory *memory = &vmsg->payload.memory;
    dev->nregions = memory->nregions;
    /* TODO: Postcopy specific code */
    DPRINT("Nregions: %d\n", memory->nregions);
    for (i = 0; i < dev->nregions; i++) {
        void *mmap_addr;
        VhostUserMemoryRegion *msg_region = &memory->regions[i];
        VuDevRegion *dev_region = &dev->regions[i];

        DPRINT("Region %d\n", i);
        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
               msg_region->guest_phys_addr);
        DPRINT("    memory_size:     0x%016"PRIx64"\n",
               msg_region->memory_size);
        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
               msg_region->userspace_addr);
        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
               msg_region->mmap_offset);

        dev_region->gpa = msg_region->guest_phys_addr;
        dev_region->size = msg_region->memory_size;
        dev_region->qva = msg_region->userspace_addr;
        dev_region->mmap_offset = msg_region->mmap_offset;

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages. */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED,
                         vmsg->fds[i], 0);

        if (mmap_addr == MAP_FAILED) {
            vu_panic(dev, "region mmap error: %s", strerror(errno));
        } else {
            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
                   dev_region->mmap_addr);
        }

        /* Return the address to QEMU so that it can translate the ufd
         * fault addresses back.
         */
        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
                                                 dev_region->mmap_offset);
        close(vmsg->fds[i]);
    }

    /* Send the message back to qemu with the addresses filled in */
    vmsg->fd_num = 0;
    if (!vu_message_write(dev, dev->sock, vmsg)) {
        vu_panic(dev, "failed to respond to set-mem-table for postcopy");
        return false;
    }

    /* Wait for QEMU to confirm that it's registered the handler for the
     * faults.
     */
    if (!vu_message_read(dev, dev->sock, vmsg) ||
        vmsg->size != sizeof(vmsg->payload.u64) ||
        vmsg->payload.u64 != 0) {
        vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
        return false;
    }

    /* OK, now we can go and register the memory and generate faults */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *dev_region = &dev->regions[i];
#ifdef UFFDIO_REGISTER
        /* We should already have an open ufd. Mark each memory
         * range as ufd.
         * Note: Do we need any madvises? Well it's not been accessed
         * yet, still probably need no THP to be safe, discard to be safe?
         */
        struct uffdio_register reg_struct;
        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
            vu_panic(dev, "%s: Failed to userfault region %d "
                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
                     __func__, i,
                     dev_region->mmap_addr,
                     dev_region->size, dev_region->mmap_offset,
                     dev->postcopy_ufd, strerror(errno));
            return false;
        }
        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
            vu_panic(dev, "%s Region (%d) doesn't support COPY",
                     __func__, i);
            return false;
        }
        DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
               __func__, i, reg_struct.range.start, reg_struct.range.len);
        /* TODO: Stash 'zero' support flags somewhere */
#endif
    }

    return false;
}
static bool
vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int i;
    VhostUserMemory *memory = &vmsg->payload.memory;

    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;

        if (m) {
            munmap(m, r->size + r->mmap_offset);
        }
    }
    dev->nregions = memory->nregions;

    if (dev->postcopy_listening) {
        return vu_set_mem_table_exec_postcopy(dev, vmsg);
    }

    DPRINT("Nregions: %d\n", memory->nregions);
    for (i = 0; i < dev->nregions; i++) {
        void *mmap_addr;
        VhostUserMemoryRegion *msg_region = &memory->regions[i];
        VuDevRegion *dev_region = &dev->regions[i];

        DPRINT("Region %d\n", i);
        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
               msg_region->guest_phys_addr);
        DPRINT("    memory_size:     0x%016"PRIx64"\n",
               msg_region->memory_size);
        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
               msg_region->userspace_addr);
        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
               msg_region->mmap_offset);

        dev_region->gpa = msg_region->guest_phys_addr;
        dev_region->size = msg_region->memory_size;
        dev_region->qva = msg_region->userspace_addr;
        dev_region->mmap_offset = msg_region->mmap_offset;

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages. */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED,
                         vmsg->fds[i], 0);

        if (mmap_addr == MAP_FAILED) {
            vu_panic(dev, "region mmap error: %s", strerror(errno));
        } else {
            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
                   dev_region->mmap_addr);
        }

        close(vmsg->fds[i]);
    }

    return false;
}
static bool
vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd;
    uint64_t log_mmap_size, log_mmap_offset;
    void *rc;

    if (vmsg->fd_num != 1 ||
        vmsg->size != sizeof(vmsg->payload.log)) {
        vu_panic(dev, "Invalid log_base message");
        return true;
    }

    fd = vmsg->fds[0];
    log_mmap_offset = vmsg->payload.log.mmap_offset;
    log_mmap_size = vmsg->payload.log.mmap_size;
    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
    DPRINT("Log mmap_size:   %"PRId64"\n", log_mmap_size);

    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
              log_mmap_offset);
    close(fd);
    if (rc == MAP_FAILED) {
        perror("log mmap error");
    }

    if (dev->log_table) {
        munmap(dev->log_table, dev->log_size);
    }
    dev->log_table = rc;
    dev->log_size = log_mmap_size;

    vmsg->size = sizeof(vmsg->payload.u64);

    return true;
}
static bool
vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid log_fd message");
        return false;
    }

    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
    }
    dev->log_call_fd = vmsg->fds[0];
    DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);

    return false;
}
static bool
vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int num = vmsg->payload.state.num;

    DPRINT("State.index: %d\n", index);
    DPRINT("State.num:   %d\n", num);
    dev->vq[index].vring.num = num;

    return false;
}
static bool
vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    struct vhost_vring_addr *vra = &vmsg->payload.addr;
    unsigned int index = vra->index;
    VuVirtq *vq = &dev->vq[index];

    DPRINT("vhost_vring_addr:\n");
    DPRINT("    index:  %d\n", vra->index);
    DPRINT("    flags:  %d\n", vra->flags);
    DPRINT("    desc_user_addr:   0x%016llx\n", vra->desc_user_addr);
    DPRINT("    used_user_addr:   0x%016llx\n", vra->used_user_addr);
    DPRINT("    avail_user_addr:  0x%016llx\n", vra->avail_user_addr);
    DPRINT("    log_guest_addr:   0x%016llx\n", vra->log_guest_addr);

    vq->vring.flags = vra->flags;
    vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
    vq->vring.used = qva_to_va(dev, vra->used_user_addr);
    vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
    vq->vring.log_guest_addr = vra->log_guest_addr;

    DPRINT("Setting virtq addresses:\n");
    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
    DPRINT("    vring_used  at %p\n", vq->vring.used);
    DPRINT("    vring_avail at %p\n", vq->vring.avail);

    if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
        vu_panic(dev, "Invalid vring_addr message");
        return false;
    }

    vq->used_idx = vq->vring.used->idx;

    if (vq->last_avail_idx != vq->used_idx) {
        bool resume = dev->iface->queue_is_processed_in_order &&
            dev->iface->queue_is_processed_in_order(dev, index);

        DPRINT("Last avail index != used index: %u != %u%s\n",
               vq->last_avail_idx, vq->used_idx,
               resume ? ", resuming" : "");

        if (resume) {
            vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
        }
    }

    return false;
}
static bool
vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int num = vmsg->payload.state.num;

    DPRINT("State.index: %d\n", index);
    DPRINT("State.num:   %d\n", num);
    dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;

    return false;
}
static bool
vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;

    DPRINT("State.index: %d\n", index);
    vmsg->payload.state.num = dev->vq[index].last_avail_idx;
    vmsg->size = sizeof(vmsg->payload.state);

    dev->vq[index].started = false;
    if (dev->iface->queue_set_started) {
        dev->iface->queue_set_started(dev, index, false);
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }
    if (dev->vq[index].kick_fd != -1) {
        dev->remove_watch(dev, dev->vq[index].kick_fd);
        close(dev->vq[index].kick_fd);
        dev->vq[index].kick_fd = -1;
    }

    return true;
}
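/*
 * For the SET_VRING_KICK/CALL/ERR messages, the u64 payload packs the
 * queue index in the low bits (VHOST_USER_VRING_IDX_MASK) and a "no fd
 * attached" flag (VHOST_USER_VRING_NOFD_MASK); validate both before
 * touching the fd.
 */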
static bool
vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid queue index: %u", index);
        return false;
    }

    if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK ||
        vmsg->fd_num != 1) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
        return false;
    }

    return true;
}
static bool
vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].kick_fd != -1) {
        dev->remove_watch(dev, dev->vq[index].kick_fd);
        close(dev->vq[index].kick_fd);
        dev->vq[index].kick_fd = -1;
    }

    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
        dev->vq[index].kick_fd = vmsg->fds[0];
        DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
    }

    dev->vq[index].started = true;
    if (dev->iface->queue_set_started) {
        dev->iface->queue_set_started(dev, index, true);
    }

    if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
        dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
                       vu_kick_cb, (void *)(long)index);

        DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
               dev->vq[index].kick_fd, index);
    }

    return false;
}
void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
                          vu_queue_handler_cb handler)
{
    int qidx = vq - dev->vq;

    vq->handler = handler;
    if (vq->kick_fd >= 0) {
        if (handler) {
            dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
                           vu_kick_cb, (void *)(long)qidx);
        } else {
            dev->remove_watch(dev, vq->kick_fd);
        }
    }
}
static bool
vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }

    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
        dev->vq[index].call_fd = vmsg->fds[0];
    }

    DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);

    return false;
}
static bool
vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].err_fd != -1) {
        close(dev->vq[index].err_fd);
        dev->vq[index].err_fd = -1;
    }

    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
        dev->vq[index].err_fd = vmsg->fds[0];
    }

    return false;
}
static bool
vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
                        1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ;

    if (dev->iface->get_protocol_features) {
        features |= dev->iface->get_protocol_features(dev);
    }

    vmsg->payload.u64 = features;
    vmsg->size = sizeof(vmsg->payload.u64);

    return true;
}
static bool
vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    uint64_t features = vmsg->payload.u64;

    DPRINT("u64: 0x%016"PRIx64"\n", features);

    dev->protocol_features = vmsg->payload.u64;

    if (dev->iface->set_protocol_features) {
        dev->iface->set_protocol_features(dev, features);
    }

    return false;
}
static bool
vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("Function %s() not implemented yet.\n", __func__);
    return false;
}
static bool
vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int enable = vmsg->payload.state.num;

    DPRINT("State.index:  %d\n", index);
    DPRINT("State.enable: %d\n", enable);

    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
        vu_panic(dev, "Invalid vring_enable index: %u", index);
        return false;
    }

    dev->vq[index].enable = enable;
    return false;
}
static bool
vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num);
        return false;
    }

    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
    }
    dev->slave_fd = vmsg->fds[0];
    DPRINT("Got slave_fd: %d\n", vmsg->fds[0]);

    return false;
}
static bool
vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->get_config) {
        ret = dev->iface->get_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.size);
    }

    if (ret) {
        /* resize to zero to indicate an error to master */
        vmsg->size = 0;
    }

    return true;
}
static bool
vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->set_config) {
        ret = dev->iface->set_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.offset,
                                     vmsg->payload.config.size,
                                     vmsg->payload.config.flags);
        if (ret) {
            vu_panic(dev, "Set virtio configuration space failed");
        }
    }

    return false;
}
static bool
vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
{
    dev->postcopy_ufd = -1;
#ifdef UFFDIO_API
    struct uffdio_api api_struct;

    dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    vmsg->size = 0;
#endif

    if (dev->postcopy_ufd == -1) {
        vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
        goto out;
    }

#ifdef UFFDIO_API
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
        vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        goto out;
    }
    /* TODO: Stash feature flags somewhere */
#endif

out:
    /* Return a ufd to the QEMU */
    vmsg->fd_num = 1;
    vmsg->fds[0] = dev->postcopy_ufd;
    return true; /* = send a reply */
}
static bool
vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->payload.u64 = -1;
    vmsg->size = sizeof(vmsg->payload.u64);

    if (dev->nregions) {
        vu_panic(dev, "Regions already registered at postcopy-listen");
        return true;
    }
    dev->postcopy_listening = true;

    vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
    vmsg->payload.u64 = 0; /* Success */
    return true;
}
static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
    int do_reply = 0;

    /* Print out generic part of the request. */
    DPRINT("================ Vhost user message ================\n");
    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
           vmsg->request);
    DPRINT("Flags:   0x%x\n", vmsg->flags);
    DPRINT("Size:    %d\n", vmsg->size);

    if (vmsg->fd_num) {
        int i;
        DPRINT("Fds:");
        for (i = 0; i < vmsg->fd_num; i++) {
            DPRINT(" %d", vmsg->fds[i]);
        }
        DPRINT("\n");
    }

    if (dev->iface->process_msg &&
        dev->iface->process_msg(dev, vmsg, &do_reply)) {
        return do_reply;
    }

    switch (vmsg->request) {
    case VHOST_USER_GET_FEATURES:
        return vu_get_features_exec(dev, vmsg);
    case VHOST_USER_SET_FEATURES:
        return vu_set_features_exec(dev, vmsg);
    case VHOST_USER_GET_PROTOCOL_FEATURES:
        return vu_get_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_PROTOCOL_FEATURES:
        return vu_set_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_OWNER:
        return vu_set_owner_exec(dev, vmsg);
    case VHOST_USER_RESET_OWNER:
        return vu_reset_device_exec(dev, vmsg);
    case VHOST_USER_SET_MEM_TABLE:
        return vu_set_mem_table_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_BASE:
        return vu_set_log_base_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_FD:
        return vu_set_log_fd_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_NUM:
        return vu_set_vring_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ADDR:
        return vu_set_vring_addr_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_BASE:
        return vu_set_vring_base_exec(dev, vmsg);
    case VHOST_USER_GET_VRING_BASE:
        return vu_get_vring_base_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_KICK:
        return vu_set_vring_kick_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_CALL:
        return vu_set_vring_call_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ERR:
        return vu_set_vring_err_exec(dev, vmsg);
    case VHOST_USER_GET_QUEUE_NUM:
        return vu_get_queue_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ENABLE:
        return vu_set_vring_enable_exec(dev, vmsg);
    case VHOST_USER_SET_SLAVE_REQ_FD:
        return vu_set_slave_req_fd(dev, vmsg);
    case VHOST_USER_GET_CONFIG:
        return vu_get_config(dev, vmsg);
    case VHOST_USER_SET_CONFIG:
        return vu_set_config(dev, vmsg);
    case VHOST_USER_NONE:
        break;
    case VHOST_USER_POSTCOPY_ADVISE:
        return vu_set_postcopy_advise(dev, vmsg);
    case VHOST_USER_POSTCOPY_LISTEN:
        return vu_set_postcopy_listen(dev, vmsg);
    default:
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Unhandled request: %d", vmsg->request);
    }

    return false;
}
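/*
 * Read and process one message from the vhost-user socket. The
 * per-request handlers above return true when a reply carrying vmsg
 * back to the master is required, which is what drives the
 * vu_message_write() below.
 */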
bool
vu_dispatch(VuDev *dev)
{
    VhostUserMsg vmsg = { 0, };
    int reply_requested;
    bool success = false;

    if (!vu_message_read(dev, dev->sock, &vmsg)) {
        goto end;
    }

    reply_requested = vu_process_message(dev, &vmsg);
    if (!reply_requested) {
        success = true;
        goto end;
    }

    if (!vu_message_write(dev, dev->sock, &vmsg)) {
        goto end;
    }

    success = true;

end:
    free(vmsg.data);
    return success;
}
void
vu_deinit(VuDev *dev)
{
    int i;

    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;
        if (m != MAP_FAILED) {
            munmap(m, r->size + r->mmap_offset);
        }
    }
    dev->nregions = 0;

    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
        VuVirtq *vq = &dev->vq[i];

        if (vq->call_fd != -1) {
            close(vq->call_fd);
            vq->call_fd = -1;
        }

        if (vq->kick_fd != -1) {
            close(vq->kick_fd);
            vq->kick_fd = -1;
        }

        if (vq->err_fd != -1) {
            close(vq->err_fd);
            vq->err_fd = -1;
        }
    }

    vu_close_log(dev);
    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
        dev->slave_fd = -1;
    }

    if (dev->sock != -1) {
        close(dev->sock);
    }
}
void
vu_init(VuDev *dev,
        int socket,
        vu_panic_cb panic,
        vu_set_watch_cb set_watch,
        vu_remove_watch_cb remove_watch,
        const VuDevIface *iface)
{
    int i;

    assert(socket >= 0);
    assert(set_watch);
    assert(remove_watch);
    assert(iface);

    memset(dev, 0, sizeof(*dev));

    dev->sock = socket;
    dev->panic = panic;
    dev->set_watch = set_watch;
    dev->remove_watch = remove_watch;
    dev->iface = iface;
    dev->log_call_fd = -1;
    dev->slave_fd = -1;
    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
        dev->vq[i] = (VuVirtq) {
            .call_fd = -1, .kick_fd = -1, .err_fd = -1,
            .notification = true,
        };
    }
}
VuVirtq *
vu_get_queue(VuDev *dev, int qidx)
{
    assert(qidx < VHOST_MAX_NR_VIRTQUEUE);
    return &dev->vq[qidx];
}
bool
vu_queue_enabled(VuDev *dev, VuVirtq *vq)
{
    return vq->enable;
}

bool
vu_queue_started(const VuDev *dev, const VuVirtq *vq)
{
    return vq->started;
}
static inline uint16_t
vring_avail_flags(VuVirtq *vq)
{
    return vq->vring.avail->flags;
}

static inline uint16_t
vring_avail_idx(VuVirtq *vq)
{
    vq->shadow_avail_idx = vq->vring.avail->idx;

    return vq->shadow_avail_idx;
}

static inline uint16_t
vring_avail_ring(VuVirtq *vq, int i)
{
    return vq->vring.avail->ring[i];
}

static inline uint16_t
vring_get_used_event(VuVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}
static int
virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
{
    uint16_t num_heads = vring_avail_idx(vq) - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (num_heads > vq->vring.num) {
        vu_panic(dev, "Guest moved used index from %u to %u",
                 idx, vq->shadow_avail_idx);
        return -1;
    }
    if (num_heads) {
        /* On success, callers read a descriptor at vq->last_avail_idx.
         * Make sure descriptor read does not bypass avail index read. */
        smp_rmb();
    }

    return num_heads;
}
static bool
virtqueue_get_head(VuDev *dev, VuVirtq *vq,
                   unsigned int idx, unsigned int *head)
{
    /* Grab the next descriptor number they're advertising, and increment
     * the index we've seen. */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        vu_panic(dev, "Guest says index %u is available", *head);
        return false;
    }

    return true;
}
static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    /* The table may straddle a memory region boundary, so copy it
     * chunk by chunk through vu_gpa_to_va(). */
    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        desc += read_len / sizeof(struct vring_desc);
    }

    return 0;
}
enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};
static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = desc[i].next;
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}
void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (desc[i].flags & VRING_DESC_F_INDIRECT) {
            if (desc[i].len % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = desc[i].addr;
            desc_len = desc[i].len;
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (desc[i].flags & VRING_DESC_F_WRITE) {
                in_total += desc[i].len;
            } else {
                out_total += desc[i].len;
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}
bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}
/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}
static inline
bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);
    return !!(features & (1ULL << fbit));
}

static inline
bool vu_has_feature(VuDev *dev,
                    unsigned int fbit)
{
    return has_feature(dev->features, fbit);
}
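/*
 * Decide whether the guest needs an interrupt for this queue. Without
 * VIRTIO_RING_F_EVENT_IDX, the guest can only suppress interrupts
 * coarsely via VRING_AVAIL_F_NO_INTERRUPT; with it, the guest publishes
 * a used-event index and we notify only when the new used index crosses
 * it (the vring_need_event() check).
 */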
static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (when feature acknowledge) */
    if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vu_queue_empty(dev, vq)) {
        return true;
    }

    if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}
void
vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}
static void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags |= mask;
}

static void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char*)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags &= ~mask;
}
static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    if (!vq->notification) {
        return;
    }

    *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val;
}
void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}
static void
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
}
/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
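/*
 * A VuVirtqElement and its scatter-gather arrays are carved out of a
 * single allocation: the caller's sz bytes first (sz must be at least
 * sizeof(VuVirtqElement)), then the in_sg array, then the out_sg array,
 * with alignment handled by ALIGN_UP() above.
 */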
static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}
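/*
 * Pop the next available descriptor chain: walk it (following an
 * indirect table when VRING_DESC_F_INDIRECT is set), map each
 * descriptor into iovecs with virtqueue_map_desc(), and return them
 * packed into a freshly malloc()ed element that the caller free()s
 * after vu_queue_push() or vu_queue_rewind().
 */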
void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    unsigned int i, head, max, desc_len;
    uint64_t desc_addr, read_len;
    VuVirtqElement *elem;
    unsigned out_num, in_num;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    struct vring_desc *desc;
    int rc;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /* Needed after virtio_queue_empty(), see comment in
     * virtqueue_num_heads(). */
    smp_rmb();

    /* When we start there are none of either input nor output. */
    out_num = in_num = 0;

    max = vq->vring.num;
    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    i = head;
    desc = vq->vring.desc;
    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
        if (desc[i].len % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
        }

        /* loop over the indirect descriptor table */
        desc_addr = desc[i].addr;
        desc_len = desc[i].len;
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (desc[i].flags & VRING_DESC_F_WRITE) {
            virtqueue_map_desc(dev, &in_num, iov + out_num,
                               VIRTQUEUE_MAX_SIZE - out_num, true,
                               desc[i].addr, desc[i].len);
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            virtqueue_map_desc(dev, &out_num, iov,
                               VIRTQUEUE_MAX_SIZE, false,
                               desc[i].addr, desc[i].len);
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    elem->index = head;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    vq->inuse++;

    return elem;
}
bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}
static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}
static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
        if (desc[i].len % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
        }

        /* loop over the indirect descriptor table */
        desc_addr = desc[i].addr;
        desc_len = desc[i].len;
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
        }

        if (desc[i].flags & VRING_DESC_F_WRITE) {
            min = MIN(desc[i].len, len);
            vu_log_write(dev, desc[i].addr, min);
            len -= min;
        }
    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}
void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = elem->index;
    uelem.len = len;
    vring_used_write(dev, vq, &uelem, idx);
}
static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = val;
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}
void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}
void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_flush(dev, vq, 1);
}
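/*
 * A minimal usage sketch (illustrative only, not part of this library):
 * my_panic, my_set_watch, my_remove_watch, my_iface and the handler are
 * hypothetical application code; only the vu_* calls and the VuDevIface
 * contract come from this file.
 *
 *     static void my_queue_handler(VuDev *dev, int qidx)
 *     {
 *         VuVirtq *vq = vu_get_queue(dev, qidx);
 *         VuVirtqElement *elem;
 *
 *         while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *             // consume elem->out_sg / fill elem->in_sg here ...
 *             vu_queue_push(dev, vq, elem, 0);
 *             free(elem);
 *         }
 *         vu_queue_notify(dev, vq);
 *     }
 *
 *     // after accept()ing the vhost-user socket connection:
 *     vu_init(&dev, sock_fd, my_panic, my_set_watch, my_remove_watch,
 *             &my_iface);
 *     // call vu_dispatch(&dev) whenever sock_fd becomes readable,
 *     // and vu_deinit(&dev) when the connection goes away.
 */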