4 * Copyright(c) 2017-2018 Intel Corporation.
5 * Copyright(c) 2020 Red Hat, Inc.
7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
8 * See the COPYING file in the top-level directory.
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "migration/blocker.h"
24 #include "qemu/cutils.h"
25 #include "qemu/main-loop.h"
28 #include "qapi/error.h"
31 * Return one past the end of the end of section. Be careful with uint64_t
34 static Int128
vhost_vdpa_section_end(const MemoryRegionSection
*section
)
36 Int128 llend
= int128_make64(section
->offset_within_address_space
);
37 llend
= int128_add(llend
, section
->size
);
38 llend
= int128_and(llend
, int128_exts64(TARGET_PAGE_MASK
));
43 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection
*section
,
49 if ((!memory_region_is_ram(section
->mr
) &&
50 !memory_region_is_iommu(section
->mr
)) ||
51 memory_region_is_protected(section
->mr
) ||
52 /* vhost-vDPA doesn't allow MMIO to be mapped */
53 memory_region_is_ram_device(section
->mr
)) {
57 if (section
->offset_within_address_space
< iova_min
) {
58 error_report("RAM section out of device range (min=0x%" PRIx64
59 ", addr=0x%" HWADDR_PRIx
")",
60 iova_min
, section
->offset_within_address_space
);
64 llend
= vhost_vdpa_section_end(section
);
65 if (int128_gt(llend
, int128_make64(iova_max
))) {
66 error_report("RAM section out of device range (max=0x%" PRIx64
67 ", end addr=0x%" PRIx64
")",
68 iova_max
, int128_get64(llend
));
75 int vhost_vdpa_dma_map(struct vhost_vdpa
*v
, hwaddr iova
, hwaddr size
,
76 void *vaddr
, bool readonly
)
78 struct vhost_msg_v2 msg
= {};
79 int fd
= v
->device_fd
;
82 msg
.type
= v
->msg_type
;
83 msg
.iotlb
.iova
= iova
;
84 msg
.iotlb
.size
= size
;
85 msg
.iotlb
.uaddr
= (uint64_t)(uintptr_t)vaddr
;
86 msg
.iotlb
.perm
= readonly
? VHOST_ACCESS_RO
: VHOST_ACCESS_RW
;
87 msg
.iotlb
.type
= VHOST_IOTLB_UPDATE
;
89 trace_vhost_vdpa_dma_map(v
, fd
, msg
.type
, msg
.iotlb
.iova
, msg
.iotlb
.size
,
90 msg
.iotlb
.uaddr
, msg
.iotlb
.perm
, msg
.iotlb
.type
);
92 if (write(fd
, &msg
, sizeof(msg
)) != sizeof(msg
)) {
93 error_report("failed to write, fd=%d, errno=%d (%s)",
94 fd
, errno
, strerror(errno
));
101 int vhost_vdpa_dma_unmap(struct vhost_vdpa
*v
, hwaddr iova
, hwaddr size
)
103 struct vhost_msg_v2 msg
= {};
104 int fd
= v
->device_fd
;
107 msg
.type
= v
->msg_type
;
108 msg
.iotlb
.iova
= iova
;
109 msg
.iotlb
.size
= size
;
110 msg
.iotlb
.type
= VHOST_IOTLB_INVALIDATE
;
112 trace_vhost_vdpa_dma_unmap(v
, fd
, msg
.type
, msg
.iotlb
.iova
,
113 msg
.iotlb
.size
, msg
.iotlb
.type
);
115 if (write(fd
, &msg
, sizeof(msg
)) != sizeof(msg
)) {
116 error_report("failed to write, fd=%d, errno=%d (%s)",
117 fd
, errno
, strerror(errno
));
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa
*v
)
126 int fd
= v
->device_fd
;
127 struct vhost_msg_v2 msg
= {
129 .iotlb
.type
= VHOST_IOTLB_BATCH_BEGIN
,
132 trace_vhost_vdpa_listener_begin_batch(v
, fd
, msg
.type
, msg
.iotlb
.type
);
133 if (write(fd
, &msg
, sizeof(msg
)) != sizeof(msg
)) {
134 error_report("failed to write, fd=%d, errno=%d (%s)",
135 fd
, errno
, strerror(errno
));
139 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa
*v
)
141 if (v
->dev
->backend_cap
& (0x1ULL
<< VHOST_BACKEND_F_IOTLB_BATCH
) &&
142 !v
->iotlb_batch_begin_sent
) {
143 vhost_vdpa_listener_begin_batch(v
);
146 v
->iotlb_batch_begin_sent
= true;
149 static void vhost_vdpa_listener_commit(MemoryListener
*listener
)
151 struct vhost_vdpa
*v
= container_of(listener
, struct vhost_vdpa
, listener
);
152 struct vhost_dev
*dev
= v
->dev
;
153 struct vhost_msg_v2 msg
= {};
154 int fd
= v
->device_fd
;
156 if (!(dev
->backend_cap
& (0x1ULL
<< VHOST_BACKEND_F_IOTLB_BATCH
))) {
160 if (!v
->iotlb_batch_begin_sent
) {
164 msg
.type
= v
->msg_type
;
165 msg
.iotlb
.type
= VHOST_IOTLB_BATCH_END
;
167 trace_vhost_vdpa_listener_commit(v
, fd
, msg
.type
, msg
.iotlb
.type
);
168 if (write(fd
, &msg
, sizeof(msg
)) != sizeof(msg
)) {
169 error_report("failed to write, fd=%d, errno=%d (%s)",
170 fd
, errno
, strerror(errno
));
173 v
->iotlb_batch_begin_sent
= false;
176 static void vhost_vdpa_listener_region_add(MemoryListener
*listener
,
177 MemoryRegionSection
*section
)
179 DMAMap mem_region
= {};
180 struct vhost_vdpa
*v
= container_of(listener
, struct vhost_vdpa
, listener
);
182 Int128 llend
, llsize
;
186 if (vhost_vdpa_listener_skipped_section(section
, v
->iova_range
.first
,
187 v
->iova_range
.last
)) {
191 if (unlikely((section
->offset_within_address_space
& ~TARGET_PAGE_MASK
) !=
192 (section
->offset_within_region
& ~TARGET_PAGE_MASK
))) {
193 error_report("%s received unaligned region", __func__
);
197 iova
= TARGET_PAGE_ALIGN(section
->offset_within_address_space
);
198 llend
= vhost_vdpa_section_end(section
);
199 if (int128_ge(int128_make64(iova
), llend
)) {
203 memory_region_ref(section
->mr
);
205 /* Here we assume that memory_region_is_ram(section->mr)==true */
207 vaddr
= memory_region_get_ram_ptr(section
->mr
) +
208 section
->offset_within_region
+
209 (iova
- section
->offset_within_address_space
);
211 trace_vhost_vdpa_listener_region_add(v
, iova
, int128_get64(llend
),
212 vaddr
, section
->readonly
);
214 llsize
= int128_sub(llend
, int128_make64(iova
));
215 if (v
->shadow_vqs_enabled
) {
218 mem_region
.translated_addr
= (hwaddr
)(uintptr_t)vaddr
,
219 mem_region
.size
= int128_get64(llsize
) - 1,
220 mem_region
.perm
= IOMMU_ACCESS_FLAG(true, section
->readonly
),
222 r
= vhost_iova_tree_map_alloc(v
->iova_tree
, &mem_region
);
223 if (unlikely(r
!= IOVA_OK
)) {
224 error_report("Can't allocate a mapping (%d)", r
);
228 iova
= mem_region
.iova
;
231 vhost_vdpa_iotlb_batch_begin_once(v
);
232 ret
= vhost_vdpa_dma_map(v
, iova
, int128_get64(llsize
),
233 vaddr
, section
->readonly
);
235 error_report("vhost vdpa map fail!");
242 if (v
->shadow_vqs_enabled
) {
243 vhost_iova_tree_remove(v
->iova_tree
, mem_region
);
248 * On the initfn path, store the first error in the container so we
249 * can gracefully fail. Runtime, there's not much we can do other
250 * than throw a hardware error.
252 error_report("vhost-vdpa: DMA mapping failed, unable to continue");
257 static void vhost_vdpa_listener_region_del(MemoryListener
*listener
,
258 MemoryRegionSection
*section
)
260 struct vhost_vdpa
*v
= container_of(listener
, struct vhost_vdpa
, listener
);
262 Int128 llend
, llsize
;
265 if (vhost_vdpa_listener_skipped_section(section
, v
->iova_range
.first
,
266 v
->iova_range
.last
)) {
270 if (unlikely((section
->offset_within_address_space
& ~TARGET_PAGE_MASK
) !=
271 (section
->offset_within_region
& ~TARGET_PAGE_MASK
))) {
272 error_report("%s received unaligned region", __func__
);
276 iova
= TARGET_PAGE_ALIGN(section
->offset_within_address_space
);
277 llend
= vhost_vdpa_section_end(section
);
279 trace_vhost_vdpa_listener_region_del(v
, iova
, int128_get64(llend
));
281 if (int128_ge(int128_make64(iova
), llend
)) {
285 llsize
= int128_sub(llend
, int128_make64(iova
));
287 if (v
->shadow_vqs_enabled
) {
288 const DMAMap
*result
;
289 const void *vaddr
= memory_region_get_ram_ptr(section
->mr
) +
290 section
->offset_within_region
+
291 (iova
- section
->offset_within_address_space
);
292 DMAMap mem_region
= {
293 .translated_addr
= (hwaddr
)(uintptr_t)vaddr
,
294 .size
= int128_get64(llsize
) - 1,
297 result
= vhost_iova_tree_find_iova(v
->iova_tree
, &mem_region
);
299 /* The memory listener map wasn't mapped */
303 vhost_iova_tree_remove(v
->iova_tree
, *result
);
305 vhost_vdpa_iotlb_batch_begin_once(v
);
306 ret
= vhost_vdpa_dma_unmap(v
, iova
, int128_get64(llsize
));
308 error_report("vhost_vdpa dma unmap error!");
311 memory_region_unref(section
->mr
);
314 * IOTLB API is used by vhost-vdpa which requires incremental updating
315 * of the mapping. So we can not use generic vhost memory listener which
316 * depends on the addnop().
318 static const MemoryListener vhost_vdpa_memory_listener
= {
319 .name
= "vhost-vdpa",
320 .commit
= vhost_vdpa_listener_commit
,
321 .region_add
= vhost_vdpa_listener_region_add
,
322 .region_del
= vhost_vdpa_listener_region_del
,
325 static int vhost_vdpa_call(struct vhost_dev
*dev
, unsigned long int request
,
328 struct vhost_vdpa
*v
= dev
->opaque
;
329 int fd
= v
->device_fd
;
332 assert(dev
->vhost_ops
->backend_type
== VHOST_BACKEND_TYPE_VDPA
);
334 ret
= ioctl(fd
, request
, arg
);
335 return ret
< 0 ? -errno
: ret
;
338 static int vhost_vdpa_add_status(struct vhost_dev
*dev
, uint8_t status
)
343 trace_vhost_vdpa_add_status(dev
, status
);
344 ret
= vhost_vdpa_call(dev
, VHOST_VDPA_GET_STATUS
, &s
);
351 ret
= vhost_vdpa_call(dev
, VHOST_VDPA_SET_STATUS
, &s
);
356 ret
= vhost_vdpa_call(dev
, VHOST_VDPA_GET_STATUS
, &s
);
368 static void vhost_vdpa_get_iova_range(struct vhost_vdpa
*v
)
370 int ret
= vhost_vdpa_call(v
->dev
, VHOST_VDPA_GET_IOVA_RANGE
,
373 v
->iova_range
.first
= 0;
374 v
->iova_range
.last
= UINT64_MAX
;
377 trace_vhost_vdpa_get_iova_range(v
->dev
, v
->iova_range
.first
,
382 * The use of this function is for requests that only need to be
383 * applied once. Typically such request occurs at the beginning
384 * of operation, and before setting up queues. It should not be
385 * used for request that performs operation until all queues are
386 * set, which would need to check dev->vq_index_end instead.
388 static bool vhost_vdpa_first_dev(struct vhost_dev
*dev
)
390 struct vhost_vdpa
*v
= dev
->opaque
;
392 return v
->index
== 0;
395 static int vhost_vdpa_get_dev_features(struct vhost_dev
*dev
,
400 ret
= vhost_vdpa_call(dev
, VHOST_GET_FEATURES
, features
);
401 trace_vhost_vdpa_get_features(dev
, *features
);
405 static int vhost_vdpa_init_svq(struct vhost_dev
*hdev
, struct vhost_vdpa
*v
,
408 g_autoptr(GPtrArray
) shadow_vqs
= NULL
;
409 uint64_t dev_features
, svq_features
;
413 if (!v
->shadow_vqs_enabled
) {
417 r
= vhost_vdpa_get_dev_features(hdev
, &dev_features
);
419 error_setg_errno(errp
, -r
, "Can't get vdpa device features");
423 svq_features
= dev_features
;
424 ok
= vhost_svq_valid_features(svq_features
, errp
);
429 shadow_vqs
= g_ptr_array_new_full(hdev
->nvqs
, vhost_svq_free
);
430 for (unsigned n
= 0; n
< hdev
->nvqs
; ++n
) {
431 VhostShadowVirtqueue
*svq
;
433 svq
= vhost_svq_new(v
->shadow_vq_ops
, v
->shadow_vq_ops_opaque
);
434 g_ptr_array_add(shadow_vqs
, svq
);
437 v
->shadow_vqs
= g_steal_pointer(&shadow_vqs
);
441 static int vhost_vdpa_init(struct vhost_dev
*dev
, void *opaque
, Error
**errp
)
443 struct vhost_vdpa
*v
;
444 assert(dev
->vhost_ops
->backend_type
== VHOST_BACKEND_TYPE_VDPA
);
445 trace_vhost_vdpa_init(dev
, opaque
);
449 * Similar to VFIO, we end up pinning all guest memory and have to
450 * disable discarding of RAM.
452 ret
= ram_block_discard_disable(true);
454 error_report("Cannot set discarding of RAM broken");
460 dev
->opaque
= opaque
;
461 v
->listener
= vhost_vdpa_memory_listener
;
462 v
->msg_type
= VHOST_IOTLB_MSG_V2
;
463 ret
= vhost_vdpa_init_svq(dev
, v
, errp
);
468 vhost_vdpa_get_iova_range(v
);
470 if (!vhost_vdpa_first_dev(dev
)) {
474 vhost_vdpa_add_status(dev
, VIRTIO_CONFIG_S_ACKNOWLEDGE
|
475 VIRTIO_CONFIG_S_DRIVER
);
480 ram_block_discard_disable(false);
484 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev
*dev
,
487 size_t page_size
= qemu_real_host_page_size();
488 struct vhost_vdpa
*v
= dev
->opaque
;
489 VirtIODevice
*vdev
= dev
->vdev
;
490 VhostVDPAHostNotifier
*n
;
492 n
= &v
->notifier
[queue_index
];
495 virtio_queue_set_host_notifier_mr(vdev
, queue_index
, &n
->mr
, false);
496 object_unparent(OBJECT(&n
->mr
));
497 munmap(n
->addr
, page_size
);
502 static int vhost_vdpa_host_notifier_init(struct vhost_dev
*dev
, int queue_index
)
504 size_t page_size
= qemu_real_host_page_size();
505 struct vhost_vdpa
*v
= dev
->opaque
;
506 VirtIODevice
*vdev
= dev
->vdev
;
507 VhostVDPAHostNotifier
*n
;
508 int fd
= v
->device_fd
;
512 vhost_vdpa_host_notifier_uninit(dev
, queue_index
);
514 n
= &v
->notifier
[queue_index
];
516 addr
= mmap(NULL
, page_size
, PROT_WRITE
, MAP_SHARED
, fd
,
517 queue_index
* page_size
);
518 if (addr
== MAP_FAILED
) {
522 name
= g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
524 memory_region_init_ram_device_ptr(&n
->mr
, OBJECT(vdev
), name
,
528 if (virtio_queue_set_host_notifier_mr(vdev
, queue_index
, &n
->mr
, true)) {
529 object_unparent(OBJECT(&n
->mr
));
530 munmap(addr
, page_size
);
541 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev
*dev
, int n
)
545 for (i
= dev
->vq_index
; i
< dev
->vq_index
+ n
; i
++) {
546 vhost_vdpa_host_notifier_uninit(dev
, i
);
550 static void vhost_vdpa_host_notifiers_init(struct vhost_dev
*dev
)
552 struct vhost_vdpa
*v
= dev
->opaque
;
555 if (v
->shadow_vqs_enabled
) {
556 /* FIXME SVQ is not compatible with host notifiers mr */
560 for (i
= dev
->vq_index
; i
< dev
->vq_index
+ dev
->nvqs
; i
++) {
561 if (vhost_vdpa_host_notifier_init(dev
, i
)) {
569 vhost_vdpa_host_notifiers_uninit(dev
, i
- dev
->vq_index
);
573 static void vhost_vdpa_svq_cleanup(struct vhost_dev
*dev
)
575 struct vhost_vdpa
*v
= dev
->opaque
;
578 if (!v
->shadow_vqs
) {
582 for (idx
= 0; idx
< v
->shadow_vqs
->len
; ++idx
) {
583 vhost_svq_stop(g_ptr_array_index(v
->shadow_vqs
, idx
));
585 g_ptr_array_free(v
->shadow_vqs
, true);
588 static int vhost_vdpa_cleanup(struct vhost_dev
*dev
)
590 struct vhost_vdpa
*v
;
591 assert(dev
->vhost_ops
->backend_type
== VHOST_BACKEND_TYPE_VDPA
);
593 trace_vhost_vdpa_cleanup(dev
, v
);
594 vhost_vdpa_host_notifiers_uninit(dev
, dev
->nvqs
);
595 memory_listener_unregister(&v
->listener
);
596 vhost_vdpa_svq_cleanup(dev
);
599 ram_block_discard_disable(false);
/* Report the memslot limit of the backend, which is INT_MAX. */
static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    const int limit = INT_MAX;

    trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
    return limit;
}
610 static int vhost_vdpa_set_mem_table(struct vhost_dev
*dev
,
611 struct vhost_memory
*mem
)
613 if (!vhost_vdpa_first_dev(dev
)) {
617 trace_vhost_vdpa_set_mem_table(dev
, mem
->nregions
, mem
->padding
);
618 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE
) &&
619 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS
)) {
621 for (i
= 0; i
< mem
->nregions
; i
++) {
622 trace_vhost_vdpa_dump_regions(dev
, i
,
623 mem
->regions
[i
].guest_phys_addr
,
624 mem
->regions
[i
].memory_size
,
625 mem
->regions
[i
].userspace_addr
,
626 mem
->regions
[i
].flags_padding
);
636 static int vhost_vdpa_set_features(struct vhost_dev
*dev
,
639 struct vhost_vdpa
*v
= dev
->opaque
;
642 if (!vhost_vdpa_first_dev(dev
)) {
646 if (v
->shadow_vqs_enabled
) {
647 if ((v
->acked_features
^ features
) == BIT_ULL(VHOST_F_LOG_ALL
)) {
649 * QEMU is just trying to enable or disable logging. SVQ handles
650 * this sepparately, so no need to forward this.
652 v
->acked_features
= features
;
656 v
->acked_features
= features
;
658 /* We must not ack _F_LOG if SVQ is enabled */
659 features
&= ~BIT_ULL(VHOST_F_LOG_ALL
);
662 trace_vhost_vdpa_set_features(dev
, features
);
663 ret
= vhost_vdpa_call(dev
, VHOST_SET_FEATURES
, &features
);
668 return vhost_vdpa_add_status(dev
, VIRTIO_CONFIG_S_FEATURES_OK
);
671 static int vhost_vdpa_set_backend_cap(struct vhost_dev
*dev
)
674 uint64_t f
= 0x1ULL
<< VHOST_BACKEND_F_IOTLB_MSG_V2
|
675 0x1ULL
<< VHOST_BACKEND_F_IOTLB_BATCH
;
678 if (vhost_vdpa_call(dev
, VHOST_GET_BACKEND_FEATURES
, &features
)) {
684 if (vhost_vdpa_first_dev(dev
)) {
685 r
= vhost_vdpa_call(dev
, VHOST_SET_BACKEND_FEATURES
, &features
);
691 dev
->backend_cap
= features
;
696 static int vhost_vdpa_get_device_id(struct vhost_dev
*dev
,
700 ret
= vhost_vdpa_call(dev
, VHOST_VDPA_GET_DEVICE_ID
, device_id
);
701 trace_vhost_vdpa_get_device_id(dev
, *device_id
);
705 static void vhost_vdpa_reset_svq(struct vhost_vdpa
*v
)
707 if (!v
->shadow_vqs_enabled
) {
711 for (unsigned i
= 0; i
< v
->shadow_vqs
->len
; ++i
) {
712 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, i
);
717 static int vhost_vdpa_reset_device(struct vhost_dev
*dev
)
719 struct vhost_vdpa
*v
= dev
->opaque
;
723 vhost_vdpa_reset_svq(v
);
725 ret
= vhost_vdpa_call(dev
, VHOST_VDPA_SET_STATUS
, &status
);
726 trace_vhost_vdpa_reset_device(dev
, status
);
730 static int vhost_vdpa_get_vq_index(struct vhost_dev
*dev
, int idx
)
732 assert(idx
>= dev
->vq_index
&& idx
< dev
->vq_index
+ dev
->nvqs
);
734 trace_vhost_vdpa_get_vq_index(dev
, idx
, idx
);
738 static int vhost_vdpa_set_vring_ready(struct vhost_dev
*dev
)
741 trace_vhost_vdpa_set_vring_ready(dev
);
742 for (i
= 0; i
< dev
->nvqs
; ++i
) {
743 struct vhost_vring_state state
= {
744 .index
= dev
->vq_index
+ i
,
747 vhost_vdpa_call(dev
, VHOST_VDPA_SET_VRING_ENABLE
, &state
);
752 static void vhost_vdpa_dump_config(struct vhost_dev
*dev
, const uint8_t *config
,
756 char line
[QEMU_HEXDUMP_LINE_LEN
];
758 for (b
= 0; b
< config_len
; b
+= 16) {
759 len
= config_len
- b
;
760 qemu_hexdump_line(line
, b
, config
, len
, false);
761 trace_vhost_vdpa_dump_config(dev
, line
);
765 static int vhost_vdpa_set_config(struct vhost_dev
*dev
, const uint8_t *data
,
766 uint32_t offset
, uint32_t size
,
769 struct vhost_vdpa_config
*config
;
771 unsigned long config_size
= offsetof(struct vhost_vdpa_config
, buf
);
773 trace_vhost_vdpa_set_config(dev
, offset
, size
, flags
);
774 config
= g_malloc(size
+ config_size
);
775 config
->off
= offset
;
777 memcpy(config
->buf
, data
, size
);
778 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG
) &&
779 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG
)) {
780 vhost_vdpa_dump_config(dev
, data
, size
);
782 ret
= vhost_vdpa_call(dev
, VHOST_VDPA_SET_CONFIG
, config
);
787 static int vhost_vdpa_get_config(struct vhost_dev
*dev
, uint8_t *config
,
788 uint32_t config_len
, Error
**errp
)
790 struct vhost_vdpa_config
*v_config
;
791 unsigned long config_size
= offsetof(struct vhost_vdpa_config
, buf
);
794 trace_vhost_vdpa_get_config(dev
, config
, config_len
);
795 v_config
= g_malloc(config_len
+ config_size
);
796 v_config
->len
= config_len
;
798 ret
= vhost_vdpa_call(dev
, VHOST_VDPA_GET_CONFIG
, v_config
);
799 memcpy(config
, v_config
->buf
, config_len
);
801 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG
) &&
802 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG
)) {
803 vhost_vdpa_dump_config(dev
, config
, config_len
);
808 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev
*dev
,
809 struct vhost_vring_state
*ring
)
811 trace_vhost_vdpa_set_vring_base(dev
, ring
->index
, ring
->num
);
812 return vhost_vdpa_call(dev
, VHOST_SET_VRING_BASE
, ring
);
815 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev
*dev
,
816 struct vhost_vring_file
*file
)
818 trace_vhost_vdpa_set_vring_kick(dev
, file
->index
, file
->fd
);
819 return vhost_vdpa_call(dev
, VHOST_SET_VRING_KICK
, file
);
822 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev
*dev
,
823 struct vhost_vring_file
*file
)
825 trace_vhost_vdpa_set_vring_call(dev
, file
->index
, file
->fd
);
826 return vhost_vdpa_call(dev
, VHOST_SET_VRING_CALL
, file
);
829 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev
*dev
,
830 struct vhost_vring_addr
*addr
)
832 trace_vhost_vdpa_set_vring_addr(dev
, addr
->index
, addr
->flags
,
833 addr
->desc_user_addr
, addr
->used_user_addr
,
834 addr
->avail_user_addr
,
835 addr
->log_guest_addr
);
837 return vhost_vdpa_call(dev
, VHOST_SET_VRING_ADDR
, addr
);
842 * Set the shadow virtqueue descriptors to the device
844 * @dev: The vhost device model
845 * @svq: The shadow virtqueue
846 * @idx: The index of the virtqueue in the vhost device
849 * Note that this function does not rewind kick file descriptor if cannot set
852 static int vhost_vdpa_svq_set_fds(struct vhost_dev
*dev
,
853 VhostShadowVirtqueue
*svq
, unsigned idx
,
856 struct vhost_vring_file file
= {
857 .index
= dev
->vq_index
+ idx
,
859 const EventNotifier
*event_notifier
= &svq
->hdev_kick
;
862 r
= event_notifier_init(&svq
->hdev_kick
, 0);
864 error_setg_errno(errp
, -r
, "Couldn't create kick event notifier");
865 goto err_init_hdev_kick
;
868 r
= event_notifier_init(&svq
->hdev_call
, 0);
870 error_setg_errno(errp
, -r
, "Couldn't create call event notifier");
871 goto err_init_hdev_call
;
874 file
.fd
= event_notifier_get_fd(event_notifier
);
875 r
= vhost_vdpa_set_vring_dev_kick(dev
, &file
);
876 if (unlikely(r
!= 0)) {
877 error_setg_errno(errp
, -r
, "Can't set device kick fd");
878 goto err_init_set_dev_fd
;
881 event_notifier
= &svq
->hdev_call
;
882 file
.fd
= event_notifier_get_fd(event_notifier
);
883 r
= vhost_vdpa_set_vring_dev_call(dev
, &file
);
884 if (unlikely(r
!= 0)) {
885 error_setg_errno(errp
, -r
, "Can't set device call fd");
886 goto err_init_set_dev_fd
;
892 event_notifier_set_handler(&svq
->hdev_call
, NULL
);
895 event_notifier_cleanup(&svq
->hdev_kick
);
902 * Unmap a SVQ area in the device
904 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa
*v
, hwaddr addr
)
906 const DMAMap needle
= {
907 .translated_addr
= addr
,
909 const DMAMap
*result
= vhost_iova_tree_find_iova(v
->iova_tree
, &needle
);
913 if (unlikely(!result
)) {
914 error_report("Unable to find SVQ address to unmap");
918 size
= ROUND_UP(result
->size
, qemu_real_host_page_size());
919 r
= vhost_vdpa_dma_unmap(v
, result
->iova
, size
);
920 if (unlikely(r
< 0)) {
921 error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r
), -r
);
925 vhost_iova_tree_remove(v
->iova_tree
, *result
);
928 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev
*dev
,
929 const VhostShadowVirtqueue
*svq
)
931 struct vhost_vdpa
*v
= dev
->opaque
;
932 struct vhost_vring_addr svq_addr
;
934 vhost_svq_get_vring_addr(svq
, &svq_addr
);
936 vhost_vdpa_svq_unmap_ring(v
, svq_addr
.desc_user_addr
);
938 vhost_vdpa_svq_unmap_ring(v
, svq_addr
.used_user_addr
);
942 * Map the SVQ area in the device
944 * @v: Vhost-vdpa device
945 * @needle: The area to search iova
946 * @errorp: Error pointer
948 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa
*v
, DMAMap
*needle
,
953 r
= vhost_iova_tree_map_alloc(v
->iova_tree
, needle
);
954 if (unlikely(r
!= IOVA_OK
)) {
955 error_setg(errp
, "Cannot allocate iova (%d)", r
);
959 r
= vhost_vdpa_dma_map(v
, needle
->iova
, needle
->size
+ 1,
960 (void *)(uintptr_t)needle
->translated_addr
,
961 needle
->perm
== IOMMU_RO
);
962 if (unlikely(r
!= 0)) {
963 error_setg_errno(errp
, -r
, "Cannot map region to device");
964 vhost_iova_tree_remove(v
->iova_tree
, *needle
);
971 * Map the shadow virtqueue rings in the device
973 * @dev: The vhost device
974 * @svq: The shadow virtqueue
975 * @addr: Assigned IOVA addresses
976 * @errp: Error pointer
978 static bool vhost_vdpa_svq_map_rings(struct vhost_dev
*dev
,
979 const VhostShadowVirtqueue
*svq
,
980 struct vhost_vring_addr
*addr
,
984 DMAMap device_region
, driver_region
;
985 struct vhost_vring_addr svq_addr
;
986 struct vhost_vdpa
*v
= dev
->opaque
;
987 size_t device_size
= vhost_svq_device_area_size(svq
);
988 size_t driver_size
= vhost_svq_driver_area_size(svq
);
992 vhost_svq_get_vring_addr(svq
, &svq_addr
);
994 driver_region
= (DMAMap
) {
995 .translated_addr
= svq_addr
.desc_user_addr
,
996 .size
= driver_size
- 1,
999 ok
= vhost_vdpa_svq_map_ring(v
, &driver_region
, errp
);
1000 if (unlikely(!ok
)) {
1001 error_prepend(errp
, "Cannot create vq driver region: ");
1004 addr
->desc_user_addr
= driver_region
.iova
;
1005 avail_offset
= svq_addr
.avail_user_addr
- svq_addr
.desc_user_addr
;
1006 addr
->avail_user_addr
= driver_region
.iova
+ avail_offset
;
1008 device_region
= (DMAMap
) {
1009 .translated_addr
= svq_addr
.used_user_addr
,
1010 .size
= device_size
- 1,
1013 ok
= vhost_vdpa_svq_map_ring(v
, &device_region
, errp
);
1014 if (unlikely(!ok
)) {
1015 error_prepend(errp
, "Cannot create vq device region: ");
1016 vhost_vdpa_svq_unmap_ring(v
, driver_region
.translated_addr
);
1018 addr
->used_user_addr
= device_region
.iova
;
1023 static bool vhost_vdpa_svq_setup(struct vhost_dev
*dev
,
1024 VhostShadowVirtqueue
*svq
, unsigned idx
,
1027 uint16_t vq_index
= dev
->vq_index
+ idx
;
1028 struct vhost_vring_state s
= {
1033 r
= vhost_vdpa_set_dev_vring_base(dev
, &s
);
1035 error_setg_errno(errp
, -r
, "Cannot set vring base");
1039 r
= vhost_vdpa_svq_set_fds(dev
, svq
, idx
, errp
);
1043 static bool vhost_vdpa_svqs_start(struct vhost_dev
*dev
)
1045 struct vhost_vdpa
*v
= dev
->opaque
;
1049 if (!v
->shadow_vqs_enabled
) {
1053 for (i
= 0; i
< v
->shadow_vqs
->len
; ++i
) {
1054 VirtQueue
*vq
= virtio_get_queue(dev
->vdev
, dev
->vq_index
+ i
);
1055 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, i
);
1056 struct vhost_vring_addr addr
= {
1057 .index
= dev
->vq_index
+ i
,
1060 bool ok
= vhost_vdpa_svq_setup(dev
, svq
, i
, &err
);
1061 if (unlikely(!ok
)) {
1065 vhost_svq_start(svq
, dev
->vdev
, vq
, v
->iova_tree
);
1066 ok
= vhost_vdpa_svq_map_rings(dev
, svq
, &addr
, &err
);
1067 if (unlikely(!ok
)) {
1071 /* Override vring GPA set by vhost subsystem */
1072 r
= vhost_vdpa_set_vring_dev_addr(dev
, &addr
);
1073 if (unlikely(r
!= 0)) {
1074 error_setg_errno(&err
, -r
, "Cannot set device address");
1082 vhost_vdpa_svq_unmap_rings(dev
, g_ptr_array_index(v
->shadow_vqs
, i
));
1085 vhost_svq_stop(g_ptr_array_index(v
->shadow_vqs
, i
));
1088 error_reportf_err(err
, "Cannot setup SVQ %u: ", i
);
1089 for (unsigned j
= 0; j
< i
; ++j
) {
1090 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, j
);
1091 vhost_vdpa_svq_unmap_rings(dev
, svq
);
1092 vhost_svq_stop(svq
);
1098 static void vhost_vdpa_svqs_stop(struct vhost_dev
*dev
)
1100 struct vhost_vdpa
*v
= dev
->opaque
;
1102 if (!v
->shadow_vqs_enabled
) {
1106 for (unsigned i
= 0; i
< v
->shadow_vqs
->len
; ++i
) {
1107 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, i
);
1108 vhost_vdpa_svq_unmap_rings(dev
, svq
);
1110 event_notifier_cleanup(&svq
->hdev_kick
);
1111 event_notifier_cleanup(&svq
->hdev_call
);
1115 static int vhost_vdpa_dev_start(struct vhost_dev
*dev
, bool started
)
1117 struct vhost_vdpa
*v
= dev
->opaque
;
1119 trace_vhost_vdpa_dev_start(dev
, started
);
1122 vhost_vdpa_host_notifiers_init(dev
);
1123 ok
= vhost_vdpa_svqs_start(dev
);
1124 if (unlikely(!ok
)) {
1127 vhost_vdpa_set_vring_ready(dev
);
1129 vhost_vdpa_svqs_stop(dev
);
1130 vhost_vdpa_host_notifiers_uninit(dev
, dev
->nvqs
);
1133 if (dev
->vq_index
+ dev
->nvqs
!= dev
->vq_index_end
) {
1138 memory_listener_register(&v
->listener
, &address_space_memory
);
1139 return vhost_vdpa_add_status(dev
, VIRTIO_CONFIG_S_DRIVER_OK
);
1141 vhost_vdpa_reset_device(dev
);
1142 vhost_vdpa_add_status(dev
, VIRTIO_CONFIG_S_ACKNOWLEDGE
|
1143 VIRTIO_CONFIG_S_DRIVER
);
1144 memory_listener_unregister(&v
->listener
);
1150 static int vhost_vdpa_set_log_base(struct vhost_dev
*dev
, uint64_t base
,
1151 struct vhost_log
*log
)
1153 struct vhost_vdpa
*v
= dev
->opaque
;
1154 if (v
->shadow_vqs_enabled
|| !vhost_vdpa_first_dev(dev
)) {
1158 trace_vhost_vdpa_set_log_base(dev
, base
, log
->size
, log
->refcnt
, log
->fd
,
1160 return vhost_vdpa_call(dev
, VHOST_SET_LOG_BASE
, &base
);
1163 static int vhost_vdpa_set_vring_addr(struct vhost_dev
*dev
,
1164 struct vhost_vring_addr
*addr
)
1166 struct vhost_vdpa
*v
= dev
->opaque
;
1168 if (v
->shadow_vqs_enabled
) {
1170 * Device vring addr was set at device start. SVQ base is handled by
1176 return vhost_vdpa_set_vring_dev_addr(dev
, addr
);
1179 static int vhost_vdpa_set_vring_num(struct vhost_dev
*dev
,
1180 struct vhost_vring_state
*ring
)
1182 trace_vhost_vdpa_set_vring_num(dev
, ring
->index
, ring
->num
);
1183 return vhost_vdpa_call(dev
, VHOST_SET_VRING_NUM
, ring
);
1186 static int vhost_vdpa_set_vring_base(struct vhost_dev
*dev
,
1187 struct vhost_vring_state
*ring
)
1189 struct vhost_vdpa
*v
= dev
->opaque
;
1190 VirtQueue
*vq
= virtio_get_queue(dev
->vdev
, ring
->index
);
1193 * vhost-vdpa devices does not support in-flight requests. Set all of them
1196 * TODO: This is ok for networking, but other kinds of devices might
1197 * have problems with these retransmissions.
1199 while (virtqueue_rewind(vq
, 1)) {
1202 if (v
->shadow_vqs_enabled
) {
1204 * Device vring base was set at device start. SVQ base is handled by
1210 return vhost_vdpa_set_dev_vring_base(dev
, ring
);
1213 static int vhost_vdpa_get_vring_base(struct vhost_dev
*dev
,
1214 struct vhost_vring_state
*ring
)
1216 struct vhost_vdpa
*v
= dev
->opaque
;
1219 if (v
->shadow_vqs_enabled
) {
1220 ring
->num
= virtio_queue_get_last_avail_idx(dev
->vdev
, ring
->index
);
1224 ret
= vhost_vdpa_call(dev
, VHOST_GET_VRING_BASE
, ring
);
1225 trace_vhost_vdpa_get_vring_base(dev
, ring
->index
, ring
->num
);
1229 static int vhost_vdpa_set_vring_kick(struct vhost_dev
*dev
,
1230 struct vhost_vring_file
*file
)
1232 struct vhost_vdpa
*v
= dev
->opaque
;
1233 int vdpa_idx
= file
->index
- dev
->vq_index
;
1235 if (v
->shadow_vqs_enabled
) {
1236 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, vdpa_idx
);
1237 vhost_svq_set_svq_kick_fd(svq
, file
->fd
);
1240 return vhost_vdpa_set_vring_dev_kick(dev
, file
);
1244 static int vhost_vdpa_set_vring_call(struct vhost_dev
*dev
,
1245 struct vhost_vring_file
*file
)
1247 struct vhost_vdpa
*v
= dev
->opaque
;
1249 if (v
->shadow_vqs_enabled
) {
1250 int vdpa_idx
= file
->index
- dev
->vq_index
;
1251 VhostShadowVirtqueue
*svq
= g_ptr_array_index(v
->shadow_vqs
, vdpa_idx
);
1253 vhost_svq_set_svq_call_fd(svq
, file
->fd
);
1256 return vhost_vdpa_set_vring_dev_call(dev
, file
);
1260 static int vhost_vdpa_get_features(struct vhost_dev
*dev
,
1263 struct vhost_vdpa
*v
= dev
->opaque
;
1264 int ret
= vhost_vdpa_get_dev_features(dev
, features
);
1266 if (ret
== 0 && v
->shadow_vqs_enabled
) {
1267 /* Add SVQ logging capabilities */
1268 *features
|= BIT_ULL(VHOST_F_LOG_ALL
);
1274 static int vhost_vdpa_set_owner(struct vhost_dev
*dev
)
1276 if (!vhost_vdpa_first_dev(dev
)) {
1280 trace_vhost_vdpa_set_owner(dev
);
1281 return vhost_vdpa_call(dev
, VHOST_SET_OWNER
, NULL
);
1284 static int vhost_vdpa_vq_get_addr(struct vhost_dev
*dev
,
1285 struct vhost_vring_addr
*addr
, struct vhost_virtqueue
*vq
)
1287 assert(dev
->vhost_ops
->backend_type
== VHOST_BACKEND_TYPE_VDPA
);
1288 addr
->desc_user_addr
= (uint64_t)(unsigned long)vq
->desc_phys
;
1289 addr
->avail_user_addr
= (uint64_t)(unsigned long)vq
->avail_phys
;
1290 addr
->used_user_addr
= (uint64_t)(unsigned long)vq
->used_phys
;
1291 trace_vhost_vdpa_vq_get_addr(dev
, vq
, addr
->desc_user_addr
,
1292 addr
->avail_user_addr
, addr
->used_user_addr
);
1296 static bool vhost_vdpa_force_iommu(struct vhost_dev
*dev
)
1301 const VhostOps vdpa_ops
= {
1302 .backend_type
= VHOST_BACKEND_TYPE_VDPA
,
1303 .vhost_backend_init
= vhost_vdpa_init
,
1304 .vhost_backend_cleanup
= vhost_vdpa_cleanup
,
1305 .vhost_set_log_base
= vhost_vdpa_set_log_base
,
1306 .vhost_set_vring_addr
= vhost_vdpa_set_vring_addr
,
1307 .vhost_set_vring_num
= vhost_vdpa_set_vring_num
,
1308 .vhost_set_vring_base
= vhost_vdpa_set_vring_base
,
1309 .vhost_get_vring_base
= vhost_vdpa_get_vring_base
,
1310 .vhost_set_vring_kick
= vhost_vdpa_set_vring_kick
,
1311 .vhost_set_vring_call
= vhost_vdpa_set_vring_call
,
1312 .vhost_get_features
= vhost_vdpa_get_features
,
1313 .vhost_set_backend_cap
= vhost_vdpa_set_backend_cap
,
1314 .vhost_set_owner
= vhost_vdpa_set_owner
,
1315 .vhost_set_vring_endian
= NULL
,
1316 .vhost_backend_memslots_limit
= vhost_vdpa_memslots_limit
,
1317 .vhost_set_mem_table
= vhost_vdpa_set_mem_table
,
1318 .vhost_set_features
= vhost_vdpa_set_features
,
1319 .vhost_reset_device
= vhost_vdpa_reset_device
,
1320 .vhost_get_vq_index
= vhost_vdpa_get_vq_index
,
1321 .vhost_get_config
= vhost_vdpa_get_config
,
1322 .vhost_set_config
= vhost_vdpa_set_config
,
1323 .vhost_requires_shm_log
= NULL
,
1324 .vhost_migration_done
= NULL
,
1325 .vhost_backend_can_merge
= NULL
,
1326 .vhost_net_set_mtu
= NULL
,
1327 .vhost_set_iotlb_callback
= NULL
,
1328 .vhost_send_device_iotlb_msg
= NULL
,
1329 .vhost_dev_start
= vhost_vdpa_dev_start
,
1330 .vhost_get_device_id
= vhost_vdpa_get_device_id
,
1331 .vhost_vq_get_addr
= vhost_vdpa_vq_get_addr
,
1332 .vhost_force_iommu
= vhost_vdpa_force_iommu
,