1 /*
2 * vhost-vdpa
3 *
4 * Copyright(c) 2017-2018 Intel Corporation.
5 * Copyright(c) 2020 Red Hat, Inc.
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
8 * See the COPYING file in the top-level directory.
9 *
10 */
11
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "migration/blocker.h"
24 #include "qemu/cutils.h"
25 #include "qemu/main-loop.h"
26 #include "cpu.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29
30 /*
31 * Return one past the end of the section. Be careful with uint64_t
32 * conversions!
33 */
34 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
35 {
36 Int128 llend = int128_make64(section->offset_within_address_space);
37 llend = int128_add(llend, section->size);
38 llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
39
40 return llend;
41 }
42
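/*
 * Return true if the memory listener must skip this section: regions that
 * are neither RAM nor IOMMU backed, protected or RAM-device (MMIO) regions,
 * and sections that fall outside the device's usable IOVA range.
 */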
43 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
44 uint64_t iova_min,
45 uint64_t iova_max)
46 {
47 Int128 llend;
48
49 if ((!memory_region_is_ram(section->mr) &&
50 !memory_region_is_iommu(section->mr)) ||
51 memory_region_is_protected(section->mr) ||
52 /* vhost-vDPA doesn't allow MMIO to be mapped */
53 memory_region_is_ram_device(section->mr)) {
54 return true;
55 }
56
57 if (section->offset_within_address_space < iova_min) {
58 error_report("RAM section out of device range (min=0x%" PRIx64
59 ", addr=0x%" HWADDR_PRIx ")",
60 iova_min, section->offset_within_address_space);
61 return true;
62 }
63
64 llend = vhost_vdpa_section_end(section);
65 if (int128_gt(llend, int128_make64(iova_max))) {
66 error_report("RAM section out of device range (max=0x%" PRIx64
67 ", end addr=0x%" PRIx64 ")",
68 iova_max, int128_get64(llend));
69 return true;
70 }
71
72 return false;
73 }
74
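/*
 * Map [iova, iova + size) to the host address vaddr in the device IOTLB by
 * writing a VHOST_IOTLB_UPDATE message to the vhost-vdpa device fd.
 */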
75 int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
76 void *vaddr, bool readonly)
77 {
78 struct vhost_msg_v2 msg = {};
79 int fd = v->device_fd;
80 int ret = 0;
81
82 msg.type = v->msg_type;
83 msg.iotlb.iova = iova;
84 msg.iotlb.size = size;
85 msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
86 msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
87 msg.iotlb.type = VHOST_IOTLB_UPDATE;
88
89 trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
90 msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
91
92 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
93 error_report("failed to write, fd=%d, errno=%d (%s)",
94 fd, errno, strerror(errno));
95 return -EIO;
96 }
97
98 return ret;
99 }
100
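/*
 * Remove the IOTLB entries covering [iova, iova + size) by writing a
 * VHOST_IOTLB_INVALIDATE message to the vhost-vdpa device fd.
 */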
101 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
102 {
103 struct vhost_msg_v2 msg = {};
104 int fd = v->device_fd;
105 int ret = 0;
106
107 msg.type = v->msg_type;
108 msg.iotlb.iova = iova;
109 msg.iotlb.size = size;
110 msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
111
112 trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
113 msg.iotlb.size, msg.iotlb.type);
114
115 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
116 error_report("failed to write, fd=%d, errno=%d (%s)",
117 fd, errno, strerror(errno));
118 return -EIO;
119 }
120
121 return ret;
122 }
123
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
125 {
126 int fd = v->device_fd;
127 struct vhost_msg_v2 msg = {
128 .type = v->msg_type,
129 .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
130 };
131
132 trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
133 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
134 error_report("failed to write, fd=%d, errno=%d (%s)",
135 fd, errno, strerror(errno));
136 }
137 }
138
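/*
 * Send VHOST_IOTLB_BATCH_BEGIN at most once per batch, and only if the
 * backend advertises VHOST_BACKEND_F_IOTLB_BATCH. The batch is closed by
 * vhost_vdpa_listener_commit().
 */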
139 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
140 {
141 if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
142 !v->iotlb_batch_begin_sent) {
143 vhost_vdpa_listener_begin_batch(v);
144 }
145
146 v->iotlb_batch_begin_sent = true;
147 }
148
149 static void vhost_vdpa_listener_commit(MemoryListener *listener)
150 {
151 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
152 struct vhost_dev *dev = v->dev;
153 struct vhost_msg_v2 msg = {};
154 int fd = v->device_fd;
155
156 if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
157 return;
158 }
159
160 if (!v->iotlb_batch_begin_sent) {
161 return;
162 }
163
164 msg.type = v->msg_type;
165 msg.iotlb.type = VHOST_IOTLB_BATCH_END;
166
167 trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
168 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
169 error_report("failed to write, fd=%d, errno=%d (%s)",
170 fd, errno, strerror(errno));
171 }
172
173 v->iotlb_batch_begin_sent = false;
174 }
175
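/*
 * Map a RAM section added to the address space. When shadow virtqueues are
 * enabled, an IOVA is allocated from the iova_tree instead of reusing the
 * guest physical address.
 */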
176 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
177 MemoryRegionSection *section)
178 {
179 DMAMap mem_region = {};
180 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
181 hwaddr iova;
182 Int128 llend, llsize;
183 void *vaddr;
184 int ret;
185
186 if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
187 v->iova_range.last)) {
188 return;
189 }
190
191 if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
192 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
193 error_report("%s received unaligned region", __func__);
194 return;
195 }
196
197 iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
198 llend = vhost_vdpa_section_end(section);
199 if (int128_ge(int128_make64(iova), llend)) {
200 return;
201 }
202
203 memory_region_ref(section->mr);
204
205 /* Here we assume that memory_region_is_ram(section->mr)==true */
206
207 vaddr = memory_region_get_ram_ptr(section->mr) +
208 section->offset_within_region +
209 (iova - section->offset_within_address_space);
210
211 trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
212 vaddr, section->readonly);
213
214 llsize = int128_sub(llend, int128_make64(iova));
215 if (v->shadow_vqs_enabled) {
216 int r;
217
218 mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
219 mem_region.size = int128_get64(llsize) - 1;
220 mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);
221
222 r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
223 if (unlikely(r != IOVA_OK)) {
224 error_report("Can't allocate a mapping (%d)", r);
225 goto fail;
226 }
227
228 iova = mem_region.iova;
229 }
230
231 vhost_vdpa_iotlb_batch_begin_once(v);
232 ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
233 vaddr, section->readonly);
234 if (ret) {
235 error_report("vhost vdpa map fail!");
236 goto fail_map;
237 }
238
239 return;
240
241 fail_map:
242 if (v->shadow_vqs_enabled) {
243 vhost_iova_tree_remove(v->iova_tree, mem_region);
244 }
245
246 fail:
247 /*
248 * On the initfn path, store the first error in the container so we
249 * can gracefully fail. At runtime, there's not much we can do other
250 * than throw a hardware error.
251 */
252 error_report("vhost-vdpa: DMA mapping failed, unable to continue");
253 return;
254
255 }
256
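/*
 * Unmap a RAM section removed from the address space, looking up the
 * allocated IOVA in the iova_tree when shadow virtqueues are enabled.
 */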
257 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
258 MemoryRegionSection *section)
259 {
260 struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
261 hwaddr iova;
262 Int128 llend, llsize;
263 int ret;
264
265 if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
266 v->iova_range.last)) {
267 return;
268 }
269
270 if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
271 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
272 error_report("%s received unaligned region", __func__);
273 return;
274 }
275
276 iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
277 llend = vhost_vdpa_section_end(section);
278
279 trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
280
281 if (int128_ge(int128_make64(iova), llend)) {
282 return;
283 }
284
285 llsize = int128_sub(llend, int128_make64(iova));
286
287 if (v->shadow_vqs_enabled) {
288 const DMAMap *result;
289 const void *vaddr = memory_region_get_ram_ptr(section->mr) +
290 section->offset_within_region +
291 (iova - section->offset_within_address_space);
292 DMAMap mem_region = {
293 .translated_addr = (hwaddr)(uintptr_t)vaddr,
294 .size = int128_get64(llsize) - 1,
295 };
296
297 result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
298 if (!result) {
299 /* The memory region was not mapped by the listener */
300 return;
301 }
302 iova = result->iova;
303 vhost_iova_tree_remove(v->iova_tree, *result);
304 }
305 vhost_vdpa_iotlb_batch_begin_once(v);
306 ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
307 if (ret) {
308 error_report("vhost_vdpa dma unmap error!");
309 }
310
311 memory_region_unref(section->mr);
312 }
313 /*
314 * vhost-vdpa uses the IOTLB API, which requires incremental updates of
315 * the mapping, so we cannot use the generic vhost memory listener, which
316 * depends on addnop().
317 */
318 static const MemoryListener vhost_vdpa_memory_listener = {
319 .name = "vhost-vdpa",
320 .commit = vhost_vdpa_listener_commit,
321 .region_add = vhost_vdpa_listener_region_add,
322 .region_del = vhost_vdpa_listener_region_del,
323 };
324
325 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
326 void *arg)
327 {
328 struct vhost_vdpa *v = dev->opaque;
329 int fd = v->device_fd;
330 int ret;
331
332 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
333
334 ret = ioctl(fd, request, arg);
335 return ret < 0 ? -errno : ret;
336 }
337
338 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
339 {
340 uint8_t s;
341 int ret;
342
343 trace_vhost_vdpa_add_status(dev, status);
344 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
345 if (ret < 0) {
346 return ret;
347 }
348
349 s |= status;
350
351 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
352 if (ret < 0) {
353 return ret;
354 }
355
356 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
357 if (ret < 0) {
358 return ret;
359 }
360
361 if (!(s & status)) {
362 return -EIO;
363 }
364
365 return 0;
366 }
367
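/*
 * Query the usable IOVA range from the device, falling back to the whole
 * 64-bit space if the ioctl fails.
 */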
368 static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
369 {
370 int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
371 &v->iova_range);
372 if (ret != 0) {
373 v->iova_range.first = 0;
374 v->iova_range.last = UINT64_MAX;
375 }
376
377 trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
378 v->iova_range.last);
379 }
380
381 /*
382 * This function is for requests that only need to be applied once.
383 * Typically such a request occurs at the beginning of operation,
384 * before the queues are set up. It should not be used for requests
385 * that must wait until all queues are set, which would need to check
386 * dev->vq_index_end instead.
387 */
388 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
389 {
390 struct vhost_vdpa *v = dev->opaque;
391
392 return v->index == 0;
393 }
394
395 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
396 uint64_t *features)
397 {
398 int ret;
399
400 ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
401 trace_vhost_vdpa_get_features(dev, *features);
402 return ret;
403 }
404
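/*
 * Allocate the shadow virtqueues if SVQ is enabled, after checking that the
 * device features are compatible with SVQ.
 */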
405 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
406 Error **errp)
407 {
408 g_autoptr(GPtrArray) shadow_vqs = NULL;
409 uint64_t dev_features, svq_features;
410 int r;
411 bool ok;
412
413 if (!v->shadow_vqs_enabled) {
414 return 0;
415 }
416
417 r = vhost_vdpa_get_dev_features(hdev, &dev_features);
418 if (r != 0) {
419 error_setg_errno(errp, -r, "Can't get vdpa device features");
420 return r;
421 }
422
423 svq_features = dev_features;
424 ok = vhost_svq_valid_features(svq_features, errp);
425 if (unlikely(!ok)) {
426 return -1;
427 }
428
429 shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
430 for (unsigned n = 0; n < hdev->nvqs; ++n) {
431 VhostShadowVirtqueue *svq;
432
433 svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
434 g_ptr_array_add(shadow_vqs, svq);
435 }
436
437 v->shadow_vqs = g_steal_pointer(&shadow_vqs);
438 return 0;
439 }
440
441 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
442 {
443 struct vhost_vdpa *v;
444 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
445 trace_vhost_vdpa_init(dev, opaque);
446 int ret;
447
448 /*
449 * Similar to VFIO, we end up pinning all guest memory and have to
450 * disable discarding of RAM.
451 */
452 ret = ram_block_discard_disable(true);
453 if (ret) {
454 error_report("Cannot set discarding of RAM broken");
455 return ret;
456 }
457
458 v = opaque;
459 v->dev = dev;
460 dev->opaque = opaque;
461 v->listener = vhost_vdpa_memory_listener;
462 v->msg_type = VHOST_IOTLB_MSG_V2;
463 ret = vhost_vdpa_init_svq(dev, v, errp);
464 if (ret) {
465 goto err;
466 }
467
468 vhost_vdpa_get_iova_range(v);
469
470 if (!vhost_vdpa_first_dev(dev)) {
471 return 0;
472 }
473
474 vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
475 VIRTIO_CONFIG_S_DRIVER);
476
477 return 0;
478
479 err:
480 ram_block_discard_disable(false);
481 return ret;
482 }
483
484 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
485 int queue_index)
486 {
487 size_t page_size = qemu_real_host_page_size();
488 struct vhost_vdpa *v = dev->opaque;
489 VirtIODevice *vdev = dev->vdev;
490 VhostVDPAHostNotifier *n;
491
492 n = &v->notifier[queue_index];
493
494 if (n->addr) {
495 virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
496 object_unparent(OBJECT(&n->mr));
497 munmap(n->addr, page_size);
498 n->addr = NULL;
499 }
500 }
501
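/*
 * Map the device's host notifier page for this queue with mmap() and expose
 * it to the guest as a RAM-device memory region.
 */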
502 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
503 {
504 size_t page_size = qemu_real_host_page_size();
505 struct vhost_vdpa *v = dev->opaque;
506 VirtIODevice *vdev = dev->vdev;
507 VhostVDPAHostNotifier *n;
508 int fd = v->device_fd;
509 void *addr;
510 char *name;
511
512 vhost_vdpa_host_notifier_uninit(dev, queue_index);
513
514 n = &v->notifier[queue_index];
515
516 addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
517 queue_index * page_size);
518 if (addr == MAP_FAILED) {
519 goto err;
520 }
521
522 name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
523 v, queue_index);
524 memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
525 page_size, addr);
526 g_free(name);
527
528 if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
529 object_unparent(OBJECT(&n->mr));
530 munmap(addr, page_size);
531 goto err;
532 }
533 n->addr = addr;
534
535 return 0;
536
537 err:
538 return -1;
539 }
540
541 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
542 {
543 int i;
544
545 for (i = dev->vq_index; i < dev->vq_index + n; i++) {
546 vhost_vdpa_host_notifier_uninit(dev, i);
547 }
548 }
549
550 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
551 {
552 struct vhost_vdpa *v = dev->opaque;
553 int i;
554
555 if (v->shadow_vqs_enabled) {
556 /* FIXME SVQ is not compatible with host notifiers mr */
557 return;
558 }
559
560 for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
561 if (vhost_vdpa_host_notifier_init(dev, i)) {
562 goto err;
563 }
564 }
565
566 return;
567
568 err:
569 vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
570 return;
571 }
572
573 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
574 {
575 struct vhost_vdpa *v = dev->opaque;
576 size_t idx;
577
578 if (!v->shadow_vqs) {
579 return;
580 }
581
582 for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
583 vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
584 }
585 g_ptr_array_free(v->shadow_vqs, true);
586 }
587
588 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
589 {
590 struct vhost_vdpa *v;
591 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
592 v = dev->opaque;
593 trace_vhost_vdpa_cleanup(dev, v);
594 vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
595 memory_listener_unregister(&v->listener);
596 vhost_vdpa_svq_cleanup(dev);
597
598 dev->opaque = NULL;
599 ram_block_discard_disable(false);
600
601 return 0;
602 }
603
604 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
605 {
606 trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
607 return INT_MAX;
608 }
609
610 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
611 struct vhost_memory *mem)
612 {
613 if (!vhost_vdpa_first_dev(dev)) {
614 return 0;
615 }
616
617 trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
618 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
619 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
620 int i;
621 for (i = 0; i < mem->nregions; i++) {
622 trace_vhost_vdpa_dump_regions(dev, i,
623 mem->regions[i].guest_phys_addr,
624 mem->regions[i].memory_size,
625 mem->regions[i].userspace_addr,
626 mem->regions[i].flags_padding);
627 }
628 }
629 if (mem->padding) {
630 return -EINVAL;
631 }
632
633 return 0;
634 }
635
636 static int vhost_vdpa_set_features(struct vhost_dev *dev,
637 uint64_t features)
638 {
639 struct vhost_vdpa *v = dev->opaque;
640 int ret;
641
642 if (!vhost_vdpa_first_dev(dev)) {
643 return 0;
644 }
645
646 if (v->shadow_vqs_enabled) {
647 if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
648 /*
649 * QEMU is just trying to enable or disable logging. SVQ handles
650 * this separately, so no need to forward this.
651 */
652 v->acked_features = features;
653 return 0;
654 }
655
656 v->acked_features = features;
657
658 /* We must not ack _F_LOG if SVQ is enabled */
659 features &= ~BIT_ULL(VHOST_F_LOG_ALL);
660 }
661
662 trace_vhost_vdpa_set_features(dev, features);
663 ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
664 if (ret) {
665 return ret;
666 }
667
668 return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
669 }
670
671 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
672 {
673 uint64_t features;
674 uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
675 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
676 int r;
677
678 if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
679 return -EFAULT;
680 }
681
682 features &= f;
683
684 if (vhost_vdpa_first_dev(dev)) {
685 r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
686 if (r) {
687 return -EFAULT;
688 }
689 }
690
691 dev->backend_cap = features;
692
693 return 0;
694 }
695
696 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
697 uint32_t *device_id)
698 {
699 int ret;
700 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
701 trace_vhost_vdpa_get_device_id(dev, *device_id);
702 return ret;
703 }
704
705 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
706 {
707 if (!v->shadow_vqs_enabled) {
708 return;
709 }
710
711 for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
712 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
713 vhost_svq_stop(svq);
714 }
715 }
716
717 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
718 {
719 struct vhost_vdpa *v = dev->opaque;
720 int ret;
721 uint8_t status = 0;
722
723 vhost_vdpa_reset_svq(v);
724
725 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
726 trace_vhost_vdpa_reset_device(dev, status);
727 return ret;
728 }
729
730 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
731 {
732 assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
733
734 trace_vhost_vdpa_get_vq_index(dev, idx, idx);
735 return idx;
736 }
737
738 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
739 {
740 int i;
741 trace_vhost_vdpa_set_vring_ready(dev);
742 for (i = 0; i < dev->nvqs; ++i) {
743 struct vhost_vring_state state = {
744 .index = dev->vq_index + i,
745 .num = 1,
746 };
747 vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
748 }
749 return 0;
750 }
751
752 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
753 uint32_t config_len)
754 {
755 int b, len;
756 char line[QEMU_HEXDUMP_LINE_LEN];
757
758 for (b = 0; b < config_len; b += 16) {
759 len = config_len - b;
760 qemu_hexdump_line(line, b, config, len, false);
761 trace_vhost_vdpa_dump_config(dev, line);
762 }
763 }
764
765 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
766 uint32_t offset, uint32_t size,
767 uint32_t flags)
768 {
769 struct vhost_vdpa_config *config;
770 int ret;
771 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
772
773 trace_vhost_vdpa_set_config(dev, offset, size, flags);
774 config = g_malloc(size + config_size);
775 config->off = offset;
776 config->len = size;
777 memcpy(config->buf, data, size);
778 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
779 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
780 vhost_vdpa_dump_config(dev, data, size);
781 }
782 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
783 g_free(config);
784 return ret;
785 }
786
787 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
788 uint32_t config_len, Error **errp)
789 {
790 struct vhost_vdpa_config *v_config;
791 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
792 int ret;
793
794 trace_vhost_vdpa_get_config(dev, config, config_len);
795 v_config = g_malloc(config_len + config_size);
796 v_config->len = config_len;
797 v_config->off = 0;
798 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
799 memcpy(config, v_config->buf, config_len);
800 g_free(v_config);
801 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
802 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
803 vhost_vdpa_dump_config(dev, config, config_len);
804 }
805 return ret;
806 }
807
808 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
809 struct vhost_vring_state *ring)
810 {
811 trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
812 return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
813 }
814
815 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
816 struct vhost_vring_file *file)
817 {
818 trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
819 return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
820 }
821
822 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
823 struct vhost_vring_file *file)
824 {
825 trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
826 return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
827 }
828
829 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
830 struct vhost_vring_addr *addr)
831 {
832 trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
833 addr->desc_user_addr, addr->used_user_addr,
834 addr->avail_user_addr,
835 addr->log_guest_addr);
836
837 return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
838
839 }
840
841 /**
842 * Set the shadow virtqueue descriptors to the device
843 *
844 * @dev: The vhost device model
845 * @svq: The shadow virtqueue
846 * @idx: The index of the virtqueue in the vhost device
847 * @errp: Error
848 *
849 * Note that this function does not rewind the kick file descriptor if it
850 * cannot set the call one.
851 */
852 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
853 VhostShadowVirtqueue *svq, unsigned idx,
854 Error **errp)
855 {
856 struct vhost_vring_file file = {
857 .index = dev->vq_index + idx,
858 };
859 const EventNotifier *event_notifier = &svq->hdev_kick;
860 int r;
861
862 r = event_notifier_init(&svq->hdev_kick, 0);
863 if (r != 0) {
864 error_setg_errno(errp, -r, "Couldn't create kick event notifier");
865 goto err_init_hdev_kick;
866 }
867
868 r = event_notifier_init(&svq->hdev_call, 0);
869 if (r != 0) {
870 error_setg_errno(errp, -r, "Couldn't create call event notifier");
871 goto err_init_hdev_call;
872 }
873
874 file.fd = event_notifier_get_fd(event_notifier);
875 r = vhost_vdpa_set_vring_dev_kick(dev, &file);
876 if (unlikely(r != 0)) {
877 error_setg_errno(errp, -r, "Can't set device kick fd");
878 goto err_init_set_dev_fd;
879 }
880
881 event_notifier = &svq->hdev_call;
882 file.fd = event_notifier_get_fd(event_notifier);
883 r = vhost_vdpa_set_vring_dev_call(dev, &file);
884 if (unlikely(r != 0)) {
885 error_setg_errno(errp, -r, "Can't set device call fd");
886 goto err_init_set_dev_fd;
887 }
888
889 return 0;
890
891 err_init_set_dev_fd:
892 event_notifier_set_handler(&svq->hdev_call, NULL);
893
894 err_init_hdev_call:
895 event_notifier_cleanup(&svq->hdev_kick);
896
897 err_init_hdev_kick:
898 return r;
899 }
900
901 /**
902 * Unmap an SVQ area in the device
903 */
904 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
905 {
906 const DMAMap needle = {
907 .translated_addr = addr,
908 };
909 const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
910 hwaddr size;
911 int r;
912
913 if (unlikely(!result)) {
914 error_report("Unable to find SVQ address to unmap");
915 return;
916 }
917
918 size = ROUND_UP(result->size, qemu_real_host_page_size());
919 r = vhost_vdpa_dma_unmap(v, result->iova, size);
920 if (unlikely(r < 0)) {
921 error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
922 return;
923 }
924
925 vhost_iova_tree_remove(v->iova_tree, *result);
926 }
927
928 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
929 const VhostShadowVirtqueue *svq)
930 {
931 struct vhost_vdpa *v = dev->opaque;
932 struct vhost_vring_addr svq_addr;
933
934 vhost_svq_get_vring_addr(svq, &svq_addr);
935
936 vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
937
938 vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
939 }
940
941 /**
942 * Map the SVQ area in the device
943 *
944 * @v: Vhost-vdpa device
945 * @needle: The area to map; on success its iova holds the allocated IOVA
946 * @errp: Error pointer
947 */
948 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
949 Error **errp)
950 {
951 int r;
952
953 r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
954 if (unlikely(r != IOVA_OK)) {
955 error_setg(errp, "Cannot allocate iova (%d)", r);
956 return false;
957 }
958
959 r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
960 (void *)(uintptr_t)needle->translated_addr,
961 needle->perm == IOMMU_RO);
962 if (unlikely(r != 0)) {
963 error_setg_errno(errp, -r, "Cannot map region to device");
964 vhost_iova_tree_remove(v->iova_tree, *needle);
965 }
966
967 return r == 0;
968 }
969
970 /**
971 * Map the shadow virtqueue rings in the device
972 *
973 * @dev: The vhost device
974 * @svq: The shadow virtqueue
975 * @addr: Assigned IOVA addresses
976 * @errp: Error pointer
977 */
978 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
979 const VhostShadowVirtqueue *svq,
980 struct vhost_vring_addr *addr,
981 Error **errp)
982 {
983 ERRP_GUARD();
984 DMAMap device_region, driver_region;
985 struct vhost_vring_addr svq_addr;
986 struct vhost_vdpa *v = dev->opaque;
987 size_t device_size = vhost_svq_device_area_size(svq);
988 size_t driver_size = vhost_svq_driver_area_size(svq);
989 size_t avail_offset;
990 bool ok;
991
992 vhost_svq_get_vring_addr(svq, &svq_addr);
993
994 driver_region = (DMAMap) {
995 .translated_addr = svq_addr.desc_user_addr,
996 .size = driver_size - 1,
997 .perm = IOMMU_RO,
998 };
999 ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
1000 if (unlikely(!ok)) {
1001 error_prepend(errp, "Cannot create vq driver region: ");
1002 return false;
1003 }
1004 addr->desc_user_addr = driver_region.iova;
1005 avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
1006 addr->avail_user_addr = driver_region.iova + avail_offset;
1007
1008 device_region = (DMAMap) {
1009 .translated_addr = svq_addr.used_user_addr,
1010 .size = device_size - 1,
1011 .perm = IOMMU_RW,
1012 };
1013 ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
1014 if (unlikely(!ok)) {
1015 error_prepend(errp, "Cannot create vq device region: ");
1016 vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
1017 }
1018 addr->used_user_addr = device_region.iova;
1019
1020 return ok;
1021 }
1022
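/*
 * Set the vring base and the kick/call file descriptors of one shadow
 * virtqueue in the device.
 */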
1023 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
1024 VhostShadowVirtqueue *svq, unsigned idx,
1025 Error **errp)
1026 {
1027 uint16_t vq_index = dev->vq_index + idx;
1028 struct vhost_vring_state s = {
1029 .index = vq_index,
1030 };
1031 int r;
1032
1033 r = vhost_vdpa_set_dev_vring_base(dev, &s);
1034 if (unlikely(r)) {
1035 error_setg_errno(errp, -r, "Cannot set vring base");
1036 return false;
1037 }
1038
1039 r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1040 return r == 0;
1041 }
1042
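/*
 * Start all shadow virtqueues: set up their file descriptors, map their
 * vrings into the device IOVA space and override the vring addresses set by
 * the generic vhost code. Rolls back and returns false on failure.
 */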
1043 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1044 {
1045 struct vhost_vdpa *v = dev->opaque;
1046 Error *err = NULL;
1047 unsigned i;
1048
1049 if (!v->shadow_vqs_enabled) {
1050 return true;
1051 }
1052
1053 for (i = 0; i < v->shadow_vqs->len; ++i) {
1054 VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1055 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1056 struct vhost_vring_addr addr = {
1057 .index = dev->vq_index + i,
1058 };
1059 int r;
1060 bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1061 if (unlikely(!ok)) {
1062 goto err;
1063 }
1064
1065 vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
1066 ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1067 if (unlikely(!ok)) {
1068 goto err_map;
1069 }
1070
1071 /* Override vring GPA set by vhost subsystem */
1072 r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1073 if (unlikely(r != 0)) {
1074 error_setg_errno(&err, -r, "Cannot set device address");
1075 goto err_set_addr;
1076 }
1077 }
1078
1079 return true;
1080
1081 err_set_addr:
1082 vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1083
1084 err_map:
1085 vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1086
1087 err:
1088 error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1089 for (unsigned j = 0; j < i; ++j) {
1090 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1091 vhost_vdpa_svq_unmap_rings(dev, svq);
1092 vhost_svq_stop(svq);
1093 }
1094
1095 return false;
1096 }
1097
1098 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1099 {
1100 struct vhost_vdpa *v = dev->opaque;
1101
1102 if (!v->shadow_vqs_enabled) {
1103 return;
1104 }
1105
1106 for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1107 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1108 vhost_vdpa_svq_unmap_rings(dev, svq);
1109
1110 event_notifier_cleanup(&svq->hdev_kick);
1111 event_notifier_cleanup(&svq->hdev_call);
1112 }
1113 }
1114
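/*
 * Start or stop the device. The memory listener registration and the
 * DRIVER_OK / reset status updates are only performed by the vhost device
 * that covers the last virtqueue (dev->vq_index_end).
 */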
1115 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1116 {
1117 struct vhost_vdpa *v = dev->opaque;
1118 bool ok;
1119 trace_vhost_vdpa_dev_start(dev, started);
1120
1121 if (started) {
1122 vhost_vdpa_host_notifiers_init(dev);
1123 ok = vhost_vdpa_svqs_start(dev);
1124 if (unlikely(!ok)) {
1125 return -1;
1126 }
1127 vhost_vdpa_set_vring_ready(dev);
1128 } else {
1129 vhost_vdpa_svqs_stop(dev);
1130 vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1131 }
1132
1133 if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1134 return 0;
1135 }
1136
1137 if (started) {
1138 memory_listener_register(&v->listener, &address_space_memory);
1139 return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1140 } else {
1141 vhost_vdpa_reset_device(dev);
1142 vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1143 VIRTIO_CONFIG_S_DRIVER);
1144 memory_listener_unregister(&v->listener);
1145
1146 return 0;
1147 }
1148 }
1149
1150 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1151 struct vhost_log *log)
1152 {
1153 struct vhost_vdpa *v = dev->opaque;
1154 if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1155 return 0;
1156 }
1157
1158 trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1159 log->log);
1160 return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1161 }
1162
1163 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1164 struct vhost_vring_addr *addr)
1165 {
1166 struct vhost_vdpa *v = dev->opaque;
1167
1168 if (v->shadow_vqs_enabled) {
1169 /*
1170 * Device vring addr was set at device start. SVQ base is handled by
1171 * VirtQueue code.
1172 */
1173 return 0;
1174 }
1175
1176 return vhost_vdpa_set_vring_dev_addr(dev, addr);
1177 }
1178
1179 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1180 struct vhost_vring_state *ring)
1181 {
1182 trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1183 return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1184 }
1185
1186 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1187 struct vhost_vring_state *ring)
1188 {
1189 struct vhost_vdpa *v = dev->opaque;
1190 VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
1191
1192 /*
1193 * vhost-vdpa devices do not support in-flight requests. Set all of them
1194 * as available.
1195 *
1196 * TODO: This is ok for networking, but other kinds of devices might
1197 * have problems with these retransmissions.
1198 */
1199 while (virtqueue_rewind(vq, 1)) {
1200 continue;
1201 }
1202 if (v->shadow_vqs_enabled) {
1203 /*
1204 * Device vring base was set at device start. SVQ base is handled by
1205 * VirtQueue code.
1206 */
1207 return 0;
1208 }
1209
1210 return vhost_vdpa_set_dev_vring_base(dev, ring);
1211 }
1212
1213 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1214 struct vhost_vring_state *ring)
1215 {
1216 struct vhost_vdpa *v = dev->opaque;
1217 int ret;
1218
1219 if (v->shadow_vqs_enabled) {
1220 ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1221 return 0;
1222 }
1223
1224 ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1225 trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1226 return ret;
1227 }
1228
1229 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1230 struct vhost_vring_file *file)
1231 {
1232 struct vhost_vdpa *v = dev->opaque;
1233 int vdpa_idx = file->index - dev->vq_index;
1234
1235 if (v->shadow_vqs_enabled) {
1236 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1237 vhost_svq_set_svq_kick_fd(svq, file->fd);
1238 return 0;
1239 } else {
1240 return vhost_vdpa_set_vring_dev_kick(dev, file);
1241 }
1242 }
1243
1244 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1245 struct vhost_vring_file *file)
1246 {
1247 struct vhost_vdpa *v = dev->opaque;
1248
1249 if (v->shadow_vqs_enabled) {
1250 int vdpa_idx = file->index - dev->vq_index;
1251 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1252
1253 vhost_svq_set_svq_call_fd(svq, file->fd);
1254 return 0;
1255 } else {
1256 return vhost_vdpa_set_vring_dev_call(dev, file);
1257 }
1258 }
1259
1260 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1261 uint64_t *features)
1262 {
1263 struct vhost_vdpa *v = dev->opaque;
1264 int ret = vhost_vdpa_get_dev_features(dev, features);
1265
1266 if (ret == 0 && v->shadow_vqs_enabled) {
1267 /* Add SVQ logging capabilities */
1268 *features |= BIT_ULL(VHOST_F_LOG_ALL);
1269 }
1270
1271 return ret;
1272 }
1273
1274 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1275 {
1276 if (!vhost_vdpa_first_dev(dev)) {
1277 return 0;
1278 }
1279
1280 trace_vhost_vdpa_set_owner(dev);
1281 return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1282 }
1283
1284 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1285 struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1286 {
1287 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1288 addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1289 addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1290 addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1291 trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1292 addr->avail_user_addr, addr->used_user_addr);
1293 return 0;
1294 }
1295
1296 static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
1297 {
1298 return true;
1299 }
1300
1301 const VhostOps vdpa_ops = {
1302 .backend_type = VHOST_BACKEND_TYPE_VDPA,
1303 .vhost_backend_init = vhost_vdpa_init,
1304 .vhost_backend_cleanup = vhost_vdpa_cleanup,
1305 .vhost_set_log_base = vhost_vdpa_set_log_base,
1306 .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1307 .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1308 .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1309 .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1310 .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1311 .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1312 .vhost_get_features = vhost_vdpa_get_features,
1313 .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1314 .vhost_set_owner = vhost_vdpa_set_owner,
1315 .vhost_set_vring_endian = NULL,
1316 .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1317 .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1318 .vhost_set_features = vhost_vdpa_set_features,
1319 .vhost_reset_device = vhost_vdpa_reset_device,
1320 .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1321 .vhost_get_config = vhost_vdpa_get_config,
1322 .vhost_set_config = vhost_vdpa_set_config,
1323 .vhost_requires_shm_log = NULL,
1324 .vhost_migration_done = NULL,
1325 .vhost_backend_can_merge = NULL,
1326 .vhost_net_set_mtu = NULL,
1327 .vhost_set_iotlb_callback = NULL,
1328 .vhost_send_device_iotlb_msg = NULL,
1329 .vhost_dev_start = vhost_vdpa_dev_start,
1330 .vhost_get_device_id = vhost_vdpa_get_device_id,
1331 .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1332 .vhost_force_iommu = vhost_vdpa_force_iommu,
1333 };