]> git.proxmox.com Git - mirror_qemu.git/blame - hw/virtio/vhost.c
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into...
[mirror_qemu.git] / hw / virtio / vhost.c
CommitLineData
d5970055
MT
1/*
2 * vhost support
3 *
4 * Copyright Red Hat, Inc. 2010
5 *
6 * Authors:
7 * Michael S. Tsirkin <mst@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
6b620ca3
PB
11 *
12 * Contributions after 2012-01-13 are licensed under the terms of the
13 * GNU GPL, version 2 or (at your option) any later version.
d5970055
MT
14 */
15
9b8bfe21 16#include "qemu/osdep.h"
da34e65c 17#include "qapi/error.h"
0d09e41a 18#include "hw/virtio/vhost.h"
5444e768 19#include "qemu/atomic.h"
1de7afc9 20#include "qemu/range.h"
04b7a152 21#include "qemu/error-report.h"
15324404 22#include "qemu/memfd.h"
345cc1cb 23#include "qemu/log.h"
18658a3c 24#include "standard-headers/linux/vhost_types.h"
1c819449 25#include "hw/virtio/virtio-bus.h"
795c40b8 26#include "migration/blocker.h"
ca77ee28 27#include "migration/qemu-file-types.h"
c471ad0e 28#include "sysemu/dma.h"
aa3c40f6 29#include "trace.h"
d5970055 30
162bba7f
MAL
/* enabled until disconnected backend stabilizes */
/*
 * Renamed from _VHOST_DEBUG: identifiers beginning with an underscore
 * followed by an uppercase letter are reserved for the implementation
 * (C11 7.1.3), so file-local macros must not use that form.
 */
#define VHOST_DEBUG 1

#ifdef VHOST_DEBUG
/*
 * Report a failed vhost backend operation together with its errno-style
 * return value; 'retval' is expected to be negative on failure.
 */
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { \
        error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
                     strerror(-retval), -retval); \
    } while (0)
#else
#define VHOST_OPS_DEBUG(retval, fmt, ...) \
    do { } while (0)
#endif
44
309750fa 45static struct vhost_log *vhost_log;
15324404 46static struct vhost_log *vhost_log_shm;
309750fa 47
2ce68e4c
IM
48static unsigned int used_memslots;
49static QLIST_HEAD(, vhost_dev) vhost_devices =
50 QLIST_HEAD_INITIALIZER(vhost_devices);
51
52bool vhost_has_free_slot(void)
53{
54 unsigned int slots_limit = ~0U;
55 struct vhost_dev *hdev;
56
57 QLIST_FOREACH(hdev, &vhost_devices, entry) {
58 unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
59 slots_limit = MIN(slots_limit, r);
60 }
61 return slots_limit > used_memslots;
62}
63
d5970055 64static void vhost_dev_sync_region(struct vhost_dev *dev,
2817b260 65 MemoryRegionSection *section,
d5970055
MT
66 uint64_t mfirst, uint64_t mlast,
67 uint64_t rfirst, uint64_t rlast)
68{
309750fa
JW
69 vhost_log_chunk_t *log = dev->log->log;
70
d5970055
MT
71 uint64_t start = MAX(mfirst, rfirst);
72 uint64_t end = MIN(mlast, rlast);
309750fa
JW
73 vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
74 vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
33c5793b 75 uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
d5970055 76
d5970055
MT
77 if (end < start) {
78 return;
79 }
e314672a 80 assert(end / VHOST_LOG_CHUNK < dev->log_size);
fbbaf9ae 81 assert(start / VHOST_LOG_CHUNK < dev->log_size);
e314672a 82
d5970055
MT
83 for (;from < to; ++from) {
84 vhost_log_chunk_t log;
d5970055
MT
85 /* We first check with non-atomic: much cheaper,
86 * and we expect non-dirty to be the common case. */
87 if (!*from) {
0c600ce2 88 addr += VHOST_LOG_CHUNK;
d5970055
MT
89 continue;
90 }
5444e768
PB
91 /* Data must be read atomically. We don't really need barrier semantics
92 * but it's easier to use atomic_* than roll our own. */
d73415a3 93 log = qatomic_xchg(from, 0);
747eb78b
NC
94 while (log) {
95 int bit = ctzl(log);
6b37a23d
MT
96 hwaddr page_addr;
97 hwaddr section_offset;
98 hwaddr mr_offset;
6b37a23d
MT
99 page_addr = addr + bit * VHOST_LOG_PAGE;
100 section_offset = page_addr - section->offset_within_address_space;
101 mr_offset = section_offset + section->offset_within_region;
102 memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
d5970055
MT
103 log &= ~(0x1ull << bit);
104 }
105 addr += VHOST_LOG_CHUNK;
106 }
107}
108
74b5d2b5 109bool vhost_dev_has_iommu(struct vhost_dev *dev)
345cc1cb
JW
110{
111 VirtIODevice *vdev = dev->vdev;
112
113 /*
114 * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
115 * incremental memory mapping API via IOTLB API. For platform that
116 * does not have IOMMU, there's no need to enable this feature
117 * which may cause unnecessary IOTLB miss/update transactions.
118 */
119 if (vdev) {
120 return virtio_bus_device_iommu_enabled(vdev) &&
121 virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
122 } else {
123 return false;
124 }
125}
126
04097f7c 127static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
2817b260 128 MemoryRegionSection *section,
6b37a23d
MT
129 hwaddr first,
130 hwaddr last)
d5970055 131{
d5970055 132 int i;
6b37a23d
MT
133 hwaddr start_addr;
134 hwaddr end_addr;
04097f7c 135
d5970055
MT
136 if (!dev->log_enabled || !dev->started) {
137 return 0;
138 }
6b37a23d 139 start_addr = section->offset_within_address_space;
052e87b0 140 end_addr = range_get_last(start_addr, int128_get64(section->size));
6b37a23d
MT
141 start_addr = MAX(first, start_addr);
142 end_addr = MIN(last, end_addr);
143
d5970055
MT
144 for (i = 0; i < dev->mem->nregions; ++i) {
145 struct vhost_memory_region *reg = dev->mem->regions + i;
2817b260 146 vhost_dev_sync_region(dev, section, start_addr, end_addr,
d5970055
MT
147 reg->guest_phys_addr,
148 range_get_last(reg->guest_phys_addr,
149 reg->memory_size));
150 }
151 for (i = 0; i < dev->nvqs; ++i) {
152 struct vhost_virtqueue *vq = dev->vqs + i;
240e647a
LH
153
154 if (!vq->used_phys && !vq->used_size) {
155 continue;
156 }
157
345cc1cb
JW
158 if (vhost_dev_has_iommu(dev)) {
159 IOMMUTLBEntry iotlb;
160 hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
161 hwaddr phys, s, offset;
162
163 while (used_size) {
164 rcu_read_lock();
165 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
166 used_phys,
167 true,
168 MEMTXATTRS_UNSPECIFIED);
169 rcu_read_unlock();
170
171 if (!iotlb.target_as) {
172 qemu_log_mask(LOG_GUEST_ERROR, "translation "
173 "failure for used_iova %"PRIx64"\n",
174 used_phys);
175 return -EINVAL;
176 }
177
178 offset = used_phys & iotlb.addr_mask;
179 phys = iotlb.translated_addr + offset;
180
181 /*
182 * Distance from start of used ring until last byte of
183 * IOMMU page.
184 */
185 s = iotlb.addr_mask - offset;
186 /*
187 * Size of used ring, or of the part of it until end
188 * of IOMMU page. To avoid zero result, do the adding
189 * outside of MIN().
190 */
191 s = MIN(s, used_size - 1) + 1;
192
193 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
194 range_get_last(phys, s));
195 used_size -= s;
196 used_phys += s;
197 }
198 } else {
199 vhost_dev_sync_region(dev, section, start_addr,
200 end_addr, vq->used_phys,
201 range_get_last(vq->used_phys, vq->used_size));
202 }
d5970055
MT
203 }
204 return 0;
205}
206
04097f7c
AK
207static void vhost_log_sync(MemoryListener *listener,
208 MemoryRegionSection *section)
209{
210 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
211 memory_listener);
6b37a23d
MT
212 vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
213}
04097f7c 214
6b37a23d
MT
215static void vhost_log_sync_range(struct vhost_dev *dev,
216 hwaddr first, hwaddr last)
217{
218 int i;
219 /* FIXME: this is N^2 in number of sections */
220 for (i = 0; i < dev->n_mem_sections; ++i) {
221 MemoryRegionSection *section = &dev->mem_sections[i];
222 vhost_sync_dirty_bitmap(dev, section, first, last);
223 }
04097f7c
AK
224}
225
d5970055
MT
226static uint64_t vhost_get_log_size(struct vhost_dev *dev)
227{
228 uint64_t log_size = 0;
229 int i;
230 for (i = 0; i < dev->mem->nregions; ++i) {
231 struct vhost_memory_region *reg = dev->mem->regions + i;
232 uint64_t last = range_get_last(reg->guest_phys_addr,
233 reg->memory_size);
234 log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
235 }
d5970055
MT
236 return log_size;
237}
15324404 238
9b1d929a
TG
239static int vhost_set_backend_type(struct vhost_dev *dev,
240 VhostBackendType backend_type)
241{
242 int r = 0;
243
244 switch (backend_type) {
245#ifdef CONFIG_VHOST_KERNEL
246 case VHOST_BACKEND_TYPE_KERNEL:
247 dev->vhost_ops = &kernel_ops;
248 break;
249#endif
250#ifdef CONFIG_VHOST_USER
251 case VHOST_BACKEND_TYPE_USER:
252 dev->vhost_ops = &user_ops;
253 break;
254#endif
255#ifdef CONFIG_VHOST_VDPA
256 case VHOST_BACKEND_TYPE_VDPA:
257 dev->vhost_ops = &vdpa_ops;
258 break;
259#endif
260 default:
261 error_report("Unknown vhost backend type");
262 r = -1;
263 }
264
265 return r;
266}
267
15324404 268static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
309750fa 269{
0f2956f9 270 Error *err = NULL;
15324404
MAL
271 struct vhost_log *log;
272 uint64_t logsize = size * sizeof(*(log->log));
273 int fd = -1;
274
275 log = g_new0(struct vhost_log, 1);
276 if (share) {
277 log->log = qemu_memfd_alloc("vhost-log", logsize,
278 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
0f2956f9
MAL
279 &fd, &err);
280 if (err) {
281 error_report_err(err);
282 g_free(log);
283 return NULL;
284 }
15324404
MAL
285 memset(log->log, 0, logsize);
286 } else {
287 log->log = g_malloc0(logsize);
288 }
309750fa
JW
289
290 log->size = size;
291 log->refcnt = 1;
15324404 292 log->fd = fd;
309750fa
JW
293
294 return log;
295}
296
15324404 297static struct vhost_log *vhost_log_get(uint64_t size, bool share)
309750fa 298{
15324404
MAL
299 struct vhost_log *log = share ? vhost_log_shm : vhost_log;
300
301 if (!log || log->size != size) {
302 log = vhost_log_alloc(size, share);
303 if (share) {
304 vhost_log_shm = log;
305 } else {
306 vhost_log = log;
307 }
309750fa 308 } else {
15324404 309 ++log->refcnt;
309750fa
JW
310 }
311
15324404 312 return log;
309750fa
JW
313}
314
315static void vhost_log_put(struct vhost_dev *dev, bool sync)
316{
317 struct vhost_log *log = dev->log;
318
319 if (!log) {
320 return;
321 }
322
323 --log->refcnt;
324 if (log->refcnt == 0) {
325 /* Sync only the range covered by the old log */
326 if (dev->log_size && sync) {
327 vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
328 }
15324404 329
309750fa 330 if (vhost_log == log) {
15324404 331 g_free(log->log);
309750fa 332 vhost_log = NULL;
15324404
MAL
333 } else if (vhost_log_shm == log) {
334 qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
335 log->fd);
336 vhost_log_shm = NULL;
309750fa 337 }
15324404 338
309750fa
JW
339 g_free(log);
340 }
5c0ba1be
FF
341
342 dev->log = NULL;
343 dev->log_size = 0;
309750fa 344}
d5970055 345
15324404
MAL
346static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
347{
348 return dev->vhost_ops->vhost_requires_shm_log &&
349 dev->vhost_ops->vhost_requires_shm_log(dev);
350}
351
352static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
d5970055 353{
15324404 354 struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
309750fa 355 uint64_t log_base = (uintptr_t)log->log;
6b37a23d 356 int r;
6528499f 357
636f4ddd
MAL
358 /* inform backend of log switching, this must be done before
359 releasing the current log, to ensure no logging is lost */
9a78a5dd 360 r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
162bba7f 361 if (r < 0) {
5d33ae4b 362 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
162bba7f
MAL
363 }
364
309750fa 365 vhost_log_put(dev, true);
d5970055
MT
366 dev->log = log;
367 dev->log_size = size;
368}
369
c471ad0e 370static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
b897a474 371 hwaddr *plen, bool is_write)
c471ad0e
JW
372{
373 if (!vhost_dev_has_iommu(dev)) {
374 return cpu_physical_memory_map(addr, plen, is_write);
375 } else {
376 return (void *)(uintptr_t)addr;
377 }
378}
379
380static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
381 hwaddr len, int is_write,
382 hwaddr access_len)
383{
384 if (!vhost_dev_has_iommu(dev)) {
385 cpu_physical_memory_unmap(buffer, len, is_write, access_len);
386 }
387}
f1f9e6c5 388
0ca1fd2d
DDAG
/*
 * Check one vring part (desc/avail/used) against a memory region.
 * Returns 0 when the part is disjoint from the region or correctly
 * mapped inside it; -ENOMEM when only partially covered; -EBUSY when the
 * host mapping moved since the ring was set up.
 */
static int vhost_verify_ring_part_mapping(void *ring_hva,
                                          uint64_t ring_gpa,
                                          uint64_t ring_size,
                                          void *reg_hva,
                                          uint64_t reg_gpa,
                                          uint64_t reg_size)
{
    uint64_t hva_ring_offset;
    uint64_t ring_last = range_get_last(ring_gpa, ring_size);
    uint64_t reg_last = range_get_last(reg_gpa, reg_size);

    /* No overlap with this region: nothing to verify. */
    if (ring_last < reg_gpa || ring_gpa > reg_last) {
        return 0;
    }
    /* check that whole ring's is mapped */
    if (ring_last > reg_last) {
        return -ENOMEM;
    }
    /* check that ring's MemoryRegion wasn't replaced */
    hva_ring_offset = ring_gpa - reg_gpa;
    if (ring_hva != reg_hva + hva_ring_offset) {
        return -EBUSY;
    }

    return 0;
}
415
d5970055 416static int vhost_verify_ring_mappings(struct vhost_dev *dev,
0ca1fd2d
DDAG
417 void *reg_hva,
418 uint64_t reg_gpa,
419 uint64_t reg_size)
d5970055 420{
f1f9e6c5 421 int i, j;
8617343f 422 int r = 0;
f1f9e6c5
GK
423 const char *part_name[] = {
424 "descriptor table",
425 "available ring",
426 "used ring"
427 };
8617343f 428
aebbdbee
JW
429 if (vhost_dev_has_iommu(dev)) {
430 return 0;
431 }
432
f1f9e6c5 433 for (i = 0; i < dev->nvqs; ++i) {
d5970055 434 struct vhost_virtqueue *vq = dev->vqs + i;
d5970055 435
fb20fbb7
JH
436 if (vq->desc_phys == 0) {
437 continue;
438 }
439
f1f9e6c5 440 j = 0;
0ca1fd2d
DDAG
441 r = vhost_verify_ring_part_mapping(
442 vq->desc, vq->desc_phys, vq->desc_size,
443 reg_hva, reg_gpa, reg_size);
2fe45ec3 444 if (r) {
f1f9e6c5 445 break;
d5970055 446 }
f1f9e6c5
GK
447
448 j++;
0ca1fd2d 449 r = vhost_verify_ring_part_mapping(
9fac50c8 450 vq->avail, vq->avail_phys, vq->avail_size,
0ca1fd2d 451 reg_hva, reg_gpa, reg_size);
2fe45ec3 452 if (r) {
f1f9e6c5 453 break;
d5970055 454 }
f1f9e6c5
GK
455
456 j++;
0ca1fd2d 457 r = vhost_verify_ring_part_mapping(
9fac50c8 458 vq->used, vq->used_phys, vq->used_size,
0ca1fd2d 459 reg_hva, reg_gpa, reg_size);
2fe45ec3 460 if (r) {
f1f9e6c5 461 break;
d5970055 462 }
f1f9e6c5
GK
463 }
464
465 if (r == -ENOMEM) {
466 error_report("Unable to map %s for ring %d", part_name[j], i);
467 } else if (r == -EBUSY) {
468 error_report("%s relocated for ring %d", part_name[j], i);
d5970055 469 }
8617343f 470 return r;
d5970055
MT
471}
472
083b9bd7
AB
473/*
474 * vhost_section: identify sections needed for vhost access
475 *
476 * We only care about RAM sections here (where virtqueue and guest
477 * internals accessed by virtio might live). If we find one we still
478 * allow the backend to potentially filter it out of our list.
479 */
988a2775 480static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
af603142 481{
083b9bd7
AB
482 MemoryRegion *mr = section->mr;
483
484 if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
485 uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
486 uint8_t handled_dirty;
487
488 /*
489 * Kernel based vhost doesn't handle any block which is doing
490 * dirty-tracking other than migration for which it has
491 * specific logging support. However for TCG the kernel never
492 * gets involved anyway so we can also ignore it's
493 * self-modiying code detection flags. However a vhost-user
494 * client could still confuse a TCG guest if it re-writes
495 * executable memory that has already been translated.
496 */
497 handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
498 (1 << DIRTY_MEMORY_CODE);
499
500 if (dirty_mask & ~handled_dirty) {
501 trace_vhost_reject_section(mr->name, 1);
502 return false;
503 }
aa3c40f6 504
083b9bd7
AB
505 if (dev->vhost_ops->vhost_backend_mem_section_filter &&
506 !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
507 trace_vhost_reject_section(mr->name, 2);
508 return false;
509 }
988a2775 510
083b9bd7
AB
511 trace_vhost_section(mr->name);
512 return true;
513 } else {
514 trace_vhost_reject_section(mr->name, 3);
515 return false;
516 }
af603142
NB
517}
518
519static void vhost_begin(MemoryListener *listener)
520{
521 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
522 memory_listener);
c44317ef
DDAG
523 dev->tmp_sections = NULL;
524 dev->n_tmp_sections = 0;
af603142 525}
d5970055 526
af603142
NB
527static void vhost_commit(MemoryListener *listener)
528{
529 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
530 memory_listener);
c44317ef
DDAG
531 MemoryRegionSection *old_sections;
532 int n_old_sections;
af603142 533 uint64_t log_size;
ade6d081 534 size_t regions_size;
af603142 535 int r;
0ca1fd2d 536 int i;
ade6d081 537 bool changed = false;
af603142 538
ade6d081
DDAG
539 /* Note we can be called before the device is started, but then
540 * starting the device calls set_mem_table, so we need to have
541 * built the data structures.
542 */
c44317ef
DDAG
543 old_sections = dev->mem_sections;
544 n_old_sections = dev->n_mem_sections;
545 dev->mem_sections = dev->tmp_sections;
546 dev->n_mem_sections = dev->n_tmp_sections;
547
ade6d081
DDAG
548 if (dev->n_mem_sections != n_old_sections) {
549 changed = true;
550 } else {
551 /* Same size, lets check the contents */
3fc4a64c
DDAG
552 for (int i = 0; i < n_old_sections; i++) {
553 if (!MemoryRegionSection_eq(&old_sections[i],
554 &dev->mem_sections[i])) {
555 changed = true;
556 break;
557 }
558 }
af603142 559 }
ade6d081
DDAG
560
561 trace_vhost_commit(dev->started, changed);
562 if (!changed) {
c44317ef 563 goto out;
d5970055 564 }
ade6d081
DDAG
565
566 /* Rebuild the regions list from the new sections list */
567 regions_size = offsetof(struct vhost_memory, regions) +
568 dev->n_mem_sections * sizeof dev->mem->regions[0];
569 dev->mem = g_realloc(dev->mem, regions_size);
570 dev->mem->nregions = dev->n_mem_sections;
571 used_memslots = dev->mem->nregions;
572 for (i = 0; i < dev->n_mem_sections; i++) {
573 struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
574 struct MemoryRegionSection *mrs = dev->mem_sections + i;
575
576 cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
577 cur_vmr->memory_size = int128_get64(mrs->size);
578 cur_vmr->userspace_addr =
579 (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
580 mrs->offset_within_region;
581 cur_vmr->flags_padding = 0;
582 }
583
584 if (!dev->started) {
c44317ef 585 goto out;
af603142 586 }
d5970055 587
0ca1fd2d
DDAG
588 for (i = 0; i < dev->mem->nregions; i++) {
589 if (vhost_verify_ring_mappings(dev,
590 (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
591 dev->mem->regions[i].guest_phys_addr,
592 dev->mem->regions[i].memory_size)) {
593 error_report("Verify ring failure on region %d", i);
594 abort();
595 }
d5970055
MT
596 }
597
598 if (!dev->log_enabled) {
21e70425 599 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
162bba7f 600 if (r < 0) {
5d33ae4b 601 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
162bba7f 602 }
c44317ef 603 goto out;
d5970055
MT
604 }
605 log_size = vhost_get_log_size(dev);
606 /* We allocate an extra 4K bytes to log,
607 * to reduce the * number of reallocations. */
608#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
609 /* To log more, must increase log size before table update. */
610 if (dev->log_size < log_size) {
611 vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
612 }
21e70425 613 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
162bba7f 614 if (r < 0) {
5d33ae4b 615 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
162bba7f 616 }
d5970055
MT
617 /* To log less, can only decrease log size after table update. */
618 if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
619 vhost_dev_log_resize(dev, log_size);
620 }
c44317ef
DDAG
621
622out:
623 /* Deref the old list of sections, this must happen _after_ the
624 * vhost_set_mem_table to ensure the client isn't still using the
625 * section we're about to unref.
626 */
627 while (n_old_sections--) {
628 memory_region_unref(old_sections[n_old_sections].mr);
629 }
630 g_free(old_sections);
631 return;
632}
633
48d7c975
DDAG
634/* Adds the section data to the tmp_section structure.
635 * It relies on the listener calling us in memory address order
636 * and for each region (via the _add and _nop methods) to
637 * join neighbours.
638 */
639static void vhost_region_add_section(struct vhost_dev *dev,
640 MemoryRegionSection *section)
c44317ef 641{
48d7c975
DDAG
642 bool need_add = true;
643 uint64_t mrs_size = int128_get64(section->size);
644 uint64_t mrs_gpa = section->offset_within_address_space;
645 uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
646 section->offset_within_region;
c1ece84e 647 RAMBlock *mrs_rb = section->mr->ram_block;
48d7c975
DDAG
648
649 trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
650 mrs_host);
651
83475056 652 if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
76525114
DDAG
653 /* Round the section to it's page size */
654 /* First align the start down to a page boundary */
655 size_t mrs_page = qemu_ram_pagesize(mrs_rb);
656 uint64_t alignage = mrs_host & (mrs_page - 1);
657 if (alignage) {
658 mrs_host -= alignage;
659 mrs_size += alignage;
660 mrs_gpa -= alignage;
661 }
662 /* Now align the size up to a page boundary */
663 alignage = mrs_size & (mrs_page - 1);
664 if (alignage) {
665 mrs_size += mrs_page - alignage;
666 }
83475056
MT
667 trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
668 mrs_size, mrs_host);
76525114 669 }
c1ece84e 670
48d7c975
DDAG
671 if (dev->n_tmp_sections) {
672 /* Since we already have at least one section, lets see if
673 * this extends it; since we're scanning in order, we only
674 * have to look at the last one, and the FlatView that calls
675 * us shouldn't have overlaps.
676 */
677 MemoryRegionSection *prev_sec = dev->tmp_sections +
678 (dev->n_tmp_sections - 1);
679 uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
680 uint64_t prev_size = int128_get64(prev_sec->size);
681 uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
682 uint64_t prev_host_start =
683 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
684 prev_sec->offset_within_region;
685 uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
686
c1ece84e
DDAG
687 if (mrs_gpa <= (prev_gpa_end + 1)) {
688 /* OK, looks like overlapping/intersecting - it's possible that
689 * the rounding to page sizes has made them overlap, but they should
690 * match up in the same RAMBlock if they do.
691 */
692 if (mrs_gpa < prev_gpa_start) {
ff477614
DDAG
693 error_report("%s:Section '%s' rounded to %"PRIx64
694 " prior to previous '%s' %"PRIx64,
695 __func__, section->mr->name, mrs_gpa,
696 prev_sec->mr->name, prev_gpa_start);
c1ece84e
DDAG
697 /* A way to cleanly fail here would be better */
698 return;
699 }
700 /* Offset from the start of the previous GPA to this GPA */
701 size_t offset = mrs_gpa - prev_gpa_start;
702
703 if (prev_host_start + offset == mrs_host &&
704 section->mr == prev_sec->mr &&
705 (!dev->vhost_ops->vhost_backend_can_merge ||
706 dev->vhost_ops->vhost_backend_can_merge(dev,
48d7c975
DDAG
707 mrs_host, mrs_size,
708 prev_host_start, prev_size))) {
c1ece84e
DDAG
709 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
710 need_add = false;
711 prev_sec->offset_within_address_space =
712 MIN(prev_gpa_start, mrs_gpa);
713 prev_sec->offset_within_region =
714 MIN(prev_host_start, mrs_host) -
715 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
716 prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
717 mrs_host));
718 trace_vhost_region_add_section_merge(section->mr->name,
719 int128_get64(prev_sec->size),
720 prev_sec->offset_within_address_space,
721 prev_sec->offset_within_region);
722 } else {
e7b94a84
DDAG
723 /* adjoining regions are fine, but overlapping ones with
724 * different blocks/offsets shouldn't happen
725 */
726 if (mrs_gpa != prev_gpa_end + 1) {
727 error_report("%s: Overlapping but not coherent sections "
728 "at %"PRIx64,
729 __func__, mrs_gpa);
730 return;
731 }
c1ece84e 732 }
48d7c975
DDAG
733 }
734 }
735
736 if (need_add) {
737 ++dev->n_tmp_sections;
738 dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
739 dev->n_tmp_sections);
740 dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
741 /* The flatview isn't stable and we don't use it, making it NULL
742 * means we can memcmp the list.
743 */
744 dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
745 memory_region_ref(section->mr);
746 }
50c1e149
AK
747}
748
938eeb64
DDAG
749/* Used for both add and nop callbacks */
750static void vhost_region_addnop(MemoryListener *listener,
751 MemoryRegionSection *section)
04097f7c 752{
2817b260
AK
753 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
754 memory_listener);
755
988a2775 756 if (!vhost_section(dev, section)) {
c49450b9
AK
757 return;
758 }
48d7c975 759 vhost_region_add_section(dev, section);
04097f7c
AK
760}
761
375f74f4
JW
762static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
763{
764 struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
765 struct vhost_dev *hdev = iommu->hdev;
766 hwaddr iova = iotlb->iova + iommu->iommu_offset;
767
020e571b
MC
768 if (vhost_backend_invalidate_device_iotlb(hdev, iova,
769 iotlb->addr_mask + 1)) {
375f74f4
JW
770 error_report("Fail to invalidate device iotlb");
771 }
772}
773
774static void vhost_iommu_region_add(MemoryListener *listener,
775 MemoryRegionSection *section)
776{
777 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
778 iommu_listener);
779 struct vhost_iommu *iommu;
698feb5e 780 Int128 end;
805d4496 781 int iommu_idx;
388a86df 782 IOMMUMemoryRegion *iommu_mr;
958ec334 783 int ret;
375f74f4
JW
784
785 if (!memory_region_is_iommu(section->mr)) {
786 return;
787 }
788
388a86df
TB
789 iommu_mr = IOMMU_MEMORY_REGION(section->mr);
790
375f74f4 791 iommu = g_malloc0(sizeof(*iommu));
698feb5e
PX
792 end = int128_add(int128_make64(section->offset_within_region),
793 section->size);
794 end = int128_sub(end, int128_one());
cb1efcf4
PM
795 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
796 MEMTXATTRS_UNSPECIFIED);
698feb5e 797 iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
b68ba1ca 798 IOMMU_NOTIFIER_DEVIOTLB_UNMAP,
698feb5e 799 section->offset_within_region,
cb1efcf4
PM
800 int128_get64(end),
801 iommu_idx);
375f74f4
JW
802 iommu->mr = section->mr;
803 iommu->iommu_offset = section->offset_within_address_space -
804 section->offset_within_region;
805 iommu->hdev = dev;
958ec334
PX
806 ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL);
807 if (ret) {
808 /*
809 * Some vIOMMUs do not support dev-iotlb yet. If so, try to use the
810 * UNMAP legacy message
811 */
812 iommu->n.notifier_flags = IOMMU_NOTIFIER_UNMAP;
813 memory_region_register_iommu_notifier(section->mr, &iommu->n,
814 &error_fatal);
815 }
375f74f4
JW
816 QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
817 /* TODO: can replay help performance here? */
818}
819
820static void vhost_iommu_region_del(MemoryListener *listener,
821 MemoryRegionSection *section)
822{
823 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
824 iommu_listener);
825 struct vhost_iommu *iommu;
826
827 if (!memory_region_is_iommu(section->mr)) {
828 return;
829 }
830
831 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
698feb5e
PX
832 if (iommu->mr == section->mr &&
833 iommu->n.start == section->offset_within_region) {
375f74f4
JW
834 memory_region_unregister_iommu_notifier(iommu->mr,
835 &iommu->n);
836 QLIST_REMOVE(iommu, iommu_next);
837 g_free(iommu);
838 break;
839 }
840 }
841}
842
d5970055
MT
843static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
844 struct vhost_virtqueue *vq,
845 unsigned idx, bool enable_log)
846{
b4ab225c
CL
847 struct vhost_vring_addr addr;
848 int r;
849 memset(&addr, 0, sizeof(struct vhost_vring_addr));
850
851 if (dev->vhost_ops->vhost_vq_get_addr) {
852 r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
853 if (r < 0) {
5d33ae4b
RK
854 VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
855 return r;
b4ab225c
CL
856 }
857 } else {
858 addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
859 addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
860 addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
861 }
862 addr.index = idx;
863 addr.log_guest_addr = vq->used_phys;
864 addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
865 r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
d5970055 866 if (r < 0) {
5d33ae4b 867 VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
d5970055 868 }
5d33ae4b 869 return r;
d5970055
MT
870}
871
c471ad0e
JW
872static int vhost_dev_set_features(struct vhost_dev *dev,
873 bool enable_log)
d5970055
MT
874{
875 uint64_t features = dev->acked_features;
876 int r;
877 if (enable_log) {
9a2ba823 878 features |= 0x1ULL << VHOST_F_LOG_ALL;
d5970055 879 }
f7ef7e6e
JW
880 if (!vhost_dev_has_iommu(dev)) {
881 features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
882 }
7a471694
CL
883 if (dev->vhost_ops->vhost_force_iommu) {
884 if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
885 features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
886 }
887 }
21e70425 888 r = dev->vhost_ops->vhost_set_features(dev, features);
c6409692 889 if (r < 0) {
5d33ae4b 890 VHOST_OPS_DEBUG(r, "vhost_set_features failed");
b37556ed
JW
891 goto out;
892 }
893 if (dev->vhost_ops->vhost_set_backend_cap) {
894 r = dev->vhost_ops->vhost_set_backend_cap(dev);
895 if (r < 0) {
5d33ae4b 896 VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
b37556ed
JW
897 goto out;
898 }
c6409692 899 }
b37556ed
JW
900
901out:
5d33ae4b 902 return r;
d5970055
MT
903}
904
905static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
906{
162bba7f 907 int r, i, idx;
1e5a050f
DS
908 hwaddr addr;
909
d5970055
MT
910 r = vhost_dev_set_features(dev, enable_log);
911 if (r < 0) {
912 goto err_features;
913 }
914 for (i = 0; i < dev->nvqs; ++i) {
25a2a920 915 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
1e5a050f
DS
916 addr = virtio_queue_get_desc_addr(dev->vdev, idx);
917 if (!addr) {
918 /*
919 * The queue might not be ready for start. If this
920 * is the case there is no reason to continue the process.
921 * The similar logic is used by the vhost_virtqueue_start()
922 * routine.
923 */
924 continue;
925 }
25a2a920 926 r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
d5970055
MT
927 enable_log);
928 if (r < 0) {
929 goto err_vq;
930 }
931 }
932 return 0;
933err_vq:
934 for (; i >= 0; --i) {
25a2a920 935 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
9ce305c8
NX
936 addr = virtio_queue_get_desc_addr(dev->vdev, idx);
937 if (!addr) {
938 continue;
939 }
162bba7f
MAL
940 vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
941 dev->log_enabled);
d5970055 942 }
162bba7f 943 vhost_dev_set_features(dev, dev->log_enabled);
d5970055
MT
944err_features:
945 return r;
946}
947
/*
 * Enable or disable dirty-page logging for migration on @dev.
 *
 * Called from the log_global_start/stop MemoryListener callbacks.
 * Returns 0 on success (including the no-op and not-started cases),
 * or a negative errno from the backend.
 */
static int vhost_migration_log(MemoryListener *listener, bool enable)
{
    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                         memory_listener);
    int r;
    /* Already in the requested state: nothing to do. */
    if (enable == dev->log_enabled) {
        return 0;
    }
    /* Device not running yet: just record the desired state. */
    if (!dev->started) {
        dev->log_enabled = enable;
        return 0;
    }

    r = 0;
    if (!enable) {
        r = vhost_dev_set_log(dev, false);
        if (r < 0) {
            goto check_dev_state;
        }
        /* Drop our reference to the (possibly shared) log buffer. */
        vhost_log_put(dev, false);
    } else {
        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
        r = vhost_dev_set_log(dev, true);
        if (r < 0) {
            goto check_dev_state;
        }
    }

check_dev_state:
    dev->log_enabled = enable;
    /*
     * vhost-user-* devices could change their state during log
     * initialization due to disconnect. So check dev state after
     * vhost communication.
     */
    if (!dev->started) {
        /*
         * Since device is in the stopped state, it is okay for
         * migration. Return success.
         */
        r = 0;
    }
    if (r) {
        /* An error occurred. */
        dev->log_enabled = false;
    }

    return r;
}
997
04097f7c
AK
998static void vhost_log_global_start(MemoryListener *listener)
999{
1000 int r;
1001
1002 r = vhost_migration_log(listener, true);
1003 if (r < 0) {
1004 abort();
1005 }
1006}
1007
1008static void vhost_log_global_stop(MemoryListener *listener)
1009{
1010 int r;
1011
1012 r = vhost_migration_log(listener, false);
1013 if (r < 0) {
1014 abort();
1015 }
1016}
1017
/* Per-section log start hook; only global start/stop is supported today. */
static void vhost_log_start(MemoryListener *listener,
                            MemoryRegionSection *section,
                            int old, int new)
{
    /* FIXME: implement */
}
1024
/* Per-section log stop hook; only global start/stop is supported today. */
static void vhost_log_stop(MemoryListener *listener,
                           MemoryRegionSection *section,
                           int old, int new)
{
    /* FIXME: implement */
}
1031
46f70ff1
GK
1032/* The vhost driver natively knows how to handle the vrings of non
1033 * cross-endian legacy devices and modern devices. Only legacy devices
1034 * exposed to a bi-endian guest may require the vhost driver to use a
1035 * specific endianness.
1036 */
a122ab24
GK
1037static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
1038{
e5848123
GK
1039 if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1040 return false;
1041 }
e03b5686 1042#if HOST_BIG_ENDIAN
46f70ff1 1043 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
a122ab24 1044#else
46f70ff1 1045 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
a122ab24 1046#endif
a122ab24
GK
1047}
1048
04b7a152
GK
1049static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
1050 bool is_big_endian,
1051 int vhost_vq_index)
1052{
5d33ae4b 1053 int r;
04b7a152
GK
1054 struct vhost_vring_state s = {
1055 .index = vhost_vq_index,
1056 .num = is_big_endian
1057 };
1058
5d33ae4b
RK
1059 r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
1060 if (r < 0) {
1061 VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
04b7a152 1062 }
5d33ae4b 1063 return r;
04b7a152
GK
1064}
1065
c471ad0e
JW
1066static int vhost_memory_region_lookup(struct vhost_dev *hdev,
1067 uint64_t gpa, uint64_t *uaddr,
1068 uint64_t *len)
1069{
1070 int i;
1071
1072 for (i = 0; i < hdev->mem->nregions; i++) {
1073 struct vhost_memory_region *reg = hdev->mem->regions + i;
1074
1075 if (gpa >= reg->guest_phys_addr &&
1076 reg->guest_phys_addr + reg->memory_size > gpa) {
1077 *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
1078 *len = reg->guest_phys_addr + reg->memory_size - gpa;
1079 return 0;
1080 }
1081 }
1082
1083 return -EFAULT;
1084}
1085
/*
 * Service an IOTLB miss reported by the vhost backend: look up the IOVA
 * in the device's DMA address space and push the resulting translation
 * down to the backend. Returns 0 on success, -EFAULT if no valid
 * mapping exists, or the backend's error from the IOTLB update.
 */
int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;
    int ret = -EFAULT;

    /* IOMMU translations are protected by RCU. */
    RCU_READ_LOCK_GUARD();

    trace_vhost_iotlb_miss(dev, 1);

    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
                                          iova, write,
                                          MEMTXATTRS_UNSPECIFIED);
    if (iotlb.target_as != NULL) {
        ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                         &uaddr, &len);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 3);
            error_report("Fail to lookup the translated address "
                         "%"PRIx64, iotlb.translated_addr);
            goto out;
        }

        /* Clamp to the IOTLB entry size and align iova to its page. */
        len = MIN(iotlb.addr_mask + 1, len);
        iova = iova & ~iotlb.addr_mask;

        ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
                                                len, iotlb.perm);
        if (ret) {
            trace_vhost_iotlb_miss(dev, 4);
            error_report("Fail to update device iotlb");
            goto out;
        }
    }

    trace_vhost_iotlb_miss(dev, 2);

out:
    return ret;
}
1126
/*
 * Start one virtqueue in the vhost backend: program ring size, base
 * index and (for cross-endian legacy devices) endianness, map the
 * desc/avail/used rings into our address space, hand their addresses to
 * the backend, and wire up the kick eventfd. Returns 0 on success or a
 * negative errno; on failure, all mappings made so far are unwound.
 */
int vhost_virtqueue_start(struct vhost_dev *dev,
                          struct VirtIODevice *vdev,
                          struct vhost_virtqueue *vq,
                          unsigned idx)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    hwaddr s, l, a;
    int r;
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_file file = {
        .index = vhost_vq_index
    };
    struct vhost_vring_state state = {
        .index = vhost_vq_index
    };
    struct VirtQueue *vvq = virtio_get_queue(vdev, idx);

    a = virtio_queue_get_desc_addr(vdev, idx);
    if (a == 0) {
        /* Queue might not be ready for start */
        return 0;
    }

    vq->num = state.num = virtio_queue_get_num(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
        return r;
    }

    /* Resume processing where the guest left off. */
    state.num = virtio_queue_get_last_avail_idx(vdev, idx);
    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
        return r;
    }

    if (vhost_needs_vring_endian(vdev)) {
        r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                    virtio_is_big_endian(vdev),
                                                    vhost_vq_index);
        if (r) {
            return r;
        }
    }

    /* Map the three vring areas; l must come back equal to the
     * requested size s, otherwise the mapping was partial. */
    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a;
    vq->desc = vhost_memory_map(dev, a, &l, false);
    if (!vq->desc || l != s) {
        r = -ENOMEM;
        goto fail_alloc_desc;
    }
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, false);
    if (!vq->avail || l != s) {
        r = -ENOMEM;
        goto fail_alloc_avail;
    }
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, true);
    if (!vq->used || l != s) {
        r = -ENOMEM;
        goto fail_alloc_used;
    }

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    if (r < 0) {
        goto fail_alloc;
    }

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
        goto fail_kick;
    }

    /* Clear and discard previous events if any. */
    event_notifier_test_and_clear(&vq->masked_notifier);

    /* Init vring in unmasked state, unless guest_notifier_mask
     * will do it later.
     */
    if (!vdev->use_guest_notifier_mask) {
        /* TODO: check and handle errors. */
        vhost_virtqueue_mask(dev, vdev, idx, false);
    }

    if (k->query_guest_notifiers &&
        k->query_guest_notifiers(qbus->parent) &&
        virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
        /* No vector assigned: detach the call eventfd entirely. */
        file.fd = -1;
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            goto fail_vector;
        }
    }

    return 0;

fail_vector:
fail_kick:
fail_alloc:
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       0, 0);
fail_alloc_used:
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, 0);
fail_alloc_avail:
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, 0);
fail_alloc_desc:
    return r;
}
1246
/*
 * Stop one virtqueue: read back the last-avail index from the backend
 * (so the virtio core can resume correctly), restore native endianness
 * for legacy cross-endian devices, and unmap the vring areas mapped by
 * vhost_virtqueue_start().
 */
void vhost_virtqueue_stop(struct vhost_dev *dev,
                          struct VirtIODevice *vdev,
                          struct vhost_virtqueue *vq,
                          unsigned idx)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
    struct vhost_vring_state state = {
        .index = vhost_vq_index,
    };
    int r;

    if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
        /* Don't stop the virtqueue which might have not been started */
        return;
    }

    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
        /* Connection to the backend is broken, so let's sync internal
         * last avail idx to the device used idx.
         */
        virtio_queue_restore_last_avail_idx(vdev, idx);
    } else {
        virtio_queue_set_last_avail_idx(vdev, idx, state.num);
    }
    virtio_queue_invalidate_signalled_used(vdev, idx);
    virtio_queue_update_used_idx(vdev, idx);

    /* In the cross-endian case, we need to reset the vring endianness to
     * native as legacy devices expect so by default.
     */
    if (vhost_needs_vring_endian(vdev)) {
        vhost_virtqueue_set_vring_endian_legacy(dev,
                                                !virtio_is_big_endian(vdev),
                                                vhost_vq_index);
    }

    /* Only the used ring was written by the backend (is_write=1). */
    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
                       1, virtio_queue_get_used_size(vdev, idx));
    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
                       0, virtio_queue_get_avail_size(vdev, idx));
    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
                       0, virtio_queue_get_desc_size(vdev, idx));
}
1292
69e87b32
JW
1293static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1294 int n, uint32_t timeout)
1295{
1296 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1297 struct vhost_vring_state state = {
1298 .index = vhost_vq_index,
1299 .num = timeout,
1300 };
1301 int r;
1302
1303 if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1304 return -EINVAL;
1305 }
1306
1307 r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1308 if (r) {
5d33ae4b 1309 VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
69e87b32
JW
1310 return r;
1311 }
1312
1313 return 0;
1314}
1315
ae50ae0b
KK
1316static void vhost_virtqueue_error_notifier(EventNotifier *n)
1317{
1318 struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
1319 error_notifier);
1320 struct vhost_dev *dev = vq->dev;
1321 int index = vq - dev->vqs;
1322
1323 if (event_notifier_test_and_clear(n) && dev->vdev) {
1324 VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d",
1325 dev->vq_index + index);
1326 }
1327}
1328
/*
 * One-time init of a vhost_virtqueue: create the masked-call notifier,
 * register it with the backend, and (if supported) set up the vring
 * error notifier. Returns 0 or a negative errno; on failure all
 * notifiers created here are cleaned up.
 */
static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_wfd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
        goto fail_call;
    }

    vq->dev = dev;

    /* The error notifier is optional: only wired when the backend
     * implements vhost_set_vring_err. */
    if (dev->vhost_ops->vhost_set_vring_err) {
        r = event_notifier_init(&vq->error_notifier, 0);
        if (r < 0) {
            goto fail_call;
        }

        file.fd = event_notifier_get_fd(&vq->error_notifier);
        r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
            goto fail_err;
        }

        event_notifier_set_handler(&vq->error_notifier,
                                   vhost_virtqueue_error_notifier);
    }

    return 0;

fail_err:
    event_notifier_cleanup(&vq->error_notifier);
fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}
1375
/* Undo vhost_virtqueue_init(): release the notifiers it created.
 * The error notifier only exists when the backend supports
 * vhost_set_vring_err, mirroring the condition in init. */
static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
{
    event_notifier_cleanup(&vq->masked_notifier);
    if (vq->dev->vhost_ops->vhost_set_vring_err) {
        event_notifier_set_handler(&vq->error_notifier, NULL);
        event_notifier_cleanup(&vq->error_notifier);
    }
}
1384
/*
 * Initialize a vhost device: bind the backend, take ownership, query
 * features, init all virtqueues, optionally configure busy-polling,
 * install the memory listeners, and set up a migration blocker when
 * dirty logging is unavailable. Returns 0 or a negative errno (with
 * @errp set); on failure, everything initialized so far is torn down
 * via vhost_dev_cleanup().
 */
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout,
                   Error **errp)
{
    uint64_t features;
    int i, r, n_initialized_vqs = 0;

    hdev->vdev = NULL;
    hdev->migration_blocker = NULL;

    r = vhost_set_backend_type(hdev, backend_type);
    assert(r >= 0);

    r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
    if (r < 0) {
        goto fail;
    }

    r = hdev->vhost_ops->vhost_set_owner(hdev);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_set_owner failed");
        goto fail;
    }

    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
    if (r < 0) {
        error_setg_errno(errp, -r, "vhost_get_features failed");
        goto fail;
    }

    /* n_initialized_vqs tracks how many vqs to clean up on failure. */
    for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        if (r < 0) {
            error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
            goto fail;
        }
    }

    if (busyloop_timeout) {
        for (i = 0; i < hdev->nvqs; ++i) {
            r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
                                                     busyloop_timeout);
            if (r < 0) {
                error_setg_errno(errp, -r, "Failed to set busyloop timeout");
                goto fail_busyloop;
            }
        }
    }

    hdev->features = features;

    hdev->memory_listener = (MemoryListener) {
        .name = "vhost",
        .begin = vhost_begin,
        .commit = vhost_commit,
        .region_add = vhost_region_addnop,
        .region_nop = vhost_region_addnop,
        .log_start = vhost_log_start,
        .log_stop = vhost_log_stop,
        .log_sync = vhost_log_sync,
        .log_global_start = vhost_log_global_start,
        .log_global_stop = vhost_log_global_stop,
        .priority = 10
    };

    hdev->iommu_listener = (MemoryListener) {
        .name = "vhost-iommu",
        .region_add = vhost_iommu_region_add,
        .region_del = vhost_iommu_region_del,
    };

    /* Migration requires dirty logging (VHOST_F_LOG_ALL) and, for
     * shared logs, working memfd allocation. */
    if (hdev->migration_blocker == NULL) {
        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
        } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
        }
    }

    if (hdev->migration_blocker != NULL) {
        r = migrate_add_blocker(hdev->migration_blocker, errp);
        if (r < 0) {
            error_free(hdev->migration_blocker);
            goto fail_busyloop;
        }
    }

    hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
    hdev->n_mem_sections = 0;
    hdev->mem_sections = NULL;
    hdev->log = NULL;
    hdev->log_size = 0;
    hdev->log_enabled = false;
    hdev->started = false;
    memory_listener_register(&hdev->memory_listener, &address_space_memory);
    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);

    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
        error_setg(errp, "vhost backend memory slots limit is less"
                   " than current number of present memory slots");
        r = -EINVAL;
        goto fail_busyloop;
    }

    return 0;

fail_busyloop:
    if (busyloop_timeout) {
        while (--i >= 0) {
            vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
        }
    }
fail:
    hdev->nvqs = n_initialized_vqs;
    vhost_dev_cleanup(hdev);
    return r;
}
1504
/*
 * Tear down a vhost device. Safe to call on a partially-initialized
 * device (vhost_dev_init() uses it on its failure path): hdev->nvqs
 * reflects only the successfully initialized virtqueues, and hdev->mem
 * doubles as the "listener was registered" marker. Leaves *hdev zeroed.
 */
void vhost_dev_cleanup(struct vhost_dev *hdev)
{
    int i;

    trace_vhost_dev_cleanup(hdev);

    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_cleanup(hdev->vqs + i);
    }
    if (hdev->mem) {
        /* those are only safe after successful init */
        memory_listener_unregister(&hdev->memory_listener);
        QLIST_REMOVE(hdev, entry);
    }
    if (hdev->migration_blocker) {
        migrate_del_blocker(hdev->migration_blocker);
        error_free(hdev->migration_blocker);
    }
    g_free(hdev->mem);
    g_free(hdev->mem_sections);
    if (hdev->vhost_ops) {
        hdev->vhost_ops->vhost_backend_cleanup(hdev);
    }
    /* The dirty log must have been released by vhost_dev_stop(). */
    assert(!hdev->log);

    memset(hdev, 0, sizeof(struct vhost_dev));
}
1532
/*
 * Return the first @nvqs host notifiers to QEMU's control. Shared by
 * vhost_dev_disable_notifiers() and the partial-failure rollback in
 * vhost_dev_enable_notifiers().
 */
static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
                                             VirtIODevice *vdev,
                                             unsigned int nvqs)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         false);
        if (r < 0) {
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
        }
        /* Disabling a notifier is not expected to fail. */
        assert(r >= 0);
    }

    /*
     * The transaction expects the ioeventfds to be open when it
     * commits. Do it now, before the cleanup loop.
     */
    memory_region_transaction_commit();

    for (i = 0; i < nvqs; ++i) {
        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
    }
    virtio_device_release_ioeventfd(vdev);
}
1566
/* Stop processing guest IO notifications in qemu.
 * Start processing them in vhost in kernel.
 *
 * Returns 0 on success or a negative errno; on partial failure the
 * notifiers already handed over are rolled back.
 */
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
    int i, r;

    /* We will pass the notifiers to the kernel, make sure that QEMU
     * doesn't interfere.
     */
    r = virtio_device_grab_ioeventfd(vdev);
    if (r < 0) {
        error_report("binding does not support host notifiers");
        return r;
    }

    /*
     * Batch all the host notifiers in a single transaction to avoid
     * quadratic time complexity in address_space_update_ioeventfds().
     */
    memory_region_transaction_begin();

    for (i = 0; i < hdev->nvqs; ++i) {
        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                         true);
        if (r < 0) {
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
            /* Commit first: the rollback opens its own transaction. */
            memory_region_transaction_commit();
            vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
            return r;
        }
    }

    memory_region_transaction_commit();

    return 0;
}
1605
/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 * This might actually run the qemu handlers right away,
 * so virtio in qemu must be completely setup when this is called.
 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    /* Delegates to the shared helper, covering all nvqs. */
    vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
}
1615
f56a1247
MT
1616/* Test and clear event pending status.
1617 * Should be called after unmask to avoid losing events.
1618 */
1619bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1620{
a9f98bb5 1621 struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
a9f98bb5 1622 assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
f56a1247
MT
1623 return event_notifier_test_and_clear(&vq->masked_notifier);
1624}
1625
1626/* Mask/unmask events from this vq. */
1627void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1628 bool mask)
1629{
1630 struct VirtQueue *vvq = virtio_get_queue(vdev, n);
a9f98bb5 1631 int r, index = n - hdev->vq_index;
fc57fd99 1632 struct vhost_vring_file file;
f56a1247 1633
8695de0f
MAL
1634 /* should only be called after backend is connected */
1635 assert(hdev->vhost_ops);
1636
f56a1247 1637 if (mask) {
5669655a 1638 assert(vdev->use_guest_notifier_mask);
ff5eb77b 1639 file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
f56a1247 1640 } else {
ff5eb77b 1641 file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
f56a1247 1642 }
fc57fd99 1643
21e70425
MAL
1644 file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1645 r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
162bba7f 1646 if (r < 0) {
f9a09ca3
CL
1647 error_report("vhost_set_vring_call failed %d", -r);
1648 }
1649}
1650
1651bool vhost_config_pending(struct vhost_dev *hdev)
1652{
1653 assert(hdev->vhost_ops);
1654 if ((hdev->started == false) ||
1655 (hdev->vhost_ops->vhost_set_config_call == NULL)) {
1656 return false;
1657 }
1658
1659 EventNotifier *notifier =
1660 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1661 return event_notifier_test_and_clear(notifier);
1662}
1663
1664void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
1665{
1666 int fd;
1667 int r;
1668 EventNotifier *notifier =
1669 &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1670 EventNotifier *config_notifier = &vdev->config_notifier;
1671 assert(hdev->vhost_ops);
1672
1673 if ((hdev->started == false) ||
1674 (hdev->vhost_ops->vhost_set_config_call == NULL)) {
1675 return;
1676 }
1677 if (mask) {
1678 assert(vdev->use_guest_notifier_mask);
1679 fd = event_notifier_get_fd(notifier);
1680 } else {
1681 fd = event_notifier_get_fd(config_notifier);
1682 }
1683 r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
1684 if (r < 0) {
1685 error_report("vhost_set_config_call failed %d", -r);
1686 }
1687}
1688
1689static void vhost_stop_config_intr(struct vhost_dev *dev)
1690{
1691 int fd = -1;
1692 assert(dev->vhost_ops);
1693 if (dev->vhost_ops->vhost_set_config_call) {
1694 dev->vhost_ops->vhost_set_config_call(dev, fd);
1695 }
1696}
1697
1698static void vhost_start_config_intr(struct vhost_dev *dev)
1699{
1700 int r;
1701
1702 assert(dev->vhost_ops);
1703 int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
1704 if (dev->vhost_ops->vhost_set_config_call) {
1705 r = dev->vhost_ops->vhost_set_config_call(dev, fd);
1706 if (!r) {
1707 event_notifier_set(&dev->vdev->config_notifier);
1708 }
162bba7f 1709 }
f56a1247
MT
1710}
1711
9a2ba823
CH
1712uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1713 uint64_t features)
2e6d46d7
NN
1714{
1715 const int *bit = feature_bits;
1716 while (*bit != VHOST_INVALID_FEATURE_BIT) {
9a2ba823 1717 uint64_t bit_mask = (1ULL << *bit);
2e6d46d7
NN
1718 if (!(hdev->features & bit_mask)) {
1719 features &= ~bit_mask;
1720 }
1721 bit++;
1722 }
1723 return features;
1724}
1725
1726void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
9a2ba823 1727 uint64_t features)
2e6d46d7
NN
1728{
1729 const int *bit = feature_bits;
1730 while (*bit != VHOST_INVALID_FEATURE_BIT) {
9a2ba823 1731 uint64_t bit_mask = (1ULL << *bit);
2e6d46d7
NN
1732 if (features & bit_mask) {
1733 hdev->acked_features |= bit_mask;
1734 }
1735 bit++;
1736 }
1737}
1738
4c3e257b 1739int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
50de5138 1740 uint32_t config_len, Error **errp)
4c3e257b
CL
1741{
1742 assert(hdev->vhost_ops);
1743
1744 if (hdev->vhost_ops->vhost_get_config) {
66647ed4
MA
1745 return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
1746 errp);
4c3e257b
CL
1747 }
1748
50de5138 1749 error_setg(errp, "vhost_get_config not implemented");
5d33ae4b 1750 return -ENOSYS;
4c3e257b
CL
1751}
1752
1753int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1754 uint32_t offset, uint32_t size, uint32_t flags)
1755{
1756 assert(hdev->vhost_ops);
1757
1758 if (hdev->vhost_ops->vhost_set_config) {
1759 return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1760 size, flags);
1761 }
1762
5d33ae4b 1763 return -ENOSYS;
4c3e257b
CL
1764}
1765
/* Install the device-config change callbacks used by the backend. */
void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}
1771
5ad204bf
XY
1772void vhost_dev_free_inflight(struct vhost_inflight *inflight)
1773{
0ac2e635 1774 if (inflight && inflight->addr) {
5ad204bf
XY
1775 qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
1776 inflight->addr = NULL;
1777 inflight->fd = -1;
1778 }
1779}
1780
/*
 * Replace the in-flight region with a freshly allocated sealed memfd of
 * @new_size bytes. The old region is freed only after the new
 * allocation succeeded, so @inflight stays valid on failure.
 * Returns 0 or -ENOMEM.
 */
static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
                                     uint64_t new_size)
{
    Error *err = NULL;
    int fd = -1;
    void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
                                  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
                                  &fd, &err);

    if (err) {
        error_report_err(err);
        return -ENOMEM;
    }

    vhost_dev_free_inflight(inflight);
    inflight->offset = 0;
    inflight->addr = addr;
    inflight->fd = fd;
    inflight->size = new_size;

    return 0;
}
1803
1804void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1805{
1806 if (inflight->addr) {
1807 qemu_put_be64(f, inflight->size);
1808 qemu_put_be16(f, inflight->queue_size);
1809 qemu_put_buffer(f, inflight->addr, inflight->size);
1810 } else {
1811 qemu_put_be64(f, 0);
1812 }
1813}
1814
1815int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1816{
1817 uint64_t size;
1818
1819 size = qemu_get_be64(f);
1820 if (!size) {
1821 return 0;
1822 }
1823
1824 if (inflight->size != size) {
5d33ae4b
RK
1825 int ret = vhost_dev_resize_inflight(inflight, size);
1826 if (ret < 0) {
1827 return ret;
5ad204bf
XY
1828 }
1829 }
1830 inflight->queue_size = qemu_get_be16(f);
1831
1832 qemu_get_buffer(f, inflight->addr, size);
1833
1834 return 0;
1835}
1836
1b0063b3
JY
1837int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
1838{
1839 int r;
1840
1841 if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
1842 hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
1843 return 0;
1844 }
1845
1846 hdev->vdev = vdev;
1847
1848 r = vhost_dev_set_features(hdev, hdev->log_enabled);
1849 if (r < 0) {
5d33ae4b 1850 VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
1b0063b3
JY
1851 return r;
1852 }
1853
1854 return 0;
1855}
1856
5ad204bf
XY
1857int vhost_dev_set_inflight(struct vhost_dev *dev,
1858 struct vhost_inflight *inflight)
1859{
1860 int r;
1861
1862 if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
1863 r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
1864 if (r) {
5d33ae4b
RK
1865 VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
1866 return r;
5ad204bf
XY
1867 }
1868 }
1869
1870 return 0;
1871}
1872
1873int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
1874 struct vhost_inflight *inflight)
1875{
1876 int r;
1877
1878 if (dev->vhost_ops->vhost_get_inflight_fd) {
1879 r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
1880 if (r) {
5d33ae4b
RK
1881 VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
1882 return r;
5ad204bf
XY
1883 }
1884 }
1885
1886 return 0;
1887}
1888
4daa5054
SG
1889static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
1890{
1891 if (!hdev->vhost_ops->vhost_set_vring_enable) {
1892 return 0;
1893 }
1894
1895 /*
1896 * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
1897 * been negotiated, the rings start directly in the enabled state, and
1898 * .vhost_set_vring_enable callback will fail since
1899 * VHOST_USER_SET_VRING_ENABLE is not supported.
1900 */
1901 if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
1902 !virtio_has_feature(hdev->backend_features,
1903 VHOST_USER_F_PROTOCOL_FEATURES)) {
1904 return 0;
1905 }
1906
1907 return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
1908}
1909
/* Host notifiers must be enabled at this point.
 *
 * Bring the whole vhost device up: negotiate features, push the memory
 * table, start every virtqueue, set up the masked config notifier and
 * (if logging) the dirty log, optionally enable rings and call the
 * backend's dev_start, then prime the IOTLB and config interrupt.
 * Returns 0 or a negative errno; on failure everything started so far
 * is unwound in reverse order.
 */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for IOTLB to work correctly,
         * vhost-kernel code requires for this.*/
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    /* i is the first vq that failed to start; stop the ones before it. */
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}
2022
/* Host notifiers must be enabled at this point. */

/*
 * Stop a previously started vhost device and unwind everything
 * vhost_dev_start() set up, in reverse order.
 *
 * @hdev:   the vhost device to stop (backend must still be connected)
 * @vdev:   the VirtIODevice this vhost device is attached to
 * @vrings: when true, also disable the vrings via
 *          vhost_dev_set_vring_enable() (mirrors the @vrings argument
 *          that was passed to vhost_dev_start())
 *
 * On return hdev->started and vdev->vhost_started are false and
 * hdev->vdev is cleared.
 */
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    /*
     * Drain any pending config-change notifications so a stale event
     * cannot fire after the device is stopped.
     */
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(&vdev->config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    /* Tell the backend to stop first, if it supports the op. */
    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    /* Stop every virtqueue this device owns. */
    for (i = 0; i < hdev->nvqs; ++i) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }
    /* Optional backend hook to reset device status after the vqs stop. */
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        /* Detach the IOTLB callback before dropping the IOMMU listener. */
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    /* Release the dirty log; 'true' syncs remaining dirty pages first. */
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
}
950d94ba
MAL
2064
2065int vhost_net_set_backend(struct vhost_dev *hdev,
2066 struct vhost_vring_file *file)
2067{
2068 if (hdev->vhost_ops->vhost_net_set_backend) {
2069 return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
2070 }
2071
5d33ae4b 2072 return -ENOSYS;
950d94ba 2073}