/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/pci.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/migration.h"
#include "migration/misc.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "sysemu/tpm.h"
VFIODeviceList vfio_device_list =
    QLIST_HEAD_INITIALIZER(vfio_device_list);
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
int vfio_kvm_device_fd = -1;
#endif
/*
 * Device state interfaces
 */
bool vfio_mig_active(void)
{
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_device_list)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
        if (vbasedev->migration_blocker) {
            return false;
        }
    }
    return true;
}
static Error *multiple_devices_migration_blocker;
/*
 * Multiple devices migration is allowed only if all devices support P2P
 * migration. Single device migration is allowed regardless of P2P migration
 * support.
 */
static bool vfio_multiple_devices_migration_is_supported(void)
{
    VFIODevice *vbasedev;
    unsigned int device_num = 0;
    bool all_support_p2p = true;

    QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
        if (vbasedev->migration) {
            device_num++;

            if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
                all_support_p2p = false;
            }
        }
    }

    return all_support_p2p || device_num <= 1;
}
int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
{
    int ret;

    if (vfio_multiple_devices_migration_is_supported()) {
        return 0;
    }

    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_setg(errp, "Multiple VFIO devices migration is supported only if "
                         "all of them support P2P migration");
        return -EINVAL;
    }

    if (multiple_devices_migration_blocker) {
        return 0;
    }

    error_setg(&multiple_devices_migration_blocker,
               "Multiple VFIO devices migration is supported only if all of "
               "them support P2P migration");
    ret = migrate_add_blocker(&multiple_devices_migration_blocker, errp);

    return ret;
}
void vfio_unblock_multiple_devices_migration(void)
{
    if (!multiple_devices_migration_blocker ||
        !vfio_multiple_devices_migration_is_supported()) {
        return;
    }

    migrate_del_blocker(&multiple_devices_migration_blocker);
}
bool vfio_viommu_preset(VFIODevice *vbasedev)
{
    return vbasedev->container->space->as != &address_space_memory;
}
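
/*
 * Fail an in-progress migration by setting an error on the outgoing
 * migration stream, taken under the qemu_file_lock. Called below when a
 * DMA map/unmap or dirty bitmap operation fails while migration is active.
 */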
static void vfio_set_migration_error(int err)
{
    MigrationState *ms = migrate_get_current();

    if (migration_is_setup_or_active(ms->state)) {
        WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
            if (ms->to_dst_file) {
                qemu_file_set_error(ms->to_dst_file, err);
            }
        }
    }
}
bool vfio_device_state_is_running(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
           migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
}
bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
           migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
}
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
    VFIODevice *vbasedev;
    MigrationState *ms = migrate_get_current();

    if (ms->state != MIGRATION_STATUS_ACTIVE &&
        ms->state != MIGRATION_STATUS_DEVICE) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
            (vfio_device_state_is_running(vbasedev) ||
             vfio_device_state_is_precopy(vbasedev))) {
            return false;
        }
    }
    return true;
}
bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
{
    VFIODevice *vbasedev;

    QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
        if (!vbasedev->dirty_pages_supported) {
            return false;
        }
    }

    return true;
}
/*
 * Check if all VFIO devices are running and migration is active, which is
 * essentially equivalent to the migration being in pre-copy phase.
 */
bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
{
    VFIODevice *vbasedev;

    if (!migration_is_active(migrate_get_current())) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
        VFIOMigration *migration = vbasedev->migration;

        if (!migration) {
            return false;
        }

        if (vfio_device_state_is_running(vbasedev) ||
            vfio_device_state_is_precopy(vbasedev)) {
            continue;
        } else {
            return false;
        }
    }
    return true;
}
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space.  These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}
/* Called with rcu_read_lock held.  */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only)
{
    bool ret, mr_has_discard_manager;

    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
                               &mr_has_discard_manager);
    if (ret && mr_has_discard_manager) {
        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate "
                         " by setting/adjusting RLIMIT_MEMLOCK.");
    }
    return ret;
}
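
/*
 * vIOMMU MAP/UNMAP notifier: mirrors guest IOMMU translations into the
 * host VFIO container. MAP events pin the translated pages via
 * vfio_container_dma_map(); UNMAP events drop the mapping again. A
 * failure is propagated to any in-progress migration.
 */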
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainerBase *bcontainer = &giommu->container->bcontainer;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        vfio_set_migration_error(-EINVAL);
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
            goto out;
        }
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the memory object is
         * destroyed and its backing memory munmap-ed.
         */
        ret = vfio_container_dma_map(bcontainer, iova,
                                     iotlb->addr_mask + 1, vaddr,
                                     read_only);
        if (ret) {
            error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
        }
    } else {
        ret = vfio_container_dma_unmap(bcontainer, iova,
                                       iotlb->addr_mask + 1, iotlb);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         bcontainer, iova,
                         iotlb->addr_mask + 1, ret, strerror(-ret));
            vfio_set_migration_error(ret);
        }
    }
out:
    rcu_read_unlock();
}
static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_container_dma_unmap(&vrdl->container->bcontainer,
                                   iova, size, NULL);
    if (ret) {
        error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}
static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova,
                                     next - start, vaddr, section->readonly);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}
static void vfio_register_ram_discard_listener(VFIOContainer *container,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->container = container;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(container->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(container->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the same address space as RamDiscardManager sections.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (container->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            container->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, container->dma_max_mappings,
                        max_memslots);
        }
    }
}
static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}
static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!TPM_IS_CRB(mr->owner)) {
        return false;
    }

    /* this is a known safe misaligned region, just trace for debug purpose */
    trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                       section->offset_within_address_space,
                                       section->offset_within_region,
                                       qemu_real_host_page_size());
    return true;
}
static bool vfio_listener_valid_section(MemoryRegionSection *section,
                                        const char *name)
{
    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_skip(name,
                                        section->offset_within_address_space,
                                        section->offset_within_address_space +
                                        int128_get64(int128_sub(section->size,
                                                                int128_one())));
        return false;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return false;
    }

    return true;
}
static bool vfio_get_section_iova_range(VFIOContainer *container,
                                        MemoryRegionSection *section,
                                        hwaddr *out_iova, hwaddr *out_end,
                                        Int128 *out_llend)
{
    Int128 llend;
    hwaddr iova;

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));

    if (int128_ge(int128_make64(iova), llend)) {
        return false;
    }

    *out_iova = iova;
    *out_end = int128_get64(int128_sub(llend, int128_one()));
    if (out_llend) {
        *out_llend = llend;
    }
    return true;
}
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    Error *err = NULL;

    if (!vfio_listener_valid_section(section, "region_add")) {
        return;
    }

    if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size());
        }
        return;
    }

    if (vfio_container_add_section_window(container, section, &err)) {
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu_mr = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->container = container;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
                                                     container->pgsizes,
                                                     &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }

        if (container->iova_ranges) {
            ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr,
                                                      container->iova_ranges, &err);
            if (ret) {
                g_free(giommu);
                goto fail;
            }
        }

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map the
     * actually populated parts - and update the mapping whenever we're notified
     * about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_register_ram_discard_listener(container, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_container_dma_map(&container->bcontainer,
                                 iova, int128_get64(llsize), vaddr,
                                 section->readonly);
    if (ret) {
        error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%s)",
                   container, iova, int128_get64(llsize), vaddr, ret,
                   strerror(-ret));
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            error_report_err(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (memory_region_is_ram_device(section->mr)) {
        error_reportf_err(err, "PCI p2p may not work: ");
        return;
    }
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail.  Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    if (!container->initialized) {
        if (!container->error) {
            error_propagate_prepend(&container->error, err,
                                    "%s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}
static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (!vfio_listener_valid_section(section, "region_del")) {
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
        return;
    }

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;

        pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_unregister_ram_discard_listener(container, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl doesn't accept a full 64-bit span. */
            llsize = int128_rshift(llsize, 1);
            ret = vfio_container_dma_unmap(&container->bcontainer, iova,
                                           int128_get64(llsize), NULL);
            if (ret) {
                error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%s)",
                             container, iova, int128_get64(llsize), ret,
                             strerror(-ret));
            }
            iova += int128_get64(llsize);
        }
        ret = vfio_container_dma_unmap(&container->bcontainer, iova,
                                       int128_get64(llsize), NULL);
        if (ret) {
            error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         container, iova, int128_get64(llsize), ret,
                         strerror(-ret));
            vfio_set_migration_error(ret);
        }
    }

    memory_region_unref(section->mr);

    vfio_container_del_section_window(container, section);
}
typedef struct VFIODirtyRanges {
    hwaddr min32;
    hwaddr max32;
    hwaddr min64;
    hwaddr max64;
    hwaddr minpci64;
    hwaddr maxpci64;
} VFIODirtyRanges;

typedef struct VFIODirtyRangesListener {
    VFIOContainer *container;
    VFIODirtyRanges ranges;
    MemoryListener listener;
} VFIODirtyRangesListener;
static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
                                     VFIOContainer *container)
{
    VFIOPCIDevice *pcidev;
    VFIODevice *vbasedev;
    Object *owner;

    owner = memory_region_owner(section->mr);

    QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
        if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
            continue;
        }
        pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
        if (OBJECT(pcidev) == owner) {
            return true;
        }
    }

    return false;
}
static void vfio_dirty_tracking_update(MemoryListener *listener,
                                       MemoryRegionSection *section)
{
    VFIODirtyRangesListener *dirty = container_of(listener,
                                                  VFIODirtyRangesListener,
                                                  listener);
    VFIODirtyRanges *range = &dirty->ranges;
    hwaddr iova, end, *min, *max;

    if (!vfio_listener_valid_section(section, "tracking_update") ||
        !vfio_get_section_iova_range(dirty->container, section,
                                     &iova, &end, NULL)) {
        return;
    }

    /*
     * The address space passed to the dirty tracker is reduced to three ranges:
     * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
     * PCI 64-bit hole.
     *
     * The underlying reports of dirty will query a sub-interval of each of
     * these ranges.
     *
     * The purpose of the three range handling is to handle known cases of big
     * holes in the address space, like the x86 AMD 1T hole, and firmware (like
     * OVMF) which may relocate the pci-hole64 to the end of the address space.
     * The latter would otherwise generate large ranges for tracking, stressing
     * the limits of supported hardware. The pci-hole32 will always be below 4G
     * (overlapping or not) so it doesn't need special handling and is part of
     * the 32-bit range.
     *
     * The alternative would be an IOVATree but that has a much bigger runtime
     * overhead and unnecessary complexity.
     */
    if (vfio_section_is_vfio_pci(section, dirty->container) &&
        iova >= UINT32_MAX) {
        min = &range->minpci64;
        max = &range->maxpci64;
    } else {
        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
    }
    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }

    trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
    return;
}
static const MemoryListener vfio_dirty_tracking_listener = {
    .name = "vfio-tracking",
    .region_add = vfio_dirty_tracking_update,
};
static void vfio_dirty_tracking_init(VFIOContainer *container,
                                     VFIODirtyRanges *ranges)
{
    VFIODirtyRangesListener dirty;

    memset(&dirty, 0, sizeof(dirty));
    dirty.ranges.min32 = UINT32_MAX;
    dirty.ranges.min64 = UINT64_MAX;
    dirty.ranges.minpci64 = UINT64_MAX;
    dirty.listener = vfio_dirty_tracking_listener;
    dirty.container = container;

    memory_listener_register(&dirty.listener,
                             container->space->as);

    *ranges = dirty.ranges;

    /*
     * The memory listener is synchronous, and used to calculate the range
     * to dirty tracking. Unregister it after we are done as we are not
     * interested in any follow-up updates.
     */
    memory_listener_unregister(&dirty.listener);
}
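
/*
 * Device dirty tracking is driven through the VFIO_DEVICE_FEATURE ioctl
 * with the DMA_LOGGING_START/STOP/REPORT features. The feature structs
 * are built in uint64_t-aligned buffers to satisfy the alignment of the
 * variable-length payload in vfio_device_feature.data[].
 */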
static void vfio_devices_dma_logging_stop(VFIOContainer *container)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    VFIODevice *vbasedev;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;

    QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
        if (!vbasedev->dirty_tracking) {
            continue;
        }

        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            warn_report("%s: Failed to stop DMA logging, err %d (%s)",
                        vbasedev->name, -errno, strerror(errno));
        }
        vbasedev->dirty_tracking = false;
    }
}
static struct vfio_device_feature *
vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
                                             VFIODirtyRanges *tracking)
{
    struct vfio_device_feature *feature;
    size_t feature_size;
    struct vfio_device_feature_dma_logging_control *control;
    struct vfio_device_feature_dma_logging_range *ranges;

    feature_size = sizeof(struct vfio_device_feature) +
                   sizeof(struct vfio_device_feature_dma_logging_control);
    feature = g_try_malloc0(feature_size);
    if (!feature) {
        errno = ENOMEM;
        return NULL;
    }
    feature->argsz = feature_size;
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
    control->page_size = qemu_real_host_page_size();

    /*
     * DMA logging uAPI guarantees to support at least a number of ranges that
     * fits into a single host kernel base page.
     */
    control->num_ranges = !!tracking->max32 + !!tracking->max64 +
        !!tracking->maxpci64;
    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
                        control->num_ranges);
    if (!ranges) {
        g_free(feature);
        errno = ENOMEM;

        return NULL;
    }

    control->ranges = (__u64)(uintptr_t)ranges;
    if (tracking->max32) {
        ranges->iova = tracking->min32;
        ranges->length = (tracking->max32 - tracking->min32) + 1;
        ranges++;
    }
    if (tracking->max64) {
        ranges->iova = tracking->min64;
        ranges->length = (tracking->max64 - tracking->min64) + 1;
        ranges++;
    }
    if (tracking->maxpci64) {
        ranges->iova = tracking->minpci64;
        ranges->length = (tracking->maxpci64 - tracking->minpci64) + 1;
    }

    trace_vfio_device_dirty_tracking_start(control->num_ranges,
                                           tracking->min32, tracking->max32,
                                           tracking->min64, tracking->max64,
                                           tracking->minpci64, tracking->maxpci64);

    return feature;
}
static void vfio_device_feature_dma_logging_start_destroy(
    struct vfio_device_feature *feature)
{
    struct vfio_device_feature_dma_logging_control *control =
        (struct vfio_device_feature_dma_logging_control *)feature->data;
    struct vfio_device_feature_dma_logging_range *ranges =
        (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;

    g_free(ranges);
    g_free(feature);
}
static int vfio_devices_dma_logging_start(VFIOContainer *container)
{
    struct vfio_device_feature *feature;
    VFIODirtyRanges ranges;
    VFIODevice *vbasedev;
    int ret = 0;

    vfio_dirty_tracking_init(container, &ranges);
    feature = vfio_device_feature_dma_logging_start_create(container,
                                                           &ranges);
    if (!feature) {
        return -errno;
    }

    QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
        if (vbasedev->dirty_tracking) {
            continue;
        }

        ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
        if (ret) {
            ret = -errno;
            error_report("%s: Failed to start DMA logging, err %d (%s)",
                         vbasedev->name, ret, strerror(errno));
            goto out;
        }
        vbasedev->dirty_tracking = true;
    }

out:
    if (ret) {
        vfio_devices_dma_logging_stop(container);
    }

    vfio_device_feature_dma_logging_start_destroy(feature);

    return ret;
}
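
/*
 * log_global_start/stop are invoked by the memory core when dirty logging
 * is turned on/off (e.g., at migration start/end). Per-device DMA logging
 * is preferred when every device in the container supports it; otherwise
 * fall back to dirty page tracking through the container/IOMMU.
 */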
static void vfio_listener_log_global_start(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    int ret;

    if (vfio_devices_all_device_dirty_tracking(container)) {
        ret = vfio_devices_dma_logging_start(container);
    } else {
        ret = vfio_set_dirty_page_tracking(container, true);
    }

    if (ret) {
        error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
                     ret, strerror(-ret));
        vfio_set_migration_error(ret);
    }
}
static void vfio_listener_log_global_stop(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    int ret = 0;

    if (vfio_devices_all_device_dirty_tracking(container)) {
        vfio_devices_dma_logging_stop(container);
    } else {
        ret = vfio_set_dirty_page_tracking(container, false);
    }

    if (ret) {
        error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
                     ret, strerror(-ret));
        vfio_set_migration_error(ret);
    }
}
static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
                                          hwaddr size, void *bitmap)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                        sizeof(struct vfio_device_feature_dma_logging_report),
                        sizeof(__u64))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_dma_logging_report *report =
        (struct vfio_device_feature_dma_logging_report *)feature->data;

    report->iova = iova;
    report->length = size;
    report->page_size = qemu_real_host_page_size();
    report->bitmap = (__u64)(uintptr_t)bitmap;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    return 0;
}
int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
                                    VFIOBitmap *vbmap, hwaddr iova,
                                    hwaddr size)
{
    VFIODevice *vbasedev;
    int ret;

    QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
        ret = vfio_device_dma_logging_report(vbasedev, iova, size,
                                             vbmap->bitmap);
        if (ret) {
            error_report("%s: Failed to get DMA logging report, iova: "
                         "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
                         ", err: %d (%s)",
                         vbasedev->name, iova, size, ret, strerror(-ret));

            return ret;
        }
    }

    return 0;
}
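
/*
 * Query the dirty bitmap for [iova, iova + size) and mark the dirty pages
 * at ram_addr in QEMU's RAM dirty bitmap. Without any usable dirty
 * tracking support, conservatively mark the whole range dirty.
 */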
int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
                          uint64_t size, ram_addr_t ram_addr)
{
    bool all_device_dirty_tracking =
        vfio_devices_all_device_dirty_tracking(container);
    uint64_t dirty_pages;
    VFIOBitmap vbmap;
    int ret;

    if (!container->dirty_pages_supported && !all_device_dirty_tracking) {
        cpu_physical_memory_set_dirty_range(ram_addr, size,
                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
                                            DIRTY_CLIENTS_NOCODE);
        return 0;
    }

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        return ret;
    }

    if (all_device_dirty_tracking) {
        ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
    } else {
        ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size);
    }

    if (ret) {
        goto out;
    }

    dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
                                                         vbmap.pages);

    trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
                                ram_addr, dirty_pages);
out:
    g_free(vbmap.bitmap);

    return ret;
}
typedef struct {
    IOMMUNotifier n;
    VFIOGuestIOMMU *giommu;
} vfio_giommu_dirty_notifier;
static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    vfio_giommu_dirty_notifier *gdn = container_of(n,
                                                   vfio_giommu_dirty_notifier, n);
    VFIOGuestIOMMU *giommu = gdn->giommu;
    VFIOContainer *container = giommu->container;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t translated_addr;
    int ret = -EINVAL;

    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        goto out;
    }

    rcu_read_lock();
    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
        ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
                                    translated_addr);
        if (ret) {
            error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         container, iova, iotlb->addr_mask + 1, ret,
                         strerror(-ret));
        }
    }
    rcu_read_unlock();

out:
    if (ret) {
        vfio_set_migration_error(ret);
    }
}
static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
                                             void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
                                section->offset_within_region;
    VFIORamDiscardListener *vrdl = opaque;

    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
    return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
}
static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
                                                       MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to sync missing RAM discard listener");
    }

    /*
     * We only want/can synchronize the bitmap for actually mapped parts -
     * which correspond to populated parts. Replay all populated parts.
     */
    return ram_discard_manager_replay_populated(rdm, section,
                                                vfio_ram_discard_get_dirty_bitmap,
                                                vrdl);
}
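
/*
 * Dispatch a dirty bitmap sync based on the section type: vIOMMU sections
 * replay their mappings through a dirty notifier, RamDiscardManager
 * sections replay only the populated parts, and plain RAM is queried
 * directly.
 */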
static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                  MemoryRegionSection *section)
{
    ram_addr_t ram_addr;

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                Int128 llend;
                vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
                int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);

                llend = int128_add(int128_make64(section->offset_within_region),
                                   section->size);
                llend = int128_sub(llend, int128_one());

                iommu_notifier_init(&gdn.n,
                                    vfio_iommu_map_dirty_notify,
                                    IOMMU_NOTIFIER_MAP,
                                    section->offset_within_region,
                                    int128_get64(llend),
                                    idx);
                memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
                break;
            }
        }
        return 0;
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
    }

    ram_addr = memory_region_get_ram_addr(section->mr) +
               section->offset_within_region;

    return vfio_get_dirty_bitmap(container,
                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
                   int128_get64(section->size), ram_addr);
}
static void vfio_listener_log_sync(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    int ret;

    if (vfio_listener_skipped_section(section)) {
        return;
    }

    if (vfio_devices_all_dirty_tracking(container)) {
        ret = vfio_sync_dirty_bitmap(container, section);
        if (ret) {
            error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
                         strerror(-ret));
            vfio_set_migration_error(ret);
        }
    }
}
const MemoryListener vfio_memory_listener = {
    .name = "vfio",
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
    .log_global_start = vfio_listener_log_global_start,
    .log_global_stop = vfio_listener_log_global_stop,
    .log_sync = vfio_listener_log_sync,
};
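
/*
 * System reset: first let every realized device decide whether it needs a
 * reset, then issue the (possibly multi-device) hot resets.
 */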
void vfio_reset_handler(void *opaque)
{
    VFIODevice *vbasedev;

    QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
        if (vbasedev->dev->realized) {
            vbasedev->ops->vfio_compute_needs_reset(vbasedev);
        }
    }

    QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
        if (vbasedev->dev->realized && vbasedev->needs_reset) {
            vbasedev->ops->vfio_hot_reset_multi(vbasedev);
        }
    }
}
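
/*
 * Add/remove VFIO device fds to/from the KVM-VFIO pseudo device, so that
 * KVM can track VFIO usage for the VM (the pseudo device is created
 * lazily on first use; see the comment at vfio_kvm_device_fd above).
 */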
int vfio_kvm_device_add_fd(int fd, Error **errp)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_FILE,
        .attr = KVM_DEV_VFIO_FILE_ADD,
        .addr = (uint64_t)(unsigned long)&fd,
    };

    if (!kvm_enabled()) {
        return 0;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
            return -errno;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_setg_errno(errp, errno, "Failed to add fd %d to KVM VFIO device",
                         fd);
        return -errno;
    }
#endif
    return 0;
}
int vfio_kvm_device_del_fd(int fd, Error **errp)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_FILE,
        .attr = KVM_DEV_VFIO_FILE_DEL,
        .addr = (uint64_t)(unsigned long)&fd,
    };

    if (vfio_kvm_device_fd < 0) {
        error_setg(errp, "KVM VFIO device isn't created yet");
        return -EINVAL;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_setg_errno(errp, errno,
                         "Failed to remove fd %d from KVM VFIO device", fd);
        return -errno;
    }
#endif
    return 0;
}
VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *space;

    QLIST_FOREACH(space, &vfio_address_spaces, list) {
        if (space->as == as) {
            return space;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    space = g_malloc0(sizeof(*space));
    space->as = as;
    QLIST_INIT(&space->containers);

    if (QLIST_EMPTY(&vfio_address_spaces)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);

    return space;
}
void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (QLIST_EMPTY(&space->containers)) {
        QLIST_REMOVE(space, list);
        g_free(space);
    }

    if (QLIST_EMPTY(&vfio_address_spaces)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}
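
/*
 * VFIO_DEVICE_GET_INFO follows the usual vfio argsz protocol: if the
 * kernel reports a larger argsz than supplied, grow the buffer and retry
 * so that any trailing capability chain fits.
 */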
struct vfio_device_info *vfio_get_device_info(int fd)
{
    struct vfio_device_info *info;
    uint32_t argsz = sizeof(*info);

    info = g_malloc0(argsz);

retry:
    info->argsz = argsz;

    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
        g_free(info);
        return NULL;
    }

    if (info->argsz > argsz) {
        argsz = info->argsz;
        info = g_realloc(info, argsz);
        goto retry;
    }

    return info;
}