/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/migration.h"
#include "migration/misc.h"
#include "migration/blocker.h"
#include "sysemu/tpm.h"

VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM. Once created it lives
 * for the life of the VM. Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm. Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif

/*
 * Common VFIO interrupt disable
 */
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
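
/*
 * Usage sketch (illustrative, not code from this file): a PCI backend
 * switching from INTx to MSI might mask and then tear down INTx like so,
 * with "vdev" a hypothetical VFIOPCIDevice:
 *
 *     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 *     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 */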

static inline const char *action_to_str(int action)
{
    switch (action) {
    case VFIO_IRQ_SET_ACTION_MASK:
        return "MASK";
    case VFIO_IRQ_SET_ACTION_UNMASK:
        return "UNMASK";
    case VFIO_IRQ_SET_ACTION_TRIGGER:
        return "TRIGGER";
    default:
        return "UNKNOWN ACTION";
    }
}

static const char *index_to_str(VFIODevice *vbasedev, int index)
{
    if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
        return NULL;
    }

    switch (index) {
    case VFIO_PCI_INTX_IRQ_INDEX:
        return "INTX";
    case VFIO_PCI_MSI_IRQ_INDEX:
        return "MSI";
    case VFIO_PCI_MSIX_IRQ_INDEX:
        return "MSIX";
    case VFIO_PCI_ERR_IRQ_INDEX:
        return "ERR";
    case VFIO_PCI_REQ_IRQ_INDEX:
        return "REQ";
    default:
        return NULL;
    }
}

static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
         * RamDiscardManager, however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
         * completely the opposite of managing mapping/pinning dynamically as
         * required by RamDiscardManager. We would have to special-case sections
         * with a RamDiscardManager.
         */
        return ram_block_discard_disable(state);
    }
}

int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                           int action, int fd, Error **errp)
{
    struct vfio_irq_set *irq_set;
    int argsz, ret = 0;
    const char *name;
    int32_t *pfd;

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
    irq_set->index = index;
    irq_set->start = subindex;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = fd;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
        ret = -errno;
    }
    g_free(irq_set);

    if (!ret) {
        return 0;
    }

    error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");

    name = index_to_str(vbasedev, index);
    if (name) {
        error_prepend(errp, "%s-%d: ", name, subindex);
    } else {
        error_prepend(errp, "index %d-%d: ", index, subindex);
    }
    error_prepend(errp,
                  "Failed to %s %s eventfd signaling for interrupt ",
                  fd < 0 ? "tear down" : "set up", action_to_str(action));
    return ret;
}
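
/*
 * Example (sketch, not from this file): routing a device's MSI vector 0 to
 * an eventfd, or tearing it down again by passing fd = -1; "efd" is a
 * hypothetical eventfd owned by the caller:
 *
 *     vfio_set_irq_signaling(vbasedev, VFIO_PCI_MSI_IRQ_INDEX, 0,
 *                            VFIO_IRQ_SET_ACTION_TRIGGER, efd, &err);
 */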

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    case 8:
        buf.qword = cpu_to_le64(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %u bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI. This will
     * do nothing if not pending (including not in INTx mode). We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt. Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}

uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    case 8:
        data = le64_to_cpu(buf.qword);
        break;
    default:
        hw_error("vfio: unsupported read size, %u bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};
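
/*
 * Illustrative only: with these ops installed, a guest read of a 32-bit
 * register at offset 0x10 of a region ends up as
 *
 *     uint64_t val = vfio_region_read(region, 0x10, 4);
 *
 * i.e. a little-endian pread() from the region's file offset.
 */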

/*
 * Device state interfaces
 */

bool vfio_mig_active(void)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_group_list)) {
        return false;
    }

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->migration_blocker) {
                return false;
            }
        }
    }
    return true;
}

static Error *multiple_devices_migration_blocker;

static unsigned int vfio_migratable_device_num(void)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;
    unsigned int device_num = 0;

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->migration) {
                device_num++;
            }
        }
    }

    return device_num;
}

int vfio_block_multiple_devices_migration(Error **errp)
{
    int ret;

    if (multiple_devices_migration_blocker ||
        vfio_migratable_device_num() <= 1) {
        return 0;
    }

    error_setg(&multiple_devices_migration_blocker,
               "Migration is currently not supported with multiple "
               "VFIO devices");
    ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
    if (ret < 0) {
        error_free(multiple_devices_migration_blocker);
        multiple_devices_migration_blocker = NULL;
    }

    return ret;
}

void vfio_unblock_multiple_devices_migration(void)
{
    if (!multiple_devices_migration_blocker ||
        vfio_migratable_device_num() > 1) {
        return;
    }

    migrate_del_blocker(multiple_devices_migration_blocker);
    error_free(multiple_devices_migration_blocker);
    multiple_devices_migration_blocker = NULL;
}

static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;
    MigrationState *ms = migrate_get_current();

    if (!migration_is_setup_or_active(ms->state)) {
        return false;
    }

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            VFIOMigration *migration = vbasedev->migration;

            if (!migration) {
                return false;
            }

            if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
                migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
                return false;
            }
        }
    }
    return true;
}

/*
 * Check if all VFIO devices are running and migration is active, which is
 * essentially equivalent to the migration being in pre-copy phase.
 */
static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    if (!migration_is_active(migrate_get_current())) {
        return false;
    }

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            VFIOMigration *migration = vbasedev->migration;

            if (!migration) {
                return false;
            }

            if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
                continue;
            } else {
                return false;
            }
        }
    }
    return true;
}

static int vfio_dma_unmap_bitmap(VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
    int ret;

    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects a bitmap with one
     * bit per qemu_real_host_page_size page. Hence set bitmap_pgsize to
     * qemu_real_host_page_size.
     */

    bitmap->pgsize = qemu_real_host_page_size();
    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                   BITS_PER_BYTE;

    if (bitmap->size > container->max_dirty_bitmap_size) {
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
                     (uint64_t)bitmap->size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    bitmap->data = g_try_malloc0(bitmap->size);
    if (!bitmap->data) {
        ret = -ENOMEM;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
                                               iotlb->translated_addr, pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP: %m");
    }

    g_free(bitmap->data);
unmap_exit:
    g_free(unmap);
    return ret;
}
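
/*
 * Bitmap size arithmetic, for reference (assuming a 4 KiB host page size):
 * unmapping 1 GiB covers 262144 pages, so the bitmap above is
 * ROUND_UP(262144, 64) / 8 = 32768 bytes, i.e. one bit per host page.
 */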

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size,
                          IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (iotlb && container->dirty_pages_supported &&
        vfio_devices_all_running_and_mig_active(container)) {
        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
         * v4.15) where an overflow in its wrap-around check prevents us from
         * unmapping the last page of the address space. Test for the error
         * condition and re-try the unmap excluding the last page. The
         * expectation is that we've never mapped the last page anyway and this
         * unmap request comes via vIOMMU support which also makes it unlikely
         * that this page is used. This bug was introduced well after type1 v2
         * support was introduced, so we shouldn't need to test for v1. A fix
         * is queued for kernel v5.0 so this workaround can be removed once
         * affected kernels are sufficiently deprecated.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(container->pgsizes);
            continue;
        }
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }

    if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
        cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
                                            DIRTY_CLIENTS_NOCODE);
    }

    return 0;
}

static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again. This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
    return -errno;
}
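
/*
 * Example (sketch, not from this file): mapping one host page of guest RAM
 * read-only at IOVA 0x100000, with "container" an initialized VFIOContainer
 * and "hva" a valid host virtual address backing that page:
 *
 *     ret = vfio_dma_map(container, 0x100000, qemu_real_host_page_size(),
 *                        hva, true);
 */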

static void vfio_host_win_add(VFIOContainer *container,
                              hwaddr min_iova, hwaddr max_iova,
                              uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: Overlapping host DMA windows are not supported",
                     __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
}

static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
                             hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space. These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

/* Called with rcu_read_lock held. */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only)
{
    bool ret, mr_has_discard_manager;

    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
                               &mr_has_discard_manager);
    if (ret && mr_has_discard_manager) {
        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate"
                         " by setting/adjusting RLIMIT_MEMLOCK.");
    }
    return ret;
}

static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
            goto out;
        }
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the memory object is
         * destroyed and its backing memory munmap-ed.
         */
        ret = vfio_dma_map(container, iova,
                           iotlb->addr_mask + 1, vaddr,
                           read_only);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, vaddr, ret);
        }
    } else {
        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
out:
    rcu_read_unlock();
}

static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_dma_unmap(vrdl->container, iova, size, NULL);
    if (ret) {
        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}

static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_dma_map(vrdl->container, iova, next - start,
                           vaddr, section->readonly);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}
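
/*
 * Worked example for the loop above (hypothetical numbers): with a
 * granularity of 2 MiB and a populated range of [1 MiB, 5 MiB) within the
 * region, mappings are issued for [1 MiB, 2 MiB), [2 MiB, 4 MiB) and
 * [4 MiB, 5 MiB), so later discards at 2 MiB granularity always hit whole
 * mappings.
 */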

static void vfio_register_ram_discard_listener(VFIOContainer *container,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->container = container;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(container->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(container->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the same address space as RamDiscardManager sections.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (container->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            container->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, container->dma_max_mappings,
                        max_memslots);
        }
    }
}
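
/*
 * Back-of-the-envelope numbers for the check above (made-up configuration):
 * one 8 GiB virtio-mem region with 2 MiB granularity can consume up to 4096
 * mappings; together with one mapping per remaining memslot this must stay
 * under dma_max_mappings (typically 65535 with the type1 kernel driver's
 * default dma_entry_limit).
 */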

static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}

static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!TPM_IS_CRB(mr->owner)) {
        return false;
    }

    /* this is a known safe misaligned region, just trace for debug purpose */
    trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                       section->offset_within_address_space,
                                       section->offset_within_region,
                                       qemu_real_host_page_size());
    return true;
}

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found;
    Error *err = NULL;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_add_skip(
            section->offset_within_address_space,
            section->offset_within_address_space +
            int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return;
    }

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
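
    /*
     * Example: for a section at 0x1234 of size 0x3000 with 4 KiB host pages,
     * iova rounds up to 0x2000 and llend rounds down to 0x4000, so only the
     * fully covered host pages [0x2000, 0x4000) are candidates for mapping.
     */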

    if (int128_ge(int128_make64(iova), llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size());
        }
        return;
    }
    end = int128_get64(int128_sub(llend, int128_one()));

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        hwaddr pgsize = 0;

        /* For now intersections are not allowed, we may relax this later */
        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
            if (ranges_overlap(hostwin->min_iova,
                               hostwin->max_iova - hostwin->min_iova + 1,
                               section->offset_within_address_space,
                               int128_get64(section->size))) {
                error_setg(&err,
                    "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing "
                    "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                    section->offset_within_address_space,
                    section->offset_within_address_space +
                    int128_get64(section->size) - 1,
                    hostwin->min_iova, hostwin->max_iova);
                goto fail;
            }
        }

        ret = vfio_spapr_create_window(container, section, &pgsize);
        if (ret) {
            error_setg_errno(&err, -ret, "Failed to create SPAPR window");
            goto fail;
        }

        vfio_host_win_add(container, section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            VFIOGroup *group;
            IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
            struct kvm_vfio_spapr_tce param;
            struct kvm_device_attr attr = {
                .group = KVM_DEV_VFIO_GROUP,
                .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
                .addr = (uint64_t)(unsigned long)&param,
            };

            if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                              &param.tablefd)) {
                QLIST_FOREACH(group, &container->group_list, container_next) {
                    param.groupfd = group->fd;
                    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                        error_report("vfio: failed to setup fd %d "
                                     "for a group with fd %d: %s",
                                     param.tablefd, param.groupfd,
                                     strerror(errno));
                        return;
                    }
                    trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
                }
            }
        }
#endif
    }

    hostwin_found = false;
    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    if (!hostwin_found) {
        error_setg(&err, "Container %p can't map guest IOVA region"
                   " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu_mr = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->container = container;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
                                                     container->pgsizes,
                                                     &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map the
     * actually populated parts - and update the mapping whenever we're notified
     * about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_register_ram_discard_listener(container, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_dma_map(container, iova, int128_get64(llsize),
                       vaddr, section->readonly);
    if (ret) {
        error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%m)",
                   container, iova, int128_get64(llsize), vaddr, ret);
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            error_report_err(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (memory_region_is_ram_device(section->mr)) {
        error_report("failed to vfio_dma_map. pci p2p may not work");
        return;
    }
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    if (!container->initialized) {
        if (!container->error) {
            error_propagate_prepend(&container->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_del_skip(
            section->offset_within_address_space,
            section->offset_within_address_space +
            int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask()) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
        if (!vfio_known_safe_misalignment(section)) {
            error_report("%s received unaligned region %s iova=0x%"PRIx64
                         " offset_within_region=0x%"PRIx64
                         " qemu_real_host_page_size=0x%"PRIxPTR,
                         __func__, memory_region_name(section->mr),
                         section->offset_within_address_space,
                         section->offset_within_region,
                         qemu_real_host_page_size());
        }
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }
    end = int128_get64(int128_sub(llend, int128_one()));

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;
        VFIOHostDMAWindow *hostwin;
        bool hostwin_found = false;

        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
            if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
                hostwin_found = true;
                break;
            }
        }
        assert(hostwin_found); /* or region_add() would have failed */

        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_unregister_ram_discard_listener(container, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl doesn't accept a full 64-bit span. */
            llsize = int128_rshift(llsize, 1);
            ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
            if (ret) {
                error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%m)",
                             container, iova, int128_get64(llsize), ret);
            }
            iova += int128_get64(llsize);
        }
        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova, int128_get64(llsize), ret);
        }
    }

    memory_region_unref(section->mr);

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        vfio_spapr_remove_window(container,
                                 section->offset_within_address_space);
        if (vfio_host_win_del(container,
                              section->offset_within_address_space,
                              section->offset_within_address_space +
                              int128_get64(section->size) - 1) < 0) {
            hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                     __func__, section->offset_within_address_space);
        }
    }
}

static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
    int ret;
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };

    if (!container->dirty_pages_supported) {
        return;
    }

    if (start) {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
    } else {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
    if (ret) {
        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
                     dirty.flags, errno);
    }
}
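
/*
 * Illustrative call sequence (sketch): the memory listener below turns
 * tracking on when migration starts logging and off again afterwards:
 *
 *     vfio_set_dirty_page_tracking(container, true);
 *     ... migration runs, log_sync pulls per-range dirty bitmaps ...
 *     vfio_set_dirty_page_tracking(container, false);
 */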

static void vfio_listener_log_global_start(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);

    vfio_set_dirty_page_tracking(container, true);
}

static void vfio_listener_log_global_stop(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);

    vfio_set_dirty_page_tracking(container, false);
}

static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
                                 uint64_t size, ram_addr_t ram_addr)
{
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    uint64_t pages;
    int ret;

    if (!container->dirty_pages_supported) {
        cpu_physical_memory_set_dirty_range(ram_addr, size,
                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
                                            DIRTY_CLIENTS_NOCODE);
        return 0;
    }

    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects a bitmap with one
     * bit per qemu_real_host_page_size page. Hence set bitmap's pgsize to
     * qemu_real_host_page_size.
     */
    range->bitmap.pgsize = qemu_real_host_page_size();

    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                         BITS_PER_BYTE;
    range->bitmap.data = g_try_malloc0(range->bitmap.size);
    if (!range->bitmap.data) {
        ret = -ENOMEM;
        goto err_out;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
                     " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
                     (uint64_t)range->size, errno);
        goto err_out;
    }

    cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
                                           ram_addr, pages);

    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
                                range->bitmap.size, ram_addr);
err_out:
    g_free(range->bitmap.data);
    g_free(dbitmap);

    return ret;
}

typedef struct {
    IOMMUNotifier n;
    VFIOGuestIOMMU *giommu;
} vfio_giommu_dirty_notifier;

static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    vfio_giommu_dirty_notifier *gdn = container_of(n,
                                                   vfio_giommu_dirty_notifier, n);
    VFIOGuestIOMMU *giommu = gdn->giommu;
    VFIOContainer *container = giommu->container;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t translated_addr;

    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        return;
    }

    rcu_read_lock();
    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
        int ret;

        ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
                                    translated_addr);
        if (ret) {
            error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
    rcu_read_unlock();
}

static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
                                             void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
                                section->offset_within_region;
    VFIORamDiscardListener *vrdl = opaque;

    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
    return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
}

static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
                                                       MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to sync missing RAM discard listener");
    }

    /*
     * We only want/can synchronize the bitmap for actually mapped parts -
     * which correspond to populated parts. Replay all populated parts.
     */
    return ram_discard_manager_replay_populated(rdm, section,
                                                vfio_ram_discard_get_dirty_bitmap,
                                                vrdl);
}

static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                  MemoryRegionSection *section)
{
    ram_addr_t ram_addr;

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                Int128 llend;
                vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
                int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);

                llend = int128_add(int128_make64(section->offset_within_region),
                                   section->size);
                llend = int128_sub(llend, int128_one());

                iommu_notifier_init(&gdn.n,
                                    vfio_iommu_map_dirty_notify,
                                    IOMMU_NOTIFIER_MAP,
                                    section->offset_within_region,
                                    int128_get64(llend),
                                    idx);
                memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
                break;
            }
        }
        return 0;
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
    }

    ram_addr = memory_region_get_ram_addr(section->mr) +
               section->offset_within_region;

    return vfio_get_dirty_bitmap(container,
                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
                   int128_get64(section->size), ram_addr);
}

static void vfio_listener_log_sync(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);

    if (vfio_listener_skipped_section(section)) {
        return;
    }

    if (vfio_devices_all_dirty_tracking(container)) {
        vfio_sync_dirty_bitmap(container, section);
    }
}

static const MemoryListener vfio_memory_listener = {
    .name = "vfio",
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
    .log_global_start = vfio_listener_log_global_start,
    .log_global_stop = vfio_listener_log_global_stop,
    .log_sync = vfio_listener_log_sync,
};

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->listener);
    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&container->prereg_listener);
    }
}

static struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
{
    struct vfio_info_cap_header *hdr;

    for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}
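
/*
 * The capability chain is a list of vfio_info_cap_header entries threaded
 * through the info buffer by byte offset; a "next" offset of 0 points back
 * at the buffer start and terminates the walk above.
 */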

struct vfio_info_cap_header *
vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
                             unsigned int *avail)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_dma_avail *cap;

    /* If the capability cannot be found, assume no DMA limiting */
    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
    if (hdr == NULL) {
        return false;
    }

    if (avail != NULL) {
        cap = (void *) hdr;
        *avail = cap->avail;
    }

    return true;
}
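
/*
 * Example (sketch, assuming "info" was filled in by VFIO_IOMMU_GET_INFO):
 *
 *     unsigned int avail;
 *     if (vfio_get_info_dma_avail(info, &avail)) {
 *         container->dma_max_mappings = avail;
 *     }
 */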

static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_sparse_mmap *sparse;
    int i, j;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
        if (sparse->areas[i].size) {
            trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
                                                sparse->areas[i].offset +
                                                sparse->areas[i].size - 1);
            region->mmaps[j].offset = sparse->areas[i].offset;
            region->mmaps[j].size = sparse->areas[i].size;
            j++;
        }
    }

    region->nr_mmaps = j;
    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));

    return 0;
}
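
/*
 * Example: a BAR whose MSI-X table occupies [0x1000, 0x2000) is typically
 * described as two sparse areas, [0, 0x1000) and [0x2000, size), so the
 * table stays trapped while the rest of the BAR can be mmap'd directly.
 */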

int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    struct vfio_region_info *info;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;
    region->nr = index;

    if (region->size) {
        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        if (!vbasedev->no_mmap &&
            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {

            ret = vfio_setup_region_sparse_mmaps(region, info);

            if (ret) {
                region->nr_mmaps = 1;
                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                region->mmaps[0].offset = 0;
                region->mmaps[0].size = region->size;
            }
        }
    }

    g_free(info);

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}

static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}

int vfio_region_mmap(VFIORegion *region)
{
    int i, prot = 0;
    char *name;

    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
                                     MAP_SHARED, region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            int ret = -errno;

            trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                         region->fd_offset +
                                         region->mmaps[i].offset,
                                         region->fd_offset +
                                         region->mmaps[i].offset +
                                         region->mmaps[i].size - 1, ret);

            region->mmaps[i].mmap = NULL;

            for (i--; i >= 0; i--) {
                vfio_subregion_unmap(region, i);
            }

            return ret;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;
}

void vfio_region_unmap(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            vfio_subregion_unmap(region, i);
        }
    }
}

void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}

void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}

void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}

void vfio_reset_handler(void *opaque)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

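    /*
     * Two passes: first let every realized device record whether it needs a
     * reset, then perform the (possibly multi-device) hot resets, so one bus
     * reset can serve several affected devices at once.
     */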
1807 QLIST_FOREACH(group, &vfio_group_list, next) {
1808 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1809 if (vbasedev->dev->realized) {
1810 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1811 }
1812 }
1813 }
1814
1815 QLIST_FOREACH(group, &vfio_group_list, next) {
1816 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1817 if (vbasedev->dev->realized && vbasedev->needs_reset) {
1818 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1819 }
1820 }
1821 }
1822 }
1823
1824 static void vfio_kvm_device_add_group(VFIOGroup *group)
1825 {
1826 #ifdef CONFIG_KVM
1827 struct kvm_device_attr attr = {
1828 .group = KVM_DEV_VFIO_GROUP,
1829 .attr = KVM_DEV_VFIO_GROUP_ADD,
1830 .addr = (uint64_t)(unsigned long)&group->fd,
1831 };
1832
1833 if (!kvm_enabled()) {
1834 return;
1835 }
1836
1837 if (vfio_kvm_device_fd < 0) {
1838 struct kvm_create_device cd = {
1839 .type = KVM_DEV_TYPE_VFIO,
1840 };
1841
1842 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
1843 error_report("Failed to create KVM VFIO device: %m");
1844 return;
1845 }
1846
1847 vfio_kvm_device_fd = cd.fd;
1848 }
1849
1850 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1851 error_report("Failed to add group %d to KVM VFIO device: %m",
1852 group->groupid);
1853 }
1854 #endif
1855 }
1856
1857 static void vfio_kvm_device_del_group(VFIOGroup *group)
1858 {
1859 #ifdef CONFIG_KVM
1860 struct kvm_device_attr attr = {
1861 .group = KVM_DEV_VFIO_GROUP,
1862 .attr = KVM_DEV_VFIO_GROUP_DEL,
1863 .addr = (uint64_t)(unsigned long)&group->fd,
1864 };
1865
1866 if (vfio_kvm_device_fd < 0) {
1867 return;
1868 }
1869
1870 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1871 error_report("Failed to remove group %d from KVM VFIO device: %m",
1872 group->groupid);
1873 }
1874 #endif
1875 }
1876
1877 static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1878 {
1879 VFIOAddressSpace *space;
1880
1881 QLIST_FOREACH(space, &vfio_address_spaces, list) {
1882 if (space->as == as) {
1883 return space;
1884 }
1885 }
1886
1887 /* No suitable VFIOAddressSpace, create a new one */
1888 space = g_malloc0(sizeof(*space));
1889 space->as = as;
1890 QLIST_INIT(&space->containers);
1891
1892 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1893
1894 return space;
1895 }
1896
1897 static void vfio_put_address_space(VFIOAddressSpace *space)
1898 {
1899 if (QLIST_EMPTY(&space->containers)) {
1900 QLIST_REMOVE(space, list);
1901 g_free(space);
1902 }
1903 }
1904
1905 /*
1906 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
1907 */
1908 static int vfio_get_iommu_type(VFIOContainer *container,
1909 Error **errp)
1910 {
1911 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
1912 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
1913 int i;
1914
1915 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
1916 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
1917 return iommu_types[i];
1918 }
1919 }
1920 error_setg(errp, "No available IOMMU models");
1921 return -EINVAL;
1922 }
1923
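/*
 * Attach the group to the container and negotiate the IOMMU type the
 * container will use.
 */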
1924 static int vfio_init_container(VFIOContainer *container, int group_fd,
1925 Error **errp)
1926 {
1927 int iommu_type, ret;
1928
1929 iommu_type = vfio_get_iommu_type(container, errp);
1930 if (iommu_type < 0) {
1931 return iommu_type;
1932 }
1933
1934 ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
1935 if (ret) {
1936 error_setg_errno(errp, errno, "Failed to set group container");
1937 return -errno;
1938 }
1939
1940 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
1941 if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1942 /*
1943 * On sPAPR, although the IOMMU subdriver always advertises both v1
1944 * and v2, the running platform may not support v2, and there is no
1945 * way to know until an IOMMU group is added to the container.
1946 * If setting v2 fails, fall back to v1.
1947 */
1948 iommu_type = VFIO_SPAPR_TCE_IOMMU;
1949 continue;
1950 }
1951 error_setg_errno(errp, errno, "Failed to set iommu for container");
1952 return -errno;
1953 }
1954
1955 container->iommu_type = iommu_type;
1956 return 0;
1957 }
1958
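/*
 * Query VFIO_IOMMU_GET_INFO, growing the buffer until the kernel's
 * reply, including any capability chain, fits. The caller must free
 * *info with g_free().
 */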
1959 static int vfio_get_iommu_info(VFIOContainer *container,
1960 struct vfio_iommu_type1_info **info)
1961 {
1963 size_t argsz = sizeof(struct vfio_iommu_type1_info);
1964
1965 *info = g_new0(struct vfio_iommu_type1_info, 1);
1966 again:
1967 (*info)->argsz = argsz;
1968
1969 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
1970 g_free(*info);
1971 *info = NULL;
1972 return -errno;
1973 }
1974
1975 if ((*info)->argsz > argsz) {
1976 argsz = (*info)->argsz;
1977 *info = g_realloc(*info, argsz);
1978 goto again;
1979 }
1980
1981 return 0;
1982 }
1983
1984 static struct vfio_info_cap_header *
1985 vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1986 {
1987 struct vfio_info_cap_header *hdr;
1988 void *ptr = info;
1989
1990 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1991 return NULL;
1992 }
1993
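/* A 'next' offset of zero terminates the chain, bringing hdr back to ptr */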
1994 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1995 if (hdr->id == id) {
1996 return hdr;
1997 }
1998 }
1999
2000 return NULL;
2001 }
2002
2003 static void vfio_get_iommu_info_migration(VFIOContainer *container,
2004 struct vfio_iommu_type1_info *info)
2005 {
2006 struct vfio_info_cap_header *hdr;
2007 struct vfio_iommu_type1_info_cap_migration *cap_mig;
2008
2009 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
2010 if (!hdr) {
2011 return;
2012 }
2013
2014 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
2015 header);
2016
2017 /*
2018 * cpu_physical_memory_set_dirty_lebitmap() marks pages dirty in units
2019 * of qemu_real_host_page_size(), so that page size must be supported.
2020 */
2021 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
2022 container->dirty_pages_supported = true;
2023 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
2024 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
2025 }
2026 }
2027
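/*
 * Connect @group to a container in the given address space: reuse an
 * existing container that accepts the group, or open /dev/vfio/vfio
 * and configure a new one (IOMMU type, host DMA windows, memory
 * listener).
 */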
2028 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
2029 Error **errp)
2030 {
2031 VFIOContainer *container;
2032 int ret, fd;
2033 VFIOAddressSpace *space;
2034
2035 space = vfio_get_address_space(as);
2036
2037 /*
2038 * VFIO is currently incompatible with discarding of RAM insofar as the
2039 * madvise to purge (zap) the page from QEMU's address space does not
2040 * interact with the memory API and therefore leaves stale virtual to
2041 * physical mappings in the IOMMU if the page was previously pinned. We
2042 * therefore set discarding broken for each group added to a container,
2043 * whether the container is used individually or shared. This provides
2044 * us with options to allow devices within a group to opt-in and allow
2045 * discarding, so long as it is done consistently for a group (for instance
2046 * if the device is an mdev device where it is known that the host vendor
2047 * driver will never pin pages outside of the working set of the guest
2048 * driver, which would thus not be discarding candidates).
2049 *
2050 * The first opportunity to induce pinning occurs here where we attempt to
2051 * attach the group to existing containers within the AddressSpace. If any
2052 * pages are already zapped from the virtual address space, such as from
2053 * previous discards, new pinning will cause valid mappings to be
2054 * re-established. Likewise, when the overall MemoryListener for a new
2055 * container is registered, a replay of mappings within the AddressSpace
2056 * will occur, re-establishing any previously zapped pages as well.
2057 *
2058 * In particular, virtio-balloon is currently only prevented from
2059 * discarding new memory; it does not yet set
2060 * ram_block_discard_set_required() and therefore neither stops us here
2061 * nor deals with the sudden memory consumption of inflated memory.
2062 *
2063 * We do support discarding of memory coordinated via the RamDiscardManager
2064 * with some IOMMU types. vfio_ram_block_discard_disable() handles the
2065 * details once we know which type of IOMMU we are using.
2066 */
2067
2068 QLIST_FOREACH(container, &space->containers, next) {
2069 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
2070 ret = vfio_ram_block_discard_disable(container, true);
2071 if (ret) {
2072 error_setg_errno(errp, -ret,
2073 "Cannot set discarding of RAM broken");
2074 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2075 &container->fd)) {
2076 error_report("vfio: error disconnecting group %d from"
2077 " container", group->groupid);
2078 }
2079 return ret;
2080 }
2081 group->container = container;
2082 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2083 vfio_kvm_device_add_group(group);
2084 return 0;
2085 }
2086 }
2087
2088 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
2089 if (fd < 0) {
2090 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
2091 ret = -errno;
2092 goto put_space_exit;
2093 }
2094
2095 ret = ioctl(fd, VFIO_GET_API_VERSION);
2096 if (ret != VFIO_API_VERSION) {
2097 error_setg(errp, "supported vfio version: %d, "
2098 "reported version: %d", VFIO_API_VERSION, ret);
2099 ret = -EINVAL;
2100 goto close_fd_exit;
2101 }
2102
2103 container = g_malloc0(sizeof(*container));
2104 container->space = space;
2105 container->fd = fd;
2106 container->error = NULL;
2107 container->dirty_pages_supported = false;
2108 container->dma_max_mappings = 0;
2109 QLIST_INIT(&container->giommu_list);
2110 QLIST_INIT(&container->hostwin_list);
2111 QLIST_INIT(&container->vrdl_list);
2112
2113 ret = vfio_init_container(container, group->fd, errp);
2114 if (ret) {
2115 goto free_container_exit;
2116 }
2117
2118 ret = vfio_ram_block_discard_disable(container, true);
2119 if (ret) {
2120 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2121 goto free_container_exit;
2122 }
2123
2124 switch (container->iommu_type) {
2125 case VFIO_TYPE1v2_IOMMU:
2126 case VFIO_TYPE1_IOMMU:
2127 {
2128 struct vfio_iommu_type1_info *info;
2129
2130 ret = vfio_get_iommu_info(container, &info);
2131 if (ret) {
2132 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
2133 goto enable_discards_exit;
2134 }
2135
2136 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
2137 container->pgsizes = info->iova_pgsizes;
2138 } else {
2139 container->pgsizes = qemu_real_host_page_size();
2140 }
2141
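/*
 * If the kernel does not report how many DMA mappings are available,
 * assume 65535, the default dma_entry_limit of the type1 IOMMU driver.
 */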
2142 if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
2143 container->dma_max_mappings = 65535;
2144 }
2145 vfio_get_iommu_info_migration(container, info);
2146 g_free(info);
2147
2148 /*
2149 * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
2150 * information to get the actual window extent rather than assume
2151 * a 64-bit IOVA address space.
2152 */
2153 vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
2154
2155 break;
2156 }
2157 case VFIO_SPAPR_TCE_v2_IOMMU:
2158 case VFIO_SPAPR_TCE_IOMMU:
2159 {
2160 struct vfio_iommu_spapr_tce_info info;
2161 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
2162
2163 /*
2164 * The host kernel code implementing VFIO_IOMMU_DISABLE is called
2165 * when the container fd is closed, so we do not call it explicitly
2166 * in this file.
2167 */
2168 if (!v2) {
2169 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2170 if (ret) {
2171 error_setg_errno(errp, errno, "failed to enable container");
2172 ret = -errno;
2173 goto enable_discards_exit;
2174 }
2175 } else {
2176 container->prereg_listener = vfio_prereg_listener;
2177
2178 memory_listener_register(&container->prereg_listener,
2179 &address_space_memory);
2180 if (container->error) {
2181 memory_listener_unregister(&container->prereg_listener);
2182 ret = -1;
2183 error_propagate_prepend(errp, container->error,
2184 "RAM memory listener initialization failed: ");
2185 goto enable_discards_exit;
2186 }
2187 }
2188
2189 info.argsz = sizeof(info);
2190 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2191 if (ret) {
2192 error_setg_errno(errp, errno,
2193 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
2194 ret = -errno;
2195 if (v2) {
2196 memory_listener_unregister(&container->prereg_listener);
2197 }
2198 goto enable_discards_exit;
2199 }
2200
2201 if (v2) {
2202 container->pgsizes = info.ddw.pgsizes;
2203 /*
2204 * A freshly created container comes with a default DMA window.
2205 * To keep region_add/del simple, remove this window now and let
2206 * the iommu_listener callbacks create and remove windows as
2207 * needed.
2208 */
2209 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2210 if (ret) {
2211 error_setg_errno(errp, -ret,
2212 "failed to remove existing window");
2213 goto enable_discards_exit;
2214 }
2215 } else {
2216 /* The default table uses 4K pages */
2217 container->pgsizes = 0x1000;
2218 vfio_host_win_add(container, info.dma32_window_start,
2219 info.dma32_window_start +
2220 info.dma32_window_size - 1,
2221 0x1000);
2222 }
2223 }
2224 }
2225
2226 vfio_kvm_device_add_group(group);
2227
2228 QLIST_INIT(&container->group_list);
2229 QLIST_INSERT_HEAD(&space->containers, container, next);
2230
2231 group->container = container;
2232 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2233
2234 container->listener = vfio_memory_listener;
2235
2236 memory_listener_register(&container->listener, container->space->as);
2237
2238 if (container->error) {
2239 ret = -1;
2240 error_propagate_prepend(errp, container->error,
2241 "memory listener initialization failed: ");
2242 goto listener_release_exit;
2243 }
2244
2245 container->initialized = true;
2246
2247 return 0;
2248 listener_release_exit:
2249 QLIST_REMOVE(group, container_next);
2250 QLIST_REMOVE(container, next);
2251 vfio_kvm_device_del_group(group);
2252 vfio_listener_release(container);
2253
2254 enable_discards_exit:
2255 vfio_ram_block_discard_disable(container, false);
2256
2257 free_container_exit:
2258 g_free(container);
2259
2260 close_fd_exit:
2261 close(fd);
2262
2263 put_space_exit:
2264 vfio_put_address_space(space);
2265
2266 return ret;
2267 }
2268
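/*
 * Detach @group from its container and, if it was the last group,
 * tear the container down completely.
 */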
2269 static void vfio_disconnect_container(VFIOGroup *group)
2270 {
2271 VFIOContainer *container = group->container;
2272
2273 QLIST_REMOVE(group, container_next);
2274 group->container = NULL;
2275
2276 /*
2277 * Explicitly release the listener before unsetting the container,
2278 * since unsetting it may destroy the backend container if this is
2279 * the last group.
2280 */
2281 if (QLIST_EMPTY(&container->group_list)) {
2282 vfio_listener_release(container);
2283 }
2284
2285 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
2286 error_report("vfio: error disconnecting group %d from container",
2287 group->groupid);
2288 }
2289
2290 if (QLIST_EMPTY(&container->group_list)) {
2291 VFIOAddressSpace *space = container->space;
2292 VFIOGuestIOMMU *giommu, *tmp;
2293 VFIOHostDMAWindow *hostwin, *next;
2294
2295 QLIST_REMOVE(container, next);
2296
2297 QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
2298 memory_region_unregister_iommu_notifier(
2299 MEMORY_REGION(giommu->iommu_mr), &giommu->n);
2300 QLIST_REMOVE(giommu, giommu_next);
2301 g_free(giommu);
2302 }
2303
2304 QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
2305 next) {
2306 QLIST_REMOVE(hostwin, hostwin_next);
2307 g_free(hostwin);
2308 }
2309
2310 trace_vfio_disconnect_container(container->fd);
2311 close(container->fd);
2312 g_free(container);
2313
2314 vfio_put_address_space(space);
2315 }
2316 }
2317
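/*
 * Return the VFIOGroup for @groupid, opening /dev/vfio/<groupid> and
 * connecting it to a container in @as on first use. A group can only
 * ever be attached to a single address space.
 */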
2318 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
2319 {
2320 VFIOGroup *group;
2321 char path[32];
2322 struct vfio_group_status status = { .argsz = sizeof(status) };
2323
2324 QLIST_FOREACH(group, &vfio_group_list, next) {
2325 if (group->groupid == groupid) {
2326 /* Found it. Now is it already in the right context? */
2327 if (group->container->space->as == as) {
2328 return group;
2329 } else {
2330 error_setg(errp, "group %d used in multiple address spaces",
2331 group->groupid);
2332 return NULL;
2333 }
2334 }
2335 }
2336
2337 group = g_malloc0(sizeof(*group));
2338
2339 snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
2340 group->fd = qemu_open_old(path, O_RDWR);
2341 if (group->fd < 0) {
2342 error_setg_errno(errp, errno, "failed to open %s", path);
2343 goto free_group_exit;
2344 }
2345
2346 if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
2347 error_setg_errno(errp, errno, "failed to get group %d status", groupid);
2348 goto close_fd_exit;
2349 }
2350
2351 if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
2352 error_setg(errp, "group %d is not viable", groupid);
2353 error_append_hint(errp,
2354 "Please ensure all devices within the iommu_group "
2355 "are bound to their vfio bus driver.\n");
2356 goto close_fd_exit;
2357 }
2358
2359 group->groupid = groupid;
2360 QLIST_INIT(&group->device_list);
2361
2362 if (vfio_connect_container(group, as, errp)) {
2363 error_prepend(errp, "failed to setup container for group %d: ",
2364 groupid);
2365 goto close_fd_exit;
2366 }
2367
2368 if (QLIST_EMPTY(&vfio_group_list)) {
2369 qemu_register_reset(vfio_reset_handler, NULL);
2370 }
2371
2372 QLIST_INSERT_HEAD(&vfio_group_list, group, next);
2373
2374 return group;
2375
2376 close_fd_exit:
2377 close(group->fd);
2378
2379 free_group_exit:
2380 g_free(group);
2381
2382 return NULL;
2383 }
2384
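/*
 * Drop a group once its last device has been released; a no-op while
 * the group still has devices attached.
 */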
2385 void vfio_put_group(VFIOGroup *group)
2386 {
2387 if (!group || !QLIST_EMPTY(&group->device_list)) {
2388 return;
2389 }
2390
2391 if (!group->ram_block_discard_allowed) {
2392 vfio_ram_block_discard_disable(group->container, false);
2393 }
2394 vfio_kvm_device_del_group(group);
2395 vfio_disconnect_container(group);
2396 QLIST_REMOVE(group, next);
2397 trace_vfio_put_group(group->fd);
2398 close(group->fd);
2399 g_free(group);
2400
2401 if (QLIST_EMPTY(&vfio_group_list)) {
2402 qemu_unregister_reset(vfio_reset_handler, NULL);
2403 }
2404 }
2405
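/*
 * Obtain a device fd for @name from the group and fill in the
 * VFIODevice with its region/IRQ counts and flags.
 */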
2406 int vfio_get_device(VFIOGroup *group, const char *name,
2407 VFIODevice *vbasedev, Error **errp)
2408 {
2409 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
2410 int ret, fd;
2411
2412 fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
2413 if (fd < 0) {
2414 error_setg_errno(errp, errno, "error getting device from group %d",
2415 group->groupid);
2416 error_append_hint(errp,
2417 "Verify all devices in group %d are bound to vfio-<bus> "
2418 "or pci-stub and not already in use\n", group->groupid);
2419 return fd;
2420 }
2421
2422 ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
2423 if (ret) {
2424 error_setg_errno(errp, errno, "error getting device info");
2425 close(fd);
2426 return ret;
2427 }
2428
2429 /*
2430 * Set discarding of RAM as not broken for this group if the driver knows
2431 * the device operates compatibly with discarding. Setting must be
2432 * consistent per group, but since compatibility is really only possible
2433 * with mdev currently, we expect singleton groups.
2434 */
2435 if (vbasedev->ram_block_discard_allowed !=
2436 group->ram_block_discard_allowed) {
2437 if (!QLIST_EMPTY(&group->device_list)) {
2438 error_setg(errp, "Inconsistent setting of support for discarding "
2439 "RAM (e.g., balloon) within group");
2440 close(fd);
2441 return -1;
2442 }
2443
2444 if (!group->ram_block_discard_allowed) {
2445 group->ram_block_discard_allowed = true;
2446 vfio_ram_block_discard_disable(group->container, false);
2447 }
2448 }
2449
2450 vbasedev->fd = fd;
2451 vbasedev->group = group;
2452 QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
2453
2454 vbasedev->num_irqs = dev_info.num_irqs;
2455 vbasedev->num_regions = dev_info.num_regions;
2456 vbasedev->flags = dev_info.flags;
2457
2458 trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
2459 dev_info.num_irqs);
2460
2461 vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
2462 return 0;
2463 }
2464
2465 void vfio_put_base_device(VFIODevice *vbasedev)
2466 {
2467 if (!vbasedev->group) {
2468 return;
2469 }
2470 QLIST_REMOVE(vbasedev, next);
2471 vbasedev->group = NULL;
2472 trace_vfio_put_base_device(vbasedev->fd);
2473 close(vbasedev->fd);
2474 }
2475
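/*
 * Fetch region info for @index, growing the buffer as needed so any
 * capability chain fits; the caller must g_free(*info).
 *
 * A minimal usage sketch (hypothetical caller, for illustration only):
 *
 *     struct vfio_region_info *info;
 *
 *     if (!vfio_get_region_info(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX,
 *                               &info)) {
 *         // use info->offset, info->size, info->flags
 *         g_free(info);
 *     }
 */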
2476 int vfio_get_region_info(VFIODevice *vbasedev, int index,
2477 struct vfio_region_info **info)
2478 {
2479 size_t argsz = sizeof(struct vfio_region_info);
2480
2481 *info = g_malloc0(argsz);
2482
2483 (*info)->index = index;
2484 retry:
2485 (*info)->argsz = argsz;
2486
2487 if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
2488 g_free(*info);
2489 *info = NULL;
2490 return -errno;
2491 }
2492
2493 if ((*info)->argsz > argsz) {
2494 argsz = (*info)->argsz;
2495 *info = g_realloc(*info, argsz);
2496
2497 goto retry;
2498 }
2499
2500 return 0;
2501 }
2502
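/*
 * Scan the device's regions for a device-specific region whose type
 * capability matches @type/@subtype; on success *info holds its info.
 */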
2503 int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
2504 uint32_t subtype, struct vfio_region_info **info)
2505 {
2506 int i;
2507
2508 for (i = 0; i < vbasedev->num_regions; i++) {
2509 struct vfio_info_cap_header *hdr;
2510 struct vfio_region_info_cap_type *cap_type;
2511
2512 if (vfio_get_region_info(vbasedev, i, info)) {
2513 continue;
2514 }
2515
2516 hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
2517 if (!hdr) {
2518 g_free(*info);
2519 continue;
2520 }
2521
2522 cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
2523
2524 trace_vfio_get_dev_region(vbasedev->name, i,
2525 cap_type->type, cap_type->subtype);
2526
2527 if (cap_type->type == type && cap_type->subtype == subtype) {
2528 return 0;
2529 }
2530
2531 g_free(*info);
2532 }
2533
2534 *info = NULL;
2535 return -ENODEV;
2536 }
2537
2538 bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
2539 {
2540 struct vfio_region_info *info = NULL;
2541 bool ret = false;
2542
2543 if (!vfio_get_region_info(vbasedev, region, &info)) {
2544 if (vfio_get_region_info_cap(info, cap_type)) {
2545 ret = true;
2546 }
2547 g_free(info);
2548 }
2549
2550 return ret;
2551 }
2552
2553 /*
2554 * Interfaces for IBM EEH (Enhanced Error Handling)
2555 */
2556 static bool vfio_eeh_container_ok(VFIOContainer *container)
2557 {
2558 /*
2559 * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
2560 * implementation is broken if there are multiple groups in a
2561 * container. The hardware works in units of Partitionable
2562 * Endpoints (== IOMMU groups) and the EEH operations naively
2563 * iterate across all groups in the container, without any logic
2564 * to make sure the groups have their state synchronized. For
2565 * certain operations (ENABLE) that might be ok, until an error
2566 * occurs, but for others (GET_STATE) it's clearly broken.
2567 */
2568
2569 /*
2570 * XXX Once fixed kernels exist, test for them here
2571 */
2572
2573 if (QLIST_EMPTY(&container->group_list)) {
2574 return false;
2575 }
2576
2577 if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
2578 return false;
2579 }
2580
2581 return true;
2582 }
2583
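/*
 * Issue a single EEH PE operation against the container, which the
 * kernel only handles sanely for containers with exactly one group.
 */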
2584 static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
2585 {
2586 struct vfio_eeh_pe_op pe_op = {
2587 .argsz = sizeof(pe_op),
2588 .op = op,
2589 };
2590 int ret;
2591
2592 if (!vfio_eeh_container_ok(container)) {
2593 error_report("vfio/eeh: EEH_PE_OP 0x%x: "
2594 "kernel requires a container with exactly one group", op);
2595 return -EPERM;
2596 }
2597
2598 ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
2599 if (ret < 0) {
2600 error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
2601 return -errno;
2602 }
2603
2604 return ret;
2605 }
2606
2607 static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
2608 {
2609 VFIOAddressSpace *space = vfio_get_address_space(as);
2610 VFIOContainer *container = NULL;
2611
2612 if (QLIST_EMPTY(&space->containers)) {
2613 /* No containers to act on */
2614 goto out;
2615 }
2616
2617 container = QLIST_FIRST(&space->containers);
2618
2619 if (QLIST_NEXT(container, next)) {
2620 /* We don't yet have logic to synchronize EEH state across multiple containers */
2622 container = NULL;
2623 goto out;
2624 }
2625
2626 out:
2627 vfio_put_address_space(space);
2628 return container;
2629 }
2630
2631 bool vfio_eeh_as_ok(AddressSpace *as)
2632 {
2633 VFIOContainer *container = vfio_eeh_as_container(as);
2634
2635 return (container != NULL) && vfio_eeh_container_ok(container);
2636 }
2637
2638 int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
2639 {
2640 VFIOContainer *container = vfio_eeh_as_container(as);
2641
2642 if (!container) {
2643 return -ENODEV;
2644 }
2645 return vfio_eeh_container_op(container, op);
2646 }