hw/misc/vfio.c

   1 /*
   2  * vfio based device assignment support
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <alex.williamson@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19  */
  20
  21 #include <dirent.h>
  22 #include <linux/vfio.h>
  23 #include <sys/ioctl.h>
  24 #include <sys/mman.h>
  25 #include <sys/stat.h>
  26 #include <sys/types.h>
  27 #include <unistd.h>
  28
  29 #include "config.h"
  30 #include "exec/address-spaces.h"
  31 #include "exec/memory.h"
  32 #include "hw/pci/msi.h"
  33 #include "hw/pci/msix.h"
  34 #include "hw/pci/pci.h"
  35 #include "qemu-common.h"
  36 #include "qemu/error-report.h"
  37 #include "qemu/event_notifier.h"
  38 #include "qemu/queue.h"
  39 #include "qemu/range.h"
  40 #include "sysemu/kvm.h"
  41 #include "sysemu/sysemu.h"
  42
/*
 * Uncomment the define below to compile in debug tracing.  With DEBUG_VFIO
 * defined, DPRINTF() prints to stderr with a "vfio: " prefix; otherwise it
 * expands to an empty statement and its arguments are not evaluated.
 */
/* #define DEBUG_VFIO */
#ifdef DEBUG_VFIO
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/*
 * Extra debugging, trap acceleration paths for more logging.
 *
 * VFIO_ALLOW_KVM_INTX, VFIO_ALLOW_KVM_MSI and VFIO_ALLOW_KVM_MSIX are
 * tested in vfio_enable_intx_kvm(), vfio_enable_msi() and
 * vfio_msix_vector_do_use() respectively; when zero, those functions do
 * not set up the KVM path and interrupts are handled by QEMU instead.
 * NOTE(review): VFIO_ALLOW_MMAP is not referenced in this part of the
 * file; presumably it gates BAR mmaps - confirm at its use site.
 */
#define VFIO_ALLOW_MMAP 1
#define VFIO_ALLOW_KVM_INTX 1
#define VFIO_ALLOW_KVM_MSI 1
#define VFIO_ALLOW_KVM_MSIX 1

/* Forward declaration, VFIOQuirk below holds a back pointer to the device */
struct VFIODevice;
  59
/*
 * One device quirk.  Quirks are chained through 'next' onto the 'quirks'
 * list head found in VFIOBAR and VFIOVGARegion.
 */
typedef struct VFIOQuirk {
    MemoryRegion mem; /* memory region embedded in this quirk */
    struct VFIODevice *vdev; /* back pointer to device */
    QLIST_ENTRY(VFIOQuirk) next; /* linkage on the owner's quirks list */
    /*
     * Quirk parameters.  Offset fields are TARGET_PAGE_BITS wide, size and
     * bar fields are 3 bits wide.
     * NOTE(review): none of these fields are referenced in the visible part
     * of this file; the names suggest an address/data window match
     * description - confirm against the quirk handlers.
     */
    struct {
        uint32_t base_offset:TARGET_PAGE_BITS;
        uint32_t address_offset:TARGET_PAGE_BITS;
        uint32_t address_size:3;
        uint32_t bar:3;

        uint32_t address_match;
        uint32_t address_mask;

        uint32_t address_val:TARGET_PAGE_BITS;
        uint32_t data_offset:TARGET_PAGE_BITS;
        uint32_t data_size:3;

        uint8_t flags;
        uint8_t read_flags;
        uint8_t write_flags;
    } data;
} VFIOQuirk;
  82
/* State kept for one BAR of an assigned device */
typedef struct VFIOBAR {
    off_t fd_offset; /* offset of BAR within device fd */
    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
    MemoryRegion mem; /* slow, read/write access */
    MemoryRegion mmap_mem; /* direct mapped access */
    void *mmap; /* presumably the host mapping behind mmap_mem - confirm */
    size_t size; /* presumably the BAR size in bytes - confirm */
    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
    uint8_t nr; /* cache the BAR number for debug */
    bool ioport; /* presumably set for I/O port BARs - confirm */
    bool mem64; /* presumably set for 64-bit memory BARs - confirm */
    QLIST_HEAD(, VFIOQuirk) quirks; /* head of this BAR's VFIOQuirk list */
} VFIOBAR;
  96
/* One VGA region; VFIOVGA holds QEMU_PCI_VGA_NUM_REGIONS of these */
typedef struct VFIOVGARegion {
    MemoryRegion mem; /* memory region embedded in this VGA region */
    off_t offset; /* NOTE(review): base of this region is not shown here - confirm */
    int nr; /* presumably the index of this region - confirm */
    QLIST_HEAD(, VFIOQuirk) quirks; /* head of this region's VFIOQuirk list */
} VFIOVGARegion;
 103
/* VGA state of a device, embedded in VFIODevice as 'vga' */
typedef struct VFIOVGA {
    off_t fd_offset; /* presumably offset of the VGA region within fd - confirm */
    int fd; /* presumably the device fd, as in VFIOBAR - confirm */
    VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS]; /* per-region state */
} VFIOVGA;
 109
/*
 * Legacy INTx interrupt state of a device.  'pending' is set by
 * vfio_intx_interrupt() and cleared by vfio_eoi() and by the INTx/KVM
 * enable and disable paths.
 */
typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;
 120
/* One MSI or MSI-X vector; VFIODevice.msi_vectors is an array of these */
typedef struct VFIOMSIVector {
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    struct VFIODevice *vdev; /* back pointer to device */
    MSIMessage msg; /* cache the MSI message so we know when it changes */
    int virq; /* KVM irqchip route for QEMU bypass */
    bool use; /* set when the vector is enabled, cleared on release */
} VFIOMSIVector;
 128
/* Interrupt mode currently configured, stored in VFIODevice.interrupt */
enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI  = 2,
    VFIO_INT_MSIX = 3,
};
 135
/* Forward declaration, VFIOContainer keeps a list of groups */
struct VFIOGroup;

/* A VFIO container; all containers are linked on the file-scope list */
typedef struct VFIOContainer {
    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
    struct {
        /* enable abstraction to support various iommu backends */
        union {
            MemoryListener listener; /* Used by type1 iommu */
        };
        /* backend teardown hook; its callers are not visible here */
        void (*release)(struct VFIOContainer *);
    } iommu_data;
    /* groups in this container (presumably via VFIOGroup.container_next) */
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next; /* linkage on a list of containers */
} VFIOContainer;
 150
 151 /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar; /* presumably the BAR holding the vector table - confirm */
    uint8_t pba_bar; /* presumably the BAR holding the PBA - confirm */
    uint16_t entries; /* vector count; sizes msi_vectors in vfio_enable_msix() */
    uint32_t table_offset; /* presumably table offset within table_bar - confirm */
    uint32_t pba_offset; /* presumably PBA offset within pba_bar - confirm */
    MemoryRegion mmap_mem; /* extra memory region for the split BAR map */
    void *mmap; /* extra mmap for the split BAR map */
} VFIOMSIXInfo;
 161
/*
 * Per-device state.  'pdev' is the embedded PCIDevice; the PCI callbacks in
 * this file recover the VFIODevice from it with DO_UPCAST().
 */
typedef struct VFIODevice {
    PCIDevice pdev; /* embedded QEMU PCI device, see DO_UPCAST() users */
    int fd; /* device fd, target of the VFIO_DEVICE_SET_IRQS ioctls */
    VFIOINTx intx; /* legacy INTx state */
    unsigned int config_size;
    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    void *rom;
    int msi_cap_size;
    VFIOMSIVector *msi_vectors; /* array allocated on MSI/MSI-X enable */
    VFIOMSIXInfo *msix; /* cached MSI-X setup, see VFIOMSIXInfo */
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
    PCIHostDeviceAddress host; /* host domain/bus/slot/function, used in logs */
    QLIST_ENTRY(VFIODevice) next; /* linkage on a list of devices */
    struct VFIOGroup *group; /* group this device belongs to */
    EventNotifier err_notifier;
    uint32_t features; /* bitmask of the VFIO_FEATURE_* flags below */
#define VFIO_FEATURE_ENABLE_VGA_BIT 0
#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
    int32_t bootindex;
    uint8_t pm_cap;
    bool reset_works;
    bool has_vga;
    bool pci_aer;
    bool has_flr;
    bool has_pm_reset;
    bool needs_reset;
} VFIODevice;
 195
/* A VFIO group: a set of devices attached to one container */
typedef struct VFIOGroup {
    int fd; /* NOTE(review): presumably the opened group file - confirm */
    int groupid;
    VFIOContainer *container; /* container this group is attached to */
    QLIST_HEAD(, VFIODevice) device_list; /* devices in this group */
    QLIST_ENTRY(VFIOGroup) next; /* linkage on a list of groups */
    /* presumably linkage on VFIOContainer.group_list - confirm */
    QLIST_ENTRY(VFIOGroup) container_next;
} VFIOGroup;
 204
/* NOTE(review): not used in the visible part of the file; presumably the
 * byte length of the MSI-X capability - confirm at its use site. */
#define MSIX_CAP_LENGTH 12

/* File-scope list of VFIOContainer objects, initially empty */
static QLIST_HEAD(, VFIOContainer)
    container_list = QLIST_HEAD_INITIALIZER(container_list);

/* File-scope list of VFIOGroup objects, initially empty */
static QLIST_HEAD(, VFIOGroup)
    group_list = QLIST_HEAD_INITIALIZER(group_list);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif

/* Forward declarations for helpers used before their definition */
static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
 229
 230 /*
 231  * Common VFIO interrupt disable
 232  */
 233 static void vfio_disable_irqindex(VFIODevice *vdev, int index)
 234 {
 235     struct vfio_irq_set irq_set = {
 236         .argsz = sizeof(irq_set),
 237         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
 238         .index = index,
 239         .start = 0,
 240         .count = 0,
 241     };
 242
 243     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 244 }
 245
 246 /*
 247  * INTx
 248  */
 249 static void vfio_unmask_intx(VFIODevice *vdev)
 250 {
 251     struct vfio_irq_set irq_set = {
 252         .argsz = sizeof(irq_set),
 253         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
 254         .index = VFIO_PCI_INTX_IRQ_INDEX,
 255         .start = 0,
 256         .count = 1,
 257     };
 258
 259     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 260 }
 261
 262 #ifdef CONFIG_KVM /* Unused outside of CONFIG_KVM code */
 263 static void vfio_mask_intx(VFIODevice *vdev)
 264 {
 265     struct vfio_irq_set irq_set = {
 266         .argsz = sizeof(irq_set),
 267         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
 268         .index = VFIO_PCI_INTX_IRQ_INDEX,
 269         .start = 0,
 270         .count = 1,
 271     };
 272
 273     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
 274 }
 275 #endif
 276
 277 /*
 278  * Disabling BAR mmaping can be slow, but toggling it around INTx can
 279  * also be a huge overhead.  We try to get the best of both worlds by
 280  * waiting until an interrupt to disable mmaps (subsequent transitions
 281  * to the same state are effectively no overhead).  If the interrupt has
 282  * been serviced and the time gap is long enough, we re-enable mmaps for
 283  * performance.  This works well for things like graphics cards, which
 284  * may not use their interrupt at all and are penalized to an unusable
 285  * level by read/write BAR traps.  Other devices, like NICs, have more
 286  * regular interrupts and see much better latency by staying in non-mmap
 287  * mode.  We therefore set the default mmap_timeout such that a ping
 288  * is just enough to keep the mmap disabled.  Users can experiment with
 289  * other options with the x-intx-mmap-timeout-ms parameter (a value of
 290  * zero disables the timer).
 291  */
 292 static void vfio_intx_mmap_enable(void *opaque)
 293 {
 294     VFIODevice *vdev = opaque;
 295
 296     if (vdev->intx.pending) {
 297         timer_mod(vdev->intx.mmap_timer,
 298                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 299         return;
 300     }
 301
 302     vfio_mmap_set_enabled(vdev, true);
 303 }
 304
 305 static void vfio_intx_interrupt(void *opaque)
 306 {
 307     VFIODevice *vdev = opaque;
 308
 309     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
 310         return;
 311     }
 312
 313     DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
 314             vdev->host.bus, vdev->host.slot, vdev->host.function,
 315             'A' + vdev->intx.pin);
 316
 317     vdev->intx.pending = true;
 318     pci_irq_assert(&vdev->pdev);
 319     vfio_mmap_set_enabled(vdev, false);
 320     if (vdev->intx.mmap_timeout) {
 321         timer_mod(vdev->intx.mmap_timer,
 322                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 323     }
 324 }
 325
 326 static void vfio_eoi(VFIODevice *vdev)
 327 {
 328     if (!vdev->intx.pending) {
 329         return;
 330     }
 331
 332     DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
 333             vdev->host.bus, vdev->host.slot, vdev->host.function);
 334
 335     vdev->intx.pending = false;
 336     pci_irq_deassert(&vdev->pdev);
 337     vfio_unmask_intx(vdev);
 338 }
 339
 340 static void vfio_enable_intx_kvm(VFIODevice *vdev)
 341 {
 342 #ifdef CONFIG_KVM
 343     struct kvm_irqfd irqfd = {
 344         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 345         .gsi = vdev->intx.route.irq,
 346         .flags = KVM_IRQFD_FLAG_RESAMPLE,
 347     };
 348     struct vfio_irq_set *irq_set;
 349     int ret, argsz;
 350     int32_t *pfd;
 351
 352     if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
 353         vdev->intx.route.mode != PCI_INTX_ENABLED ||
 354         !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
 355         return;
 356     }
 357
 358     /* Get to a known interrupt state */
 359     qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
 360     vfio_mask_intx(vdev);
 361     vdev->intx.pending = false;
 362     pci_irq_deassert(&vdev->pdev);
 363
 364     /* Get an eventfd for resample/unmask */
 365     if (event_notifier_init(&vdev->intx.unmask, 0)) {
 366         error_report("vfio: Error: event_notifier_init failed eoi");
 367         goto fail;
 368     }
 369
 370     /* KVM triggers it, VFIO listens for it */
 371     irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
 372
 373     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 374         error_report("vfio: Error: Failed to setup resample irqfd: %m");
 375         goto fail_irqfd;
 376     }
 377
 378     argsz = sizeof(*irq_set) + sizeof(*pfd);
 379
 380     irq_set = g_malloc0(argsz);
 381     irq_set->argsz = argsz;
 382     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
 383     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 384     irq_set->start = 0;
 385     irq_set->count = 1;
 386     pfd = (int32_t *)&irq_set->data;
 387
 388     *pfd = irqfd.resamplefd;
 389
 390     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 391     g_free(irq_set);
 392     if (ret) {
 393         error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
 394         goto fail_vfio;
 395     }
 396
 397     /* Let'em rip */
 398     vfio_unmask_intx(vdev);
 399
 400     vdev->intx.kvm_accel = true;
 401
 402     DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel enabled\n",
 403             __func__, vdev->host.domain, vdev->host.bus,
 404             vdev->host.slot, vdev->host.function);
 405
 406     return;
 407
 408 fail_vfio:
 409     irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
 410     kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
 411 fail_irqfd:
 412     event_notifier_cleanup(&vdev->intx.unmask);
 413 fail:
 414     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 415     vfio_unmask_intx(vdev);
 416 #endif
 417 }
 418
 419 static void vfio_disable_intx_kvm(VFIODevice *vdev)
 420 {
 421 #ifdef CONFIG_KVM
 422     struct kvm_irqfd irqfd = {
 423         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 424         .gsi = vdev->intx.route.irq,
 425         .flags = KVM_IRQFD_FLAG_DEASSIGN,
 426     };
 427
 428     if (!vdev->intx.kvm_accel) {
 429         return;
 430     }
 431
 432     /*
 433      * Get to a known state, hardware masked, QEMU ready to accept new
 434      * interrupts, QEMU IRQ de-asserted.
 435      */
 436     vfio_mask_intx(vdev);
 437     vdev->intx.pending = false;
 438     pci_irq_deassert(&vdev->pdev);
 439
 440     /* Tell KVM to stop listening for an INTx irqfd */
 441     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 442         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
 443     }
 444
 445     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
 446     event_notifier_cleanup(&vdev->intx.unmask);
 447
 448     /* QEMU starts listening for interrupt events. */
 449     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 450
 451     vdev->intx.kvm_accel = false;
 452
 453     /* If we've missed an event, let it re-fire through QEMU */
 454     vfio_unmask_intx(vdev);
 455
 456     DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n",
 457             __func__, vdev->host.domain, vdev->host.bus,
 458             vdev->host.slot, vdev->host.function);
 459 #endif
 460 }
 461
 462 static void vfio_update_irq(PCIDevice *pdev)
 463 {
 464     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
 465     PCIINTxRoute route;
 466
 467     if (vdev->interrupt != VFIO_INT_INTx) {
 468         return;
 469     }
 470
 471     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
 472
 473     if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
 474         return; /* Nothing changed */
 475     }
 476
 477     DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
 478             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 479             vdev->host.function, vdev->intx.route.irq, route.irq);
 480
 481     vfio_disable_intx_kvm(vdev);
 482
 483     vdev->intx.route = route;
 484
 485     if (route.mode != PCI_INTX_ENABLED) {
 486         return;
 487     }
 488
 489     vfio_enable_intx_kvm(vdev);
 490
 491     /* Re-enable the interrupt in cased we missed an EOI */
 492     vfio_eoi(vdev);
 493 }
 494
 495 static int vfio_enable_intx(VFIODevice *vdev)
 496 {
 497     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
 498     int ret, argsz;
 499     struct vfio_irq_set *irq_set;
 500     int32_t *pfd;
 501
 502     if (!pin) {
 503         return 0;
 504     }
 505
 506     vfio_disable_interrupts(vdev);
 507
 508     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 509     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
 510
 511 #ifdef CONFIG_KVM
 512     /*
 513      * Only conditional to avoid generating error messages on platforms
 514      * where we won't actually use the result anyway.
 515      */
 516     if (kvm_irqfds_enabled() &&
 517         kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
 518         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
 519                                                         vdev->intx.pin);
 520     }
 521 #endif
 522
 523     ret = event_notifier_init(&vdev->intx.interrupt, 0);
 524     if (ret) {
 525         error_report("vfio: Error: event_notifier_init failed");
 526         return ret;
 527     }
 528
 529     argsz = sizeof(*irq_set) + sizeof(*pfd);
 530
 531     irq_set = g_malloc0(argsz);
 532     irq_set->argsz = argsz;
 533     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 534     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 535     irq_set->start = 0;
 536     irq_set->count = 1;
 537     pfd = (int32_t *)&irq_set->data;
 538
 539     *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
 540     qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
 541
 542     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 543     g_free(irq_set);
 544     if (ret) {
 545         error_report("vfio: Error: Failed to setup INTx fd: %m");
 546         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
 547         event_notifier_cleanup(&vdev->intx.interrupt);
 548         return -errno;
 549     }
 550
 551     vfio_enable_intx_kvm(vdev);
 552
 553     vdev->interrupt = VFIO_INT_INTx;
 554
 555     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 556             vdev->host.bus, vdev->host.slot, vdev->host.function);
 557
 558     return 0;
 559 }
 560
 561 static void vfio_disable_intx(VFIODevice *vdev)
 562 {
 563     int fd;
 564
 565     timer_del(vdev->intx.mmap_timer);
 566     vfio_disable_intx_kvm(vdev);
 567     vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
 568     vdev->intx.pending = false;
 569     pci_irq_deassert(&vdev->pdev);
 570     vfio_mmap_set_enabled(vdev, true);
 571
 572     fd = event_notifier_get_fd(&vdev->intx.interrupt);
 573     qemu_set_fd_handler(fd, NULL, NULL, vdev);
 574     event_notifier_cleanup(&vdev->intx.interrupt);
 575
 576     vdev->interrupt = VFIO_INT_NONE;
 577
 578     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 579             vdev->host.bus, vdev->host.slot, vdev->host.function);
 580 }
 581
 582 /*
 583  * MSI/X
 584  */
 585 static void vfio_msi_interrupt(void *opaque)
 586 {
 587     VFIOMSIVector *vector = opaque;
 588     VFIODevice *vdev = vector->vdev;
 589     int nr = vector - vdev->msi_vectors;
 590
 591     if (!event_notifier_test_and_clear(&vector->interrupt)) {
 592         return;
 593     }
 594
 595 #ifdef VFIO_DEBUG
 596     MSIMessage msg;
 597
 598     if (vdev->interrupt == VFIO_INT_MSIX) {
 599         msg = msi_get_message(&vdev->pdev, nr);
 600     } else if (vdev->interrupt == VFIO_INT_MSI) {
 601         msg = msix_get_message(&vdev->pdev, nr);
 602     } else {
 603         abort();
 604     }
 605
 606     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d 0x%"PRIx64"/0x%x\n", __func__,
 607             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 608             vdev->host.function, nr, msg.address, msg.data);
 609 #endif
 610
 611     if (vdev->interrupt == VFIO_INT_MSIX) {
 612         msix_notify(&vdev->pdev, nr);
 613     } else if (vdev->interrupt == VFIO_INT_MSI) {
 614         msi_notify(&vdev->pdev, nr);
 615     } else {
 616         error_report("vfio: MSI interrupt receieved, but not enabled?");
 617     }
 618 }
 619
 620 static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
 621 {
 622     struct vfio_irq_set *irq_set;
 623     int ret = 0, i, argsz;
 624     int32_t *fds;
 625
 626     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 627
 628     irq_set = g_malloc0(argsz);
 629     irq_set->argsz = argsz;
 630     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 631     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
 632     irq_set->start = 0;
 633     irq_set->count = vdev->nr_vectors;
 634     fds = (int32_t *)&irq_set->data;
 635
 636     for (i = 0; i < vdev->nr_vectors; i++) {
 637         if (!vdev->msi_vectors[i].use) {
 638             fds[i] = -1;
 639             continue;
 640         }
 641
 642         fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 643     }
 644
 645     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 646
 647     g_free(irq_set);
 648
 649     return ret;
 650 }
 651
 652 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 653                                    MSIMessage *msg, IOHandler *handler)
 654 {
 655     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
 656     VFIOMSIVector *vector;
 657     int ret;
 658
 659     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
 660             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 661             vdev->host.function, nr);
 662
 663     vector = &vdev->msi_vectors[nr];
 664     vector->vdev = vdev;
 665     vector->use = true;
 666
 667     msix_vector_use(pdev, nr);
 668
 669     if (event_notifier_init(&vector->interrupt, 0)) {
 670         error_report("vfio: Error: event_notifier_init failed");
 671     }
 672
 673     /*
 674      * Attempt to enable route through KVM irqchip,
 675      * default to userspace handling if unavailable.
 676      */
 677     vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
 678                    kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
 679     if (vector->virq < 0 ||
 680         kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
 681                                        NULL, vector->virq) < 0) {
 682         if (vector->virq >= 0) {
 683             kvm_irqchip_release_virq(kvm_state, vector->virq);
 684             vector->virq = -1;
 685         }
 686         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 687                             handler, NULL, vector);
 688     }
 689
 690     /*
 691      * We don't want to have the host allocate all possible MSI vectors
 692      * for a device if they're not in use, so we shutdown and incrementally
 693      * increase them as needed.
 694      */
 695     if (vdev->nr_vectors < nr + 1) {
 696         vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
 697         vdev->nr_vectors = nr + 1;
 698         ret = vfio_enable_vectors(vdev, true);
 699         if (ret) {
 700             error_report("vfio: failed to enable vectors, %d", ret);
 701         }
 702     } else {
 703         int argsz;
 704         struct vfio_irq_set *irq_set;
 705         int32_t *pfd;
 706
 707         argsz = sizeof(*irq_set) + sizeof(*pfd);
 708
 709         irq_set = g_malloc0(argsz);
 710         irq_set->argsz = argsz;
 711         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 712                          VFIO_IRQ_SET_ACTION_TRIGGER;
 713         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 714         irq_set->start = nr;
 715         irq_set->count = 1;
 716         pfd = (int32_t *)&irq_set->data;
 717
 718         *pfd = event_notifier_get_fd(&vector->interrupt);
 719
 720         ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 721         g_free(irq_set);
 722         if (ret) {
 723             error_report("vfio: failed to modify vector, %d", ret);
 724         }
 725     }
 726
 727     return 0;
 728 }
 729
 730 static int vfio_msix_vector_use(PCIDevice *pdev,
 731                                 unsigned int nr, MSIMessage msg)
 732 {
 733     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
 734 }
 735
 736 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 737 {
 738     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
 739     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 740     int argsz;
 741     struct vfio_irq_set *irq_set;
 742     int32_t *pfd;
 743
 744     DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
 745             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 746             vdev->host.function, nr);
 747
 748     /*
 749      * XXX What's the right thing to do here?  This turns off the interrupt
 750      * completely, but do we really just want to switch the interrupt to
 751      * bouncing through userspace and let msix.c drop it?  Not sure.
 752      */
 753     msix_vector_unuse(pdev, nr);
 754
 755     argsz = sizeof(*irq_set) + sizeof(*pfd);
 756
 757     irq_set = g_malloc0(argsz);
 758     irq_set->argsz = argsz;
 759     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 760                      VFIO_IRQ_SET_ACTION_TRIGGER;
 761     irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 762     irq_set->start = nr;
 763     irq_set->count = 1;
 764     pfd = (int32_t *)&irq_set->data;
 765
 766     *pfd = -1;
 767
 768     ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
 769
 770     g_free(irq_set);
 771
 772     if (vector->virq < 0) {
 773         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 774                             NULL, NULL, NULL);
 775     } else {
 776         kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
 777                                           vector->virq);
 778         kvm_irqchip_release_virq(kvm_state, vector->virq);
 779         vector->virq = -1;
 780     }
 781
 782     event_notifier_cleanup(&vector->interrupt);
 783     vector->use = false;
 784 }
 785
 786 static void vfio_enable_msix(VFIODevice *vdev)
 787 {
 788     vfio_disable_interrupts(vdev);
 789
 790     vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));
 791
 792     vdev->interrupt = VFIO_INT_MSIX;
 793
 794     /*
 795      * Some communication channels between VF & PF or PF & fw rely on the
 796      * physical state of the device and expect that enabling MSI-X from the
 797      * guest enables the same on the host.  When our guest is Linux, the
 798      * guest driver call to pci_enable_msix() sets the enabling bit in the
 799      * MSI-X capability, but leaves the vector table masked.  We therefore
 800      * can't rely on a vector_use callback (from request_irq() in the guest)
 801      * to switch the physical device into MSI-X mode because that may come a
 802      * long time after pci_enable_msix().  This code enables vector 0 with
 803      * triggering to userspace, then immediately release the vector, leaving
 804      * the physical device with no vectors enabled, but MSI-X enabled, just
 805      * like the guest view.
 806      */
 807     vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
 808     vfio_msix_vector_release(&vdev->pdev, 0);
 809
 810     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
 811                                   vfio_msix_vector_release, NULL)) {
 812         error_report("vfio: msix_set_vector_notifiers failed");
 813     }
 814
 815     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 816             vdev->host.bus, vdev->host.slot, vdev->host.function);
 817 }
 818
 819 static void vfio_enable_msi(VFIODevice *vdev)
 820 {
 821     int ret, i;
 822
 823     vfio_disable_interrupts(vdev);
 824
 825     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 826 retry:
 827     vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
 828
 829     for (i = 0; i < vdev->nr_vectors; i++) {
 830         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 831
 832         vector->vdev = vdev;
 833         vector->use = true;
 834
 835         if (event_notifier_init(&vector->interrupt, 0)) {
 836             error_report("vfio: Error: event_notifier_init failed");
 837         }
 838
 839         vector->msg = msi_get_message(&vdev->pdev, i);
 840
 841         /*
 842          * Attempt to enable route through KVM irqchip,
 843          * default to userspace handling if unavailable.
 844          */
 845         vector->virq = VFIO_ALLOW_KVM_MSI ?
 846                        kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
 847         if (vector->virq < 0 ||
 848             kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
 849                                            NULL, vector->virq) < 0) {
 850             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 851                                 vfio_msi_interrupt, NULL, vector);
 852         }
 853     }
 854
 855     ret = vfio_enable_vectors(vdev, false);
 856     if (ret) {
 857         if (ret < 0) {
 858             error_report("vfio: Error: Failed to setup MSI fds: %m");
 859         } else if (ret != vdev->nr_vectors) {
 860             error_report("vfio: Error: Failed to enable %d "
 861                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
 862         }
 863
 864         for (i = 0; i < vdev->nr_vectors; i++) {
 865             VFIOMSIVector *vector = &vdev->msi_vectors[i];
 866             if (vector->virq >= 0) {
 867                 kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
 868                                                   vector->virq);
 869                 kvm_irqchip_release_virq(kvm_state, vector->virq);
 870                 vector->virq = -1;
 871             } else {
 872                 qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 873                                     NULL, NULL, NULL);
 874             }
 875             event_notifier_cleanup(&vector->interrupt);
 876         }
 877
 878         g_free(vdev->msi_vectors);
 879
 880         if (ret > 0 && ret != vdev->nr_vectors) {
 881             vdev->nr_vectors = ret;
 882             goto retry;
 883         }
 884         vdev->nr_vectors = 0;
 885
 886         return;
 887     }
 888
 889     vdev->interrupt = VFIO_INT_MSI;
 890
 891     DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
 892             vdev->host.domain, vdev->host.bus, vdev->host.slot,
 893             vdev->host.function, vdev->nr_vectors);
 894 }
 895
 896 static void vfio_disable_msi_common(VFIODevice *vdev)
 897 {
 898     g_free(vdev->msi_vectors);
 899     vdev->msi_vectors = NULL;
 900     vdev->nr_vectors = 0;
 901     vdev->interrupt = VFIO_INT_NONE;
 902
 903     vfio_enable_intx(vdev);
 904 }
 905
 906 static void vfio_disable_msix(VFIODevice *vdev)
 907 {
 908     int i;
 909
 910     msix_unset_vector_notifiers(&vdev->pdev);
 911
 912     /*
 913      * MSI-X will only release vectors if MSI-X is still enabled on the
 914      * device, check through the rest and release it ourselves if necessary.
 915      */
 916     for (i = 0; i < vdev->nr_vectors; i++) {
 917         if (vdev->msi_vectors[i].use) {
 918             vfio_msix_vector_release(&vdev->pdev, i);
 919         }
 920     }
 921
 922     if (vdev->nr_vectors) {
 923         vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
 924     }
 925
 926     vfio_disable_msi_common(vdev);
 927
 928     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 929             vdev->host.bus, vdev->host.slot, vdev->host.function);
 930 }
 931
 932 static void vfio_disable_msi(VFIODevice *vdev)
 933 {
 934     int i;
 935
 936     vfio_disable_irqindex(vdev, VFIO_PCI_MSI_IRQ_INDEX);
 937
 938     for (i = 0; i < vdev->nr_vectors; i++) {
 939         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 940
 941         if (!vector->use) {
 942             continue;
 943         }
 944
 945         if (vector->virq >= 0) {
 946             kvm_irqchip_remove_irqfd_notifier(kvm_state,
 947                                               &vector->interrupt, vector->virq);
 948             kvm_irqchip_release_virq(kvm_state, vector->virq);
 949             vector->virq = -1;
 950         } else {
 951             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 952                                 NULL, NULL, NULL);
 953         }
 954
 955         event_notifier_cleanup(&vector->interrupt);
 956     }
 957
 958     vfio_disable_msi_common(vdev);
 959
 960     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
 961             vdev->host.bus, vdev->host.slot, vdev->host.function);
 962 }
 963
 964 static void vfio_update_msi(VFIODevice *vdev)
 965 {
 966     int i;
 967
 968     for (i = 0; i < vdev->nr_vectors; i++) {
 969         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 970         MSIMessage msg;
 971
 972         if (!vector->use || vector->virq < 0) {
 973             continue;
 974         }
 975
 976         msg = msi_get_message(&vdev->pdev, i);
 977
 978         if (msg.address != vector->msg.address ||
 979             msg.data != vector->msg.data) {
 980
 981             DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n",
 982                     __func__, vdev->host.domain, vdev->host.bus,
 983                     vdev->host.slot, vdev->host.function, i);
 984
 985             kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
 986             vector->msg = msg;
 987         }
 988     }
 989 }
 990
 991 /*
 992  * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 993  */
 994 static void vfio_bar_write(void *opaque, hwaddr addr,
 995                            uint64_t data, unsigned size)
 996 {
 997     VFIOBAR *bar = opaque;
 998     union {
 999         uint8_t byte;
1000         uint16_t word;
1001         uint32_t dword;
1002         uint64_t qword;
1003     } buf;
1004
1005     switch (size) {
1006     case 1:
1007         buf.byte = data;
1008         break;
1009     case 2:
1010         buf.word = cpu_to_le16(data);
1011         break;
1012     case 4:
1013         buf.dword = cpu_to_le32(data);
1014         break;
1015     default:
1016         hw_error("vfio: unsupported write size, %d bytes\n", size);
1017         break;
1018     }
1019
1020     if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
1021         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1022                      __func__, addr, data, size);
1023     }
1024
1025 #ifdef DEBUG_VFIO
1026     {
1027         VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);
1028
1029         DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"PRIx64
1030                 ", %d)\n", __func__, vdev->host.domain, vdev->host.bus,
1031                 vdev->host.slot, vdev->host.function, bar->nr, addr,
1032                 data, size);
1033     }
1034 #endif
1035
1036     /*
1037      * A read or write to a BAR always signals an INTx EOI.  This will
1038      * do nothing if not pending (including not in INTx mode).  We assume
1039      * that a BAR access is in response to an interrupt and that BAR
1040      * accesses will service the interrupt.  Unfortunately, we don't know
1041      * which access will service the interrupt, so we're potentially
1042      * getting quite a few host interrupts per guest interrupt.
1043      */
1044     vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
1045 }
1046
1047 static uint64_t vfio_bar_read(void *opaque,
1048                               hwaddr addr, unsigned size)
1049 {
1050     VFIOBAR *bar = opaque;
1051     union {
1052         uint8_t byte;
1053         uint16_t word;
1054         uint32_t dword;
1055         uint64_t qword;
1056     } buf;
1057     uint64_t data = 0;
1058
1059     if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
1060         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1061                      __func__, addr, size);
1062         return (uint64_t)-1;
1063     }
1064
1065     switch (size) {
1066     case 1:
1067         data = buf.byte;
1068         break;
1069     case 2:
1070         data = le16_to_cpu(buf.word);
1071         break;
1072     case 4:
1073         data = le32_to_cpu(buf.dword);
1074         break;
1075     default:
1076         hw_error("vfio: unsupported read size, %d bytes\n", size);
1077         break;
1078     }
1079
1080 #ifdef DEBUG_VFIO
1081     {
1082         VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar->nr]);
1083
1084         DPRINTF("%s(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx
1085                 ", %d) = 0x%"PRIx64"\n", __func__, vdev->host.domain,
1086                 vdev->host.bus, vdev->host.slot, vdev->host.function,
1087                 bar->nr, addr, size, data);
1088     }
1089 #endif
1090
1091     /* Same as write above */
1092     vfio_eoi(container_of(bar, VFIODevice, bars[bar->nr]));
1093
1094     return data;
1095 }
1096
/*
 * Memory region callbacks for BAR accesses that go through
 * vfio_bar_read()/vfio_bar_write().  The region is declared little
 * endian.
 */
static const MemoryRegionOps vfio_bar_ops = {
    .read = vfio_bar_read,
    .write = vfio_bar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1102
1103 static void vfio_pci_load_rom(VFIODevice *vdev)
1104 {
1105     struct vfio_region_info reg_info = {
1106         .argsz = sizeof(reg_info),
1107         .index = VFIO_PCI_ROM_REGION_INDEX
1108     };
1109     uint64_t size;
1110     off_t off = 0;
1111     size_t bytes;
1112
1113     if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
1114         error_report("vfio: Error getting ROM info: %m");
1115         return;
1116     }
1117
1118     DPRINTF("Device %04x:%02x:%02x.%x ROM:\n", vdev->host.domain,
1119             vdev->host.bus, vdev->host.slot, vdev->host.function);
1120     DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
1121             (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
1122             (unsigned long)reg_info.flags);
1123
1124     vdev->rom_size = size = reg_info.size;
1125     vdev->rom_offset = reg_info.offset;
1126
1127     if (!vdev->rom_size) {
1128         return;
1129     }
1130
1131     vdev->rom = g_malloc(size);
1132     memset(vdev->rom, 0xff, size);
1133
1134     while (size) {
1135         bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off);
1136         if (bytes == 0) {
1137             break;
1138         } else if (bytes > 0) {
1139             off += bytes;
1140             size -= bytes;
1141         } else {
1142             if (errno == EINTR || errno == EAGAIN) {
1143                 continue;
1144             }
1145             error_report("vfio: Error reading device ROM: %m");
1146             break;
1147         }
1148     }
1149 }
1150
1151 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
1152 {
1153     VFIODevice *vdev = opaque;
1154     uint64_t val = ((uint64_t)1 << (size * 8)) - 1;
1155
1156     /* Load the ROM lazily when the guest tries to read it */
1157     if (unlikely(!vdev->rom)) {
1158         vfio_pci_load_rom(vdev);
1159     }
1160
1161     memcpy(&val, vdev->rom + addr,
1162            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1163
1164     DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n",
1165             __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot,
1166             vdev->host.function, addr, size, val);
1167
1168     return val;
1169 }
1170
/*
 * MemoryRegionOps write handler for the ROM BAR.  Intentionally empty:
 * writes to the ROM region are discarded.
 */
static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}
1175
/*
 * Memory region callbacks for the ROM BAR: reads are served by
 * vfio_rom_read(), writes go to the empty vfio_rom_write().  The region
 * is declared little endian.
 */
static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1181
1182 static void vfio_pci_size_rom(VFIODevice *vdev)
1183 {
1184     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1185     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
1186     char name[32];
1187
1188     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1189         return;
1190     }
1191
1192     /*
1193      * Use the same size ROM BAR as the physical device.  The contents
1194      * will get filled in later when the guest tries to read it.
1195      */
1196     if (pread(vdev->fd, &orig, 4, offset) != 4 ||
1197         pwrite(vdev->fd, &size, 4, offset) != 4 ||
1198         pread(vdev->fd, &size, 4, offset) != 4 ||
1199         pwrite(vdev->fd, &orig, 4, offset) != 4) {
1200         error_report("%s(%04x:%02x:%02x.%x) failed: %m",
1201                      __func__, vdev->host.domain, vdev->host.bus,
1202                      vdev->host.slot, vdev->host.function);
1203         return;
1204     }
1205
1206     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1207
1208     if (!size) {
1209         return;
1210     }
1211
1212     DPRINTF("%04x:%02x:%02x.%x ROM size 0x%x\n", vdev->host.domain,
1213             vdev->host.bus, vdev->host.slot, vdev->host.function, size);
1214
1215     snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
1216              vdev->host.domain, vdev->host.bus, vdev->host.slot,
1217              vdev->host.function);
1218
1219     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1220                           &vfio_rom_ops, vdev, name, size);
1221
1222     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1223                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1224
1225     vdev->pdev.has_rom = true;
1226 }
1227
1228 static void vfio_vga_write(void *opaque, hwaddr addr,
1229                            uint64_t data, unsigned size)
1230 {
1231     VFIOVGARegion *region = opaque;
1232     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1233     union {
1234         uint8_t byte;
1235         uint16_t word;
1236         uint32_t dword;
1237         uint64_t qword;
1238     } buf;
1239     off_t offset = vga->fd_offset + region->offset + addr;
1240
1241     switch (size) {
1242     case 1:
1243         buf.byte = data;
1244         break;
1245     case 2:
1246         buf.word = cpu_to_le16(data);
1247         break;
1248     case 4:
1249         buf.dword = cpu_to_le32(data);
1250         break;
1251     default:
1252         hw_error("vfio: unsupported write size, %d bytes\n", size);
1253         break;
1254     }
1255
1256     if (pwrite(vga->fd, &buf, size, offset) != size) {
1257         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1258                      __func__, region->offset + addr, data, size);
1259     }
1260
1261     DPRINTF("%s(0x%"HWADDR_PRIx", 0x%"PRIx64", %d)\n",
1262             __func__, region->offset + addr, data, size);
1263 }
1264
1265 static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1266 {
1267     VFIOVGARegion *region = opaque;
1268     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1269     union {
1270         uint8_t byte;
1271         uint16_t word;
1272         uint32_t dword;
1273         uint64_t qword;
1274     } buf;
1275     uint64_t data = 0;
1276     off_t offset = vga->fd_offset + region->offset + addr;
1277
1278     if (pread(vga->fd, &buf, size, offset) != size) {
1279         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1280                      __func__, region->offset + addr, size);
1281         return (uint64_t)-1;
1282     }
1283
1284     switch (size) {
1285     case 1:
1286         data = buf.byte;
1287         break;
1288     case 2:
1289         data = le16_to_cpu(buf.word);
1290         break;
1291     case 4:
1292         data = le32_to_cpu(buf.dword);
1293         break;
1294     default:
1295         hw_error("vfio: unsupported read size, %d bytes\n", size);
1296         break;
1297     }
1298
1299     DPRINTF("%s(0x%"HWADDR_PRIx", %d) = 0x%"PRIx64"\n",
1300             __func__, region->offset + addr, size, data);
1301
1302     return data;
1303 }
1304
/*
 * Memory region callbacks for the legacy VGA regions, backed by
 * vfio_vga_read()/vfio_vga_write().  The region is declared little
 * endian.
 */
static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1310
1311 /*
1312  * Device specific quirks
1313  */
1314
1315 /* Is range1 fully contained within range2?  */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    /* Exclusive end of each range, computed in 64-bit unsigned arithmetic */
    uint64_t end1 = first1 + len1;
    uint64_t end2 = first2 + len2;

    if (first1 < first2) {
        return false;
    }

    return end1 <= end2;
}
1320
/*
 * True when 'mask' is non-zero and every bit set in 'mask' is also set
 * in 'flags'.  An empty mask never counts as enabled.
 */
static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
{
    if (mask == 0) {
        return false;
    }

    return (flags & mask) == mask;
}
1325
1326 static uint64_t vfio_generic_window_quirk_read(void *opaque,
1327                                                hwaddr addr, unsigned size)
1328 {
1329     VFIOQuirk *quirk = opaque;
1330     VFIODevice *vdev = quirk->vdev;
1331     uint64_t data;
1332
1333     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1334         ranges_overlap(addr, size,
1335                        quirk->data.data_offset, quirk->data.data_size)) {
1336         hwaddr offset = addr - quirk->data.data_offset;
1337
1338         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1339                                   quirk->data.data_size)) {
1340             hw_error("%s: window data read not fully contained: %s\n",
1341                      __func__, memory_region_name(&quirk->mem));
1342         }
1343
1344         data = vfio_pci_read_config(&vdev->pdev,
1345                                     quirk->data.address_val + offset, size);
1346
1347         DPRINTF("%s read(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"
1348                 PRIx64"\n", memory_region_name(&quirk->mem), vdev->host.domain,
1349                 vdev->host.bus, vdev->host.slot, vdev->host.function,
1350                 quirk->data.bar, addr, size, data);
1351     } else {
1352         data = vfio_bar_read(&vdev->bars[quirk->data.bar],
1353                              addr + quirk->data.base_offset, size);
1354     }
1355
1356     return data;
1357 }
1358
1359 static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
1360                                             uint64_t data, unsigned size)
1361 {
1362     VFIOQuirk *quirk = opaque;
1363     VFIODevice *vdev = quirk->vdev;
1364
1365     if (ranges_overlap(addr, size,
1366                        quirk->data.address_offset, quirk->data.address_size)) {
1367
1368         if (addr != quirk->data.address_offset) {
1369             hw_error("%s: offset write into address window: %s\n",
1370                      __func__, memory_region_name(&quirk->mem));
1371         }
1372
1373         if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
1374             quirk->data.flags |= quirk->data.write_flags |
1375                                  quirk->data.read_flags;
1376             quirk->data.address_val = data & quirk->data.address_mask;
1377         } else {
1378             quirk->data.flags &= ~(quirk->data.write_flags |
1379                                    quirk->data.read_flags);
1380         }
1381     }
1382
1383     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1384         ranges_overlap(addr, size,
1385                        quirk->data.data_offset, quirk->data.data_size)) {
1386         hwaddr offset = addr - quirk->data.data_offset;
1387
1388         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1389                                   quirk->data.data_size)) {
1390             hw_error("%s: window data write not fully contained: %s\n",
1391                      __func__, memory_region_name(&quirk->mem));
1392         }
1393
1394         vfio_pci_write_config(&vdev->pdev,
1395                               quirk->data.address_val + offset, data, size);
1396         DPRINTF("%s write(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"
1397                 PRIx64", %d)\n", memory_region_name(&quirk->mem),
1398                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1399                 vdev->host.function, quirk->data.bar, addr, data, size);
1400         return;
1401     }
1402
1403     vfio_bar_write(&vdev->bars[quirk->data.bar],
1404                    addr + quirk->data.base_offset, data, size);
1405 }
1406
/*
 * Memory region callbacks for address/data window quirks, backed by
 * vfio_generic_window_quirk_read()/vfio_generic_window_quirk_write().
 * The region is declared little endian.
 */
static const MemoryRegionOps vfio_generic_window_quirk = {
    .read = vfio_generic_window_quirk_read,
    .write = vfio_generic_window_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1412
1413 static uint64_t vfio_generic_quirk_read(void *opaque,
1414                                         hwaddr addr, unsigned size)
1415 {
1416     VFIOQuirk *quirk = opaque;
1417     VFIODevice *vdev = quirk->vdev;
1418     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1419     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1420     uint64_t data;
1421
1422     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1423         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1424         if (!vfio_range_contained(addr, size, offset,
1425                                   quirk->data.address_mask + 1)) {
1426             hw_error("%s: read not fully contained: %s\n",
1427                      __func__, memory_region_name(&quirk->mem));
1428         }
1429
1430         data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
1431
1432         DPRINTF("%s read(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", %d) = 0x%"
1433                 PRIx64"\n", memory_region_name(&quirk->mem), vdev->host.domain,
1434                 vdev->host.bus, vdev->host.slot, vdev->host.function,
1435                 quirk->data.bar, addr + base, size, data);
1436     } else {
1437         data = vfio_bar_read(&vdev->bars[quirk->data.bar], addr + base, size);
1438     }
1439
1440     return data;
1441 }
1442
1443 static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
1444                                      uint64_t data, unsigned size)
1445 {
1446     VFIOQuirk *quirk = opaque;
1447     VFIODevice *vdev = quirk->vdev;
1448     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1449     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1450
1451     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1452         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1453         if (!vfio_range_contained(addr, size, offset,
1454                                   quirk->data.address_mask + 1)) {
1455             hw_error("%s: write not fully contained: %s\n",
1456                      __func__, memory_region_name(&quirk->mem));
1457         }
1458
1459         vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
1460
1461         DPRINTF("%s write(%04x:%02x:%02x.%x:BAR%d+0x%"HWADDR_PRIx", 0x%"
1462                 PRIx64", %d)\n", memory_region_name(&quirk->mem),
1463                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
1464                 vdev->host.function, quirk->data.bar, addr + base, data, size);
1465     } else {
1466         vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
1467     }
1468 }
1469
/*
 * Memory region callbacks for fixed-offset config space mirror quirks,
 * backed by vfio_generic_quirk_read()/vfio_generic_quirk_write().  The
 * region is declared little endian.
 */
static const MemoryRegionOps vfio_generic_quirk = {
    .read = vfio_generic_quirk_read,
    .write = vfio_generic_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1475
1476 #define PCI_VENDOR_ID_ATI               0x1002
1477
1478 /*
1479  * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
1480  * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
1481  * BAR4 (older cards like the X550 used BAR1, but we don't care to support
1482  * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
1483  * I/O port BAR address.  Originally this was coded to return the virtual BAR
1484  * address only if the physical register read returns the actual BAR address,
1485  * but users have reported greater success if we return the virtual address
1486  * unconditionally.
1487  */
1488 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
1489                                         hwaddr addr, unsigned size)
1490 {
1491     VFIOQuirk *quirk = opaque;
1492     VFIODevice *vdev = quirk->vdev;
1493     uint64_t data = vfio_pci_read_config(&vdev->pdev,
1494                                          PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
1495                                          size);
1496     DPRINTF("%s(0x3c3, 1) = 0x%"PRIx64"\n", __func__, data);
1497
1498     return data;
1499 }
1500
/*
 * Memory region callbacks for the ATI 0x3c3 quirk.  Only a read callback
 * (vfio_ati_3c3_quirk_read()) is provided; no write callback is set.
 */
static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1505
1506 static void vfio_vga_probe_ati_3c3_quirk(VFIODevice *vdev)
1507 {
1508     PCIDevice *pdev = &vdev->pdev;
1509     VFIOQuirk *quirk;
1510
1511     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1512         return;
1513     }
1514
1515     /*
1516      * As long as the BAR is >= 256 bytes it will be aligned such that the
1517      * lower byte is always zero.  Filter out anything else, if it exists.
1518      */
1519     if (!vdev->bars[4].ioport || vdev->bars[4].size < 256) {
1520         return;
1521     }
1522
1523     quirk = g_malloc0(sizeof(*quirk));
1524     quirk->vdev = vdev;
1525
1526     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
1527                           "vfio-ati-3c3-quirk", 1);
1528     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1529                                 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);
1530
1531     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1532                       quirk, next);
1533
1534     DPRINTF("Enabled ATI/AMD quirk 0x3c3 BAR4for device %04x:%02x:%02x.%x\n",
1535             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1536             vdev->host.function);
1537 }
1538
1539 /*
1540  * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
1541  * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
1542  * the MMIO space directly, but a window to this space is provided through
1543  * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
1544  * data register.  When the address is programmed to a range of 0x4000-0x4fff
1545  * PCI configuration space is available.  Experimentation seems to indicate
1546  * that only read-only access is provided, but we drop writes when the window
1547  * is enabled to config space nonetheless.
1548  */
1549 static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr)
1550 {
1551     PCIDevice *pdev = &vdev->pdev;
1552     VFIOQuirk *quirk;
1553
1554     if (!vdev->has_vga || nr != 4 ||
1555         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1556         return;
1557     }
1558
1559     quirk = g_malloc0(sizeof(*quirk));
1560     quirk->vdev = vdev;
1561     quirk->data.address_size = 4;
1562     quirk->data.data_offset = 4;
1563     quirk->data.data_size = 4;
1564     quirk->data.address_match = 0x4000;
1565     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1566     quirk->data.bar = nr;
1567     quirk->data.read_flags = quirk->data.write_flags = 1;
1568
1569     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1570                           &vfio_generic_window_quirk, quirk,
1571                           "vfio-ati-bar4-window-quirk", 8);
1572     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1573                           quirk->data.base_offset, &quirk->mem, 1);
1574
1575     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1576
1577     DPRINTF("Enabled ATI/AMD BAR4 window quirk for device %04x:%02x:%02x.%x\n",
1578             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1579             vdev->host.function);
1580 }
1581
1582 /*
1583  * Trap the BAR2 MMIO window to config space as well.
1584  */
1585 static void vfio_probe_ati_bar2_4000_quirk(VFIODevice *vdev, int nr)
1586 {
1587     PCIDevice *pdev = &vdev->pdev;
1588     VFIOQuirk *quirk;
1589
1590     /* Only enable on newer devices where BAR2 is 64bit */
1591     if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
1592         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1593         return;
1594     }
1595
1596     quirk = g_malloc0(sizeof(*quirk));
1597     quirk->vdev = vdev;
1598     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1599     quirk->data.address_match = 0x4000;
1600     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1601     quirk->data.bar = nr;
1602
1603     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1604                           "vfio-ati-bar2-4000-quirk",
1605                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1606     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1607                           quirk->data.address_match & TARGET_PAGE_MASK,
1608                           &quirk->mem, 1);
1609
1610     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1611
1612     DPRINTF("Enabled ATI/AMD BAR2 0x4000 quirk for device %04x:%02x:%02x.%x\n",
1613             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1614             vdev->host.function);
1615 }
1616
1617 /*
1618  * Older ATI/AMD cards like the X550 have a similar window to that above.
1619  * I/O port BAR1 provides a window to a mirror of PCI config space located
1620  * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
1621  * note it for future reference.
1622  */
1623
1624 #define PCI_VENDOR_ID_NVIDIA                    0x10de
1625
1626 /*
1627  * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
1629  * https://github.com/pathscale/envytools/tree/master/hwdocs
1630  *
1631  * The first quirk is actually not documented in envytools and is found
1632  * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
1633  * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
1634  * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
1635  * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
1636  * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
1637  * is written for a write to 0x3d4.  The BAR0 offset is then accessible
1638  * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
1639  * that use the I/O port BAR5 window but it doesn't hurt to leave it.
1640  */
/*
 * States of the 0x3d0/0x3d4 backdoor sequence, stored in the quirk's
 * data.flags and advanced by vfio_nvidia_3d0_quirk_write().
 */
enum {
    /* Idle: no sequence in progress */
    NV_3D0_NONE = 0,
    /* 0x338 was written to the address register */
    NV_3D0_SELECT,
    /* A matching target offset was written to the data register */
    NV_3D0_WINDOW,
    /* 0x538 was written: the next data register read hits config space */
    NV_3D0_READ,
    /* 0x738 was written: the next data register write hits config space */
    NV_3D0_WRITE,
};
1648
1649 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
1650                                            hwaddr addr, unsigned size)
1651 {
1652     VFIOQuirk *quirk = opaque;
1653     VFIODevice *vdev = quirk->vdev;
1654     PCIDevice *pdev = &vdev->pdev;
1655     uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1656                                   addr + quirk->data.base_offset, size);
1657
1658     if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
1659         data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
1660         DPRINTF("%s(0x3d0, %d) = 0x%"PRIx64"\n", __func__, size, data);
1661     }
1662
1663     quirk->data.flags = NV_3D0_NONE;
1664
1665     return data;
1666 }
1667
1668 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
1669                                         uint64_t data, unsigned size)
1670 {
1671     VFIOQuirk *quirk = opaque;
1672     VFIODevice *vdev = quirk->vdev;
1673     PCIDevice *pdev = &vdev->pdev;
1674
1675     switch (quirk->data.flags) {
1676     case NV_3D0_NONE:
1677         if (addr == quirk->data.address_offset && data == 0x338) {
1678             quirk->data.flags = NV_3D0_SELECT;
1679         }
1680         break;
1681     case NV_3D0_SELECT:
1682         quirk->data.flags = NV_3D0_NONE;
1683         if (addr == quirk->data.data_offset &&
1684             (data & ~quirk->data.address_mask) == quirk->data.address_match) {
1685             quirk->data.flags = NV_3D0_WINDOW;
1686             quirk->data.address_val = data & quirk->data.address_mask;
1687         }
1688         break;
1689     case NV_3D0_WINDOW:
1690         quirk->data.flags = NV_3D0_NONE;
1691         if (addr == quirk->data.address_offset) {
1692             if (data == 0x538) {
1693                 quirk->data.flags = NV_3D0_READ;
1694             } else if (data == 0x738) {
1695                 quirk->data.flags = NV_3D0_WRITE;
1696             }
1697         }
1698         break;
1699     case NV_3D0_WRITE:
1700         quirk->data.flags = NV_3D0_NONE;
1701         if (addr == quirk->data.data_offset) {
1702             vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
1703             DPRINTF("%s(0x3d0, 0x%"PRIx64", %d)\n", __func__, data, size);
1704             return;
1705         }
1706         break;
1707     }
1708
1709     vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1710                    addr + quirk->data.base_offset, data, size);
1711 }
1712
/*
 * Memory region callbacks for the NVIDIA 0x3d0 quirk, backed by
 * vfio_nvidia_3d0_quirk_read()/vfio_nvidia_3d0_quirk_write().  The region
 * is declared little endian.
 */
static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1718
1719 static void vfio_vga_probe_nvidia_3d0_quirk(VFIODevice *vdev)
1720 {
1721     PCIDevice *pdev = &vdev->pdev;
1722     VFIOQuirk *quirk;
1723
1724     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
1725         !vdev->bars[1].size) {
1726         return;
1727     }
1728
1729     quirk = g_malloc0(sizeof(*quirk));
1730     quirk->vdev = vdev;
1731     quirk->data.base_offset = 0x10;
1732     quirk->data.address_offset = 4;
1733     quirk->data.address_size = 2;
1734     quirk->data.address_match = 0x1800;
1735     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1736     quirk->data.data_offset = 0;
1737     quirk->data.data_size = 4;
1738
1739     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
1740                           quirk, "vfio-nvidia-3d0-quirk", 6);
1741     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1742                                 quirk->data.base_offset, &quirk->mem);
1743
1744     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1745                       quirk, next);
1746
1747     DPRINTF("Enabled NVIDIA VGA 0x3d0 quirk for device %04x:%02x:%02x.%x\n",
1748             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1749             vdev->host.function);
1750 }
1751
/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
 * space, including extended space, which is available in the 4k @0x88000.
 */
/* State bits kept in the BAR5 window quirk's flags field. */
enum {
    NV_BAR5_ADDRESS = 1 << 0,
    NV_BAR5_ENABLE = 1 << 1,
    NV_BAR5_MASTER = 1 << 2,
    /* All three bits together */
    NV_BAR5_VALID = NV_BAR5_ADDRESS | NV_BAR5_ENABLE | NV_BAR5_MASTER,
};
1765
1766 static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
1767                                                 uint64_t data, unsigned size)
1768 {
1769     VFIOQuirk *quirk = opaque;
1770
1771     switch (addr) {
1772     case 0x0:
1773         if (data & 0x1) {
1774             quirk->data.flags |= NV_BAR5_MASTER;
1775         } else {
1776             quirk->data.flags &= ~NV_BAR5_MASTER;
1777         }
1778         break;
1779     case 0x4:
1780         if (data & 0x1) {
1781             quirk->data.flags |= NV_BAR5_ENABLE;
1782         } else {
1783             quirk->data.flags &= ~NV_BAR5_ENABLE;
1784         }
1785         break;
1786     case 0x8:
1787         if (quirk->data.flags & NV_BAR5_MASTER) {
1788             if ((data & ~0xfff) == 0x88000) {
1789                 quirk->data.flags |= NV_BAR5_ADDRESS;
1790                 quirk->data.address_val = data & 0xfff;
1791             } else if ((data & ~0xff) == 0x1800) {
1792                 quirk->data.flags |= NV_BAR5_ADDRESS;
1793                 quirk->data.address_val = data & 0xff;
1794             } else {
1795                 quirk->data.flags &= ~NV_BAR5_ADDRESS;
1796             }
1797         }
1798         break;
1799     }
1800
1801     vfio_generic_window_quirk_write(opaque, addr, data, size);
1802 }
1803
1804 static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
1805     .read = vfio_generic_window_quirk_read,
1806     .write = vfio_nvidia_bar5_window_quirk_write,
1807     .valid.min_access_size = 4,
1808     .endianness = DEVICE_LITTLE_ENDIAN,
1809 };
1810
1811 static void vfio_probe_nvidia_bar5_window_quirk(VFIODevice *vdev, int nr)
1812 {
1813     PCIDevice *pdev = &vdev->pdev;
1814     VFIOQuirk *quirk;
1815
1816     if (!vdev->has_vga || nr != 5 ||
1817         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1818         return;
1819     }
1820
1821     quirk = g_malloc0(sizeof(*quirk));
1822     quirk->vdev = vdev;
1823     quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
1824     quirk->data.address_offset = 0x8;
1825     quirk->data.address_size = 0; /* actually 4, but avoids generic code */
1826     quirk->data.data_offset = 0xc;
1827     quirk->data.data_size = 4;
1828     quirk->data.bar = nr;
1829
1830     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1831                           &vfio_nvidia_bar5_window_quirk, quirk,
1832                           "vfio-nvidia-bar5-window-quirk", 16);
1833     memory_region_add_subregion_overlap(&vdev->bars[nr].mem, 0, &quirk->mem, 1);
1834
1835     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1836
1837     DPRINTF("Enabled NVIDIA BAR5 window quirk for device %04x:%02x:%02x.%x\n",
1838             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1839             vdev->host.function);
1840 }
1841
1842 static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
1843                                           uint64_t data, unsigned size)
1844 {
1845     VFIOQuirk *quirk = opaque;
1846     VFIODevice *vdev = quirk->vdev;
1847     PCIDevice *pdev = &vdev->pdev;
1848     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1849
1850     vfio_generic_quirk_write(opaque, addr, data, size);
1851
1852     /*
1853      * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
1854      * MSI capability ID register.  Both the ID and next register are
1855      * read-only, so we allow writes covering either of those to real hw.
1856      * NB - only fixed for the 0x88000 MMIO window.
1857      */
1858     if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
1859         vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
1860         vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
1861     }
1862 }
1863
1864 static const MemoryRegionOps vfio_nvidia_88000_quirk = {
1865     .read = vfio_generic_quirk_read,
1866     .write = vfio_nvidia_88000_quirk_write,
1867     .endianness = DEVICE_LITTLE_ENDIAN,
1868 };
1869
1870 /*
1871  * Finally, BAR0 itself.  We want to redirect any accesses to either
1872  * 0x1800 or 0x88000 through the PCI config space access functions.
1873  *
1874  * NB - quirk at a page granularity or else they don't seem to work when
1875  *      BARs are mmap'd
1876  *
1877  * Here's offset 0x88000...
1878  */
1879 static void vfio_probe_nvidia_bar0_88000_quirk(VFIODevice *vdev, int nr)
1880 {
1881     PCIDevice *pdev = &vdev->pdev;
1882     VFIOQuirk *quirk;
1883
1884     if (!vdev->has_vga || nr != 0 ||
1885         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1886         return;
1887     }
1888
1889     quirk = g_malloc0(sizeof(*quirk));
1890     quirk->vdev = vdev;
1891     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1892     quirk->data.address_match = 0x88000;
1893     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1894     quirk->data.bar = nr;
1895
1896     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
1897                           quirk, "vfio-nvidia-bar0-88000-quirk",
1898                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1899     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1900                           quirk->data.address_match & TARGET_PAGE_MASK,
1901                           &quirk->mem, 1);
1902
1903     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1904
1905     DPRINTF("Enabled NVIDIA BAR0 0x88000 quirk for device %04x:%02x:%02x.%x\n",
1906             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1907             vdev->host.function);
1908 }
1909
1910 /*
1911  * And here's the same for BAR0 offset 0x1800...
1912  */
1913 static void vfio_probe_nvidia_bar0_1800_quirk(VFIODevice *vdev, int nr)
1914 {
1915     PCIDevice *pdev = &vdev->pdev;
1916     VFIOQuirk *quirk;
1917
1918     if (!vdev->has_vga || nr != 0 ||
1919         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1920         return;
1921     }
1922
1923     /* Log the chipset ID */
1924     DPRINTF("Nvidia NV%02x\n",
1925             (unsigned int)(vfio_bar_read(&vdev->bars[0], 0, 4) >> 20) & 0xff);
1926
1927     quirk = g_malloc0(sizeof(*quirk));
1928     quirk->vdev = vdev;
1929     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1930     quirk->data.address_match = 0x1800;
1931     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1932     quirk->data.bar = nr;
1933
1934     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1935                           "vfio-nvidia-bar0-1800-quirk",
1936                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1937     memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
1938                           quirk->data.address_match & TARGET_PAGE_MASK,
1939                           &quirk->mem, 1);
1940
1941     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1942
1943     DPRINTF("Enabled NVIDIA BAR0 0x1800 quirk for device %04x:%02x:%02x.%x\n",
1944             vdev->host.domain, vdev->host.bus, vdev->host.slot,
1945             vdev->host.function);
1946 }
1947
1948 /*
1949  * TODO - Some Nvidia devices provide config access to their companion HDA
1950  * device and even to their parent bridge via these config space mirrors.
1951  * Add quirks for those regions.
1952  */
1953
1954 /*
1955  * Common quirk probe entry points.
1956  */
/*
 * Run every VGA region quirk probe for @vdev.  Each probe decides for
 * itself whether it applies to the device.
 */
static void vfio_vga_quirk_setup(VFIODevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}
1962
1963 static void vfio_vga_quirk_teardown(VFIODevice *vdev)
1964 {
1965     int i;
1966
1967     for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1968         while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
1969             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
1970             memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
1971             memory_region_destroy(&quirk->mem);
1972             QLIST_REMOVE(quirk, next);
1973             g_free(quirk);
1974         }
1975     }
1976 }
1977
/*
 * Run every BAR quirk probe for BAR @nr of @vdev.  Each probe decides for
 * itself whether it applies to this device and BAR.
 */
static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr)
{
    vfio_probe_ati_bar4_window_quirk(vdev, nr);
    vfio_probe_ati_bar2_4000_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
}
1986
1987 static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
1988 {
1989     VFIOBAR *bar = &vdev->bars[nr];
1990
1991     while (!QLIST_EMPTY(&bar->quirks)) {
1992         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1993         memory_region_del_subregion(&bar->mem, &quirk->mem);
1994         memory_region_destroy(&quirk->mem);
1995         QLIST_REMOVE(quirk, next);
1996         g_free(quirk);
1997     }
1998 }
1999
2000 /*
2001  * PCI config space
2002  */
2003 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
2004 {
2005     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
2006     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
2007
2008     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
2009     emu_bits = le32_to_cpu(emu_bits);
2010
2011     if (emu_bits) {
2012         emu_val = pci_default_read_config(pdev, addr, len);
2013     }
2014
2015     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
2016         ssize_t ret;
2017
2018         ret = pread(vdev->fd, &phys_val, len, vdev->config_offset + addr);
2019         if (ret != len) {
2020             error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
2021                          __func__, vdev->host.domain, vdev->host.bus,
2022                          vdev->host.slot, vdev->host.function, addr, len);
2023             return -errno;
2024         }
2025         phys_val = le32_to_cpu(phys_val);
2026     }
2027
2028     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
2029
2030     DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
2031             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2032             vdev->host.function, addr, len, val);
2033
2034     return val;
2035 }
2036
2037 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
2038                                   uint32_t val, int len)
2039 {
2040     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
2041     uint32_t val_le = cpu_to_le32(val);
2042
2043     DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
2044             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2045             vdev->host.function, addr, val, len);
2046
2047     /* Write everything to VFIO, let it filter out what we can't write */
2048     if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
2049         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
2050                      __func__, vdev->host.domain, vdev->host.bus,
2051                      vdev->host.slot, vdev->host.function, addr, val, len);
2052     }
2053
2054     /* MSI/MSI-X Enabling/Disabling */
2055     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
2056         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
2057         int is_enabled, was_enabled = msi_enabled(pdev);
2058
2059         pci_default_write_config(pdev, addr, val, len);
2060
2061         is_enabled = msi_enabled(pdev);
2062
2063         if (!was_enabled) {
2064             if (is_enabled) {
2065                 vfio_enable_msi(vdev);
2066             }
2067         } else {
2068             if (!is_enabled) {
2069                 vfio_disable_msi(vdev);
2070             } else {
2071                 vfio_update_msi(vdev);
2072             }
2073         }
2074     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
2075         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
2076         int is_enabled, was_enabled = msix_enabled(pdev);
2077
2078         pci_default_write_config(pdev, addr, val, len);
2079
2080         is_enabled = msix_enabled(pdev);
2081
2082         if (!was_enabled && is_enabled) {
2083             vfio_enable_msix(vdev);
2084         } else if (was_enabled && !is_enabled) {
2085             vfio_disable_msix(vdev);
2086         }
2087     } else {
2088         /* Write everything to QEMU to keep emulated bits correct */
2089         pci_default_write_config(pdev, addr, val, len);
2090     }
2091 }
2092
2093 /*
2094  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
2095  */
2096 static int vfio_dma_unmap(VFIOContainer *container,
2097                           hwaddr iova, ram_addr_t size)
2098 {
2099     struct vfio_iommu_type1_dma_unmap unmap = {
2100         .argsz = sizeof(unmap),
2101         .flags = 0,
2102         .iova = iova,
2103         .size = size,
2104     };
2105
2106     if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
2107         DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
2108         return -errno;
2109     }
2110
2111     return 0;
2112 }
2113
2114 static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
2115                         ram_addr_t size, void *vaddr, bool readonly)
2116 {
2117     struct vfio_iommu_type1_dma_map map = {
2118         .argsz = sizeof(map),
2119         .flags = VFIO_DMA_MAP_FLAG_READ,
2120         .vaddr = (__u64)(uintptr_t)vaddr,
2121         .iova = iova,
2122         .size = size,
2123     };
2124
2125     if (!readonly) {
2126         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
2127     }
2128
2129     /*
2130      * Try the mapping, if it fails with EBUSY, unmap the region and try
2131      * again.  This shouldn't be necessary, but we sometimes see it in
2132      * the the VGA ROM space.
2133      */
2134     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
2135         (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
2136          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
2137         return 0;
2138     }
2139
2140     DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
2141     return -errno;
2142 }
2143
2144 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
2145 {
2146     return !memory_region_is_ram(section->mr);
2147 }
2148
2149 static void vfio_listener_region_add(MemoryListener *listener,
2150                                      MemoryRegionSection *section)
2151 {
2152     VFIOContainer *container = container_of(listener, VFIOContainer,
2153                                             iommu_data.listener);
2154     hwaddr iova, end;
2155     void *vaddr;
2156     int ret;
2157
2158     assert(!memory_region_is_iommu(section->mr));
2159
2160     if (vfio_listener_skipped_section(section)) {
2161         DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
2162                 section->offset_within_address_space,
2163                 section->offset_within_address_space +
2164                 int128_get64(int128_sub(section->size, int128_one())));
2165         return;
2166     }
2167
2168     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
2169                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
2170         error_report("%s received unaligned region", __func__);
2171         return;
2172     }
2173
2174     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
2175     end = (section->offset_within_address_space + int128_get64(section->size)) &
2176           TARGET_PAGE_MASK;
2177
2178     if (iova >= end) {
2179         return;
2180     }
2181
2182     vaddr = memory_region_get_ram_ptr(section->mr) +
2183             section->offset_within_region +
2184             (iova - section->offset_within_address_space);
2185
2186     DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
2187             iova, end - 1, vaddr);
2188
2189     memory_region_ref(section->mr);
2190     ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
2191     if (ret) {
2192         error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
2193                      "0x%"HWADDR_PRIx", %p) = %d (%m)",
2194                      container, iova, end - iova, vaddr, ret);
2195     }
2196 }
2197
2198 static void vfio_listener_region_del(MemoryListener *listener,
2199                                      MemoryRegionSection *section)
2200 {
2201     VFIOContainer *container = container_of(listener, VFIOContainer,
2202                                             iommu_data.listener);
2203     hwaddr iova, end;
2204     int ret;
2205
2206     if (vfio_listener_skipped_section(section)) {
2207         DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
2208                 section->offset_within_address_space,
2209                 section->offset_within_address_space +
2210                 int128_get64(int128_sub(section->size, int128_one())));
2211         return;
2212     }
2213
2214     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
2215                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
2216         error_report("%s received unaligned region", __func__);
2217         return;
2218     }
2219
2220     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
2221     end = (section->offset_within_address_space + int128_get64(section->size)) &
2222           TARGET_PAGE_MASK;
2223
2224     if (iova >= end) {
2225         return;
2226     }
2227
2228     DPRINTF("region_del %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
2229             iova, end - 1);
2230
2231     ret = vfio_dma_unmap(container, iova, end - iova);
2232     memory_region_unref(section->mr);
2233     if (ret) {
2234         error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
2235                      "0x%"HWADDR_PRIx") = %d (%m)",
2236                      container, iova, end - iova, ret);
2237     }
2238 }
2239
2240 static MemoryListener vfio_memory_listener = {
2241     .region_add = vfio_listener_region_add,
2242     .region_del = vfio_listener_region_del,
2243 };
2244
2245 static void vfio_listener_release(VFIOContainer *container)
2246 {
2247     memory_listener_unregister(&container->iommu_data.listener);
2248 }
2249
2250 /*
2251  * Interrupt setup
2252  */
2253 static void vfio_disable_interrupts(VFIODevice *vdev)
2254 {
2255     switch (vdev->interrupt) {
2256     case VFIO_INT_INTx:
2257         vfio_disable_intx(vdev);
2258         break;
2259     case VFIO_INT_MSI:
2260         vfio_disable_msi(vdev);
2261         break;
2262     case VFIO_INT_MSIX:
2263         vfio_disable_msix(vdev);
2264         break;
2265     }
2266 }
2267
2268 static int vfio_setup_msi(VFIODevice *vdev, int pos)
2269 {
2270     uint16_t ctrl;
2271     bool msi_64bit, msi_maskbit;
2272     int ret, entries;
2273
2274     if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2275               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2276         return -errno;
2277     }
2278     ctrl = le16_to_cpu(ctrl);
2279
2280     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
2281     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
2282     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
2283
2284     DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
2285             vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
2286
2287     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
2288     if (ret < 0) {
2289         if (ret == -ENOTSUP) {
2290             return 0;
2291         }
2292         error_report("vfio: msi_init failed");
2293         return ret;
2294     }
2295     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
2296
2297     return 0;
2298 }
2299
2300 /*
2301  * We don't have any control over how pci_add_capability() inserts
2302  * capabilities into the chain.  In order to setup MSI-X we need a
2303  * MemoryRegion for the BAR.  In order to setup the BAR and not
2304  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
2305  * need to first look for where the MSI-X table lives.  So we
2306  * unfortunately split MSI-X setup across two functions.
2307  */
2308 static int vfio_early_setup_msix(VFIODevice *vdev)
2309 {
2310     uint8_t pos;
2311     uint16_t ctrl;
2312     uint32_t table, pba;
2313
2314     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
2315     if (!pos) {
2316         return 0;
2317     }
2318
2319     if (pread(vdev->fd, &ctrl, sizeof(ctrl),
2320               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2321         return -errno;
2322     }
2323
2324     if (pread(vdev->fd, &table, sizeof(table),
2325               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
2326         return -errno;
2327     }
2328
2329     if (pread(vdev->fd, &pba, sizeof(pba),
2330               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
2331         return -errno;
2332     }
2333
2334     ctrl = le16_to_cpu(ctrl);
2335     table = le32_to_cpu(table);
2336     pba = le32_to_cpu(pba);
2337
2338     vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
2339     vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
2340     vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
2341     vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
2342     vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
2343     vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
2344
2345     DPRINTF("%04x:%02x:%02x.%x "
2346             "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
2347             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2348             vdev->host.function, pos, vdev->msix->table_bar,
2349             vdev->msix->table_offset, vdev->msix->entries);
2350
2351     return 0;
2352 }
2353
2354 static int vfio_setup_msix(VFIODevice *vdev, int pos)
2355 {
2356     int ret;
2357
2358     ret = msix_init(&vdev->pdev, vdev->msix->entries,
2359                     &vdev->bars[vdev->msix->table_bar].mem,
2360                     vdev->msix->table_bar, vdev->msix->table_offset,
2361                     &vdev->bars[vdev->msix->pba_bar].mem,
2362                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
2363     if (ret < 0) {
2364         if (ret == -ENOTSUP) {
2365             return 0;
2366         }
2367         error_report("vfio: msix_init failed");
2368         return ret;
2369     }
2370
2371     return 0;
2372 }
2373
2374 static void vfio_teardown_msi(VFIODevice *vdev)
2375 {
2376     msi_uninit(&vdev->pdev);
2377
2378     if (vdev->msix) {
2379         msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
2380                     &vdev->bars[vdev->msix->pba_bar].mem);
2381     }
2382 }
2383
2384 /*
2385  * Resource setup
2386  */
2387 static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
2388 {
2389     int i;
2390
2391     for (i = 0; i < PCI_ROM_SLOT; i++) {
2392         VFIOBAR *bar = &vdev->bars[i];
2393
2394         if (!bar->size) {
2395             continue;
2396         }
2397
2398         memory_region_set_enabled(&bar->mmap_mem, enabled);
2399         if (vdev->msix && vdev->msix->table_bar == i) {
2400             memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
2401         }
2402     }
2403 }
2404
2405 static void vfio_unmap_bar(VFIODevice *vdev, int nr)
2406 {
2407     VFIOBAR *bar = &vdev->bars[nr];
2408
2409     if (!bar->size) {
2410         return;
2411     }
2412
2413     vfio_bar_quirk_teardown(vdev, nr);
2414
2415     memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
2416     munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
2417     memory_region_destroy(&bar->mmap_mem);
2418
2419     if (vdev->msix && vdev->msix->table_bar == nr) {
2420         memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
2421         munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
2422         memory_region_destroy(&vdev->msix->mmap_mem);
2423     }
2424
2425     memory_region_destroy(&bar->mem);
2426 }
2427
2428 static int vfio_mmap_bar(VFIODevice *vdev, VFIOBAR *bar,
2429                          MemoryRegion *mem, MemoryRegion *submem,
2430                          void **map, size_t size, off_t offset,
2431                          const char *name)
2432 {
2433     int ret = 0;
2434
2435     if (VFIO_ALLOW_MMAP && size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
2436         int prot = 0;
2437
2438         if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
2439             prot |= PROT_READ;
2440         }
2441
2442         if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
2443             prot |= PROT_WRITE;
2444         }
2445
2446         *map = mmap(NULL, size, prot, MAP_SHARED,
2447                     bar->fd, bar->fd_offset + offset);
2448         if (*map == MAP_FAILED) {
2449             *map = NULL;
2450             ret = -errno;
2451             goto empty_region;
2452         }
2453
2454         memory_region_init_ram_ptr(submem, OBJECT(vdev), name, size, *map);
2455     } else {
2456 empty_region:
2457         /* Create a zero sized sub-region to make cleanup easy. */
2458         memory_region_init(submem, OBJECT(vdev), name, 0);
2459     }
2460
2461     memory_region_add_subregion(mem, offset, submem);
2462
2463     return ret;
2464 }
2465
2466 static void vfio_map_bar(VFIODevice *vdev, int nr)
2467 {
2468     VFIOBAR *bar = &vdev->bars[nr];
2469     unsigned size = bar->size;
2470     char name[64];
2471     uint32_t pci_bar;
2472     uint8_t type;
2473     int ret;
2474
2475     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
2476     if (!size) {
2477         return;
2478     }
2479
2480     snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
2481              vdev->host.domain, vdev->host.bus, vdev->host.slot,
2482              vdev->host.function, nr);
2483
2484     /* Determine what type of BAR this is for registration */
2485     ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
2486                 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
2487     if (ret != sizeof(pci_bar)) {
2488         error_report("vfio: Failed to read BAR %d (%m)", nr);
2489         return;
2490     }
2491
2492     pci_bar = le32_to_cpu(pci_bar);
2493     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
2494     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
2495     type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
2496                                     ~PCI_BASE_ADDRESS_MEM_MASK);
2497
2498     /* A "slow" read/write mapping underlies all BARs */
2499     memory_region_init_io(&bar->mem, OBJECT(vdev), &vfio_bar_ops,
2500                           bar, name, size);
2501     pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
2502
2503     /*
2504      * We can't mmap areas overlapping the MSIX vector table, so we
2505      * potentially insert a direct-mapped subregion before and after it.
2506      */
2507     if (vdev->msix && vdev->msix->table_bar == nr) {
2508         size = vdev->msix->table_offset & TARGET_PAGE_MASK;
2509     }
2510
2511     strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
2512     if (vfio_mmap_bar(vdev, bar, &bar->mem,
2513                       &bar->mmap_mem, &bar->mmap, size, 0, name)) {
2514         error_report("%s unsupported. Performance may be slow", name);
2515     }
2516
2517     if (vdev->msix && vdev->msix->table_bar == nr) {
2518         unsigned start;
2519
2520         start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
2521                                   (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
2522
2523         size = start < bar->size ? bar->size - start : 0;
2524         strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
2525         /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
2526         if (vfio_mmap_bar(vdev, bar, &bar->mem, &vdev->msix->mmap_mem,
2527                           &vdev->msix->mmap, size, start, name)) {
2528             error_report("%s unsupported. Performance may be slow", name);
2529         }
2530     }
2531
2532     vfio_bar_quirk_setup(vdev, nr);
2533 }
2534
2535 static void vfio_map_bars(VFIODevice *vdev)
2536 {
2537     int i;
2538
2539     for (i = 0; i < PCI_ROM_SLOT; i++) {
2540         vfio_map_bar(vdev, i);
2541     }
2542
2543     if (vdev->has_vga) {
2544         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2545                               OBJECT(vdev), &vfio_vga_ops,
2546                               &vdev->vga.region[QEMU_PCI_VGA_MEM],
2547                               "vfio-vga-mmio@0xa0000",
2548                               QEMU_PCI_VGA_MEM_SIZE);
2549         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2550                               OBJECT(vdev), &vfio_vga_ops,
2551                               &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
2552                               "vfio-vga-io@0x3b0",
2553                               QEMU_PCI_VGA_IO_LO_SIZE);
2554         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
2555                               OBJECT(vdev), &vfio_vga_ops,
2556                               &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
2557                               "vfio-vga-io@0x3c0",
2558                               QEMU_PCI_VGA_IO_HI_SIZE);
2559
2560         pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2561                          &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2562                          &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2563         vfio_vga_quirk_setup(vdev);
2564     }
2565 }
2566
2567 static void vfio_unmap_bars(VFIODevice *vdev)
2568 {
2569     int i;
2570
2571     for (i = 0; i < PCI_ROM_SLOT; i++) {
2572         vfio_unmap_bar(vdev, i);
2573     }
2574
2575     if (vdev->has_vga) {
2576         vfio_vga_quirk_teardown(vdev);
2577         pci_unregister_vga(&vdev->pdev);
2578         memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem);
2579         memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem);
2580         memory_region_destroy(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2581     }
2582 }
2583
2584 /*
2585  * General setup
2586  */
2587 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2588 {
2589     uint8_t tmp, next = 0xff;
2590
2591     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2592          tmp = pdev->config[tmp + 1]) {
2593         if (tmp > pos && tmp < next) {
2594             next = tmp;
2595         }
2596     }
2597
2598     return next - pos;
2599 }
2600
/*
 * Clear the bits of @mask in the 16-bit word stored at @buf, then OR in
 * @val.  Bits of @val outside @mask are not filtered.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t current = pci_get_word(buf);

    pci_set_word(buf, (current & ~mask) | val);
}
2605
2606 static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
2607                                    uint16_t val, uint16_t mask)
2608 {
2609     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2610     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2611     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2612 }
2613
/*
 * Clear the bits of @mask in the 32-bit value stored at @buf, then OR in
 * @val.  Bits of @val outside @mask are not filtered.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t current = pci_get_long(buf);

    pci_set_long(buf, (current & ~mask) | val);
}
2618
2619 static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
2620                                    uint32_t val, uint32_t mask)
2621 {
2622     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2623     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2624     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2625 }
2626
2627 static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
2628 {
2629     uint16_t flags;
2630     uint8_t type;
2631
2632     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
2633     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
2634
2635     if (type != PCI_EXP_TYPE_ENDPOINT &&
2636         type != PCI_EXP_TYPE_LEG_END &&
2637         type != PCI_EXP_TYPE_RC_END) {
2638
2639         error_report("vfio: Assignment of PCIe type 0x%x "
2640                      "devices is not currently supported", type);
2641         return -EINVAL;
2642     }
2643
2644     if (!pci_bus_is_express(vdev->pdev.bus)) {
2645         /*
2646          * Use express capability as-is on PCI bus.  It doesn't make much
2647          * sense to even expose, but some drivers (ex. tg3) depend on it
2648          * and guests don't seem to be particular about it.  We'll need
2649          * to revist this or force express devices to express buses if we
2650          * ever expose an IOMMU to the guest.
2651          */
2652     } else if (pci_bus_is_root(vdev->pdev.bus)) {
2653         /*
2654          * On a Root Complex bus Endpoints become Root Complex Integrated
2655          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
2656          */
2657         if (type == PCI_EXP_TYPE_ENDPOINT) {
2658             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2659                                    PCI_EXP_TYPE_RC_END << 4,
2660                                    PCI_EXP_FLAGS_TYPE);
2661
2662             /* Link Capabilities, Status, and Control goes away */
2663             if (size > PCI_EXP_LNKCTL) {
2664                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
2665                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2666                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
2667
2668 #ifndef PCI_EXP_LNKCAP2
2669 #define PCI_EXP_LNKCAP2 44
2670 #endif
2671 #ifndef PCI_EXP_LNKSTA2
2672 #define PCI_EXP_LNKSTA2 50
2673 #endif
2674                 /* Link 2 Capabilities, Status, and Control goes away */
2675                 if (size > PCI_EXP_LNKCAP2) {
2676                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
2677                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
2678                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
2679                 }
2680             }
2681
2682         } else if (type == PCI_EXP_TYPE_LEG_END) {
2683             /*
2684              * Legacy endpoints don't belong on the root complex.  Windows
2685              * seems to be happier with devices if we skip the capability.
2686              */
2687             return 0;
2688         }
2689
2690     } else {
2691         /*
2692          * Convert Root Complex Integrated Endpoints to regular endpoints.
2693          * These devices don't support LNK/LNK2 capabilities, so make them up.
2694          */
2695         if (type == PCI_EXP_TYPE_RC_END) {
2696             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2697                                    PCI_EXP_TYPE_ENDPOINT << 4,
2698                                    PCI_EXP_FLAGS_TYPE);
2699             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
2700                                    PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
2701             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2702         }
2703
2704         /* Mark the Link Status bits as emulated to allow virtual negotiation */
2705         vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
2706                                pci_get_word(vdev->pdev.config + pos +
2707                                             PCI_EXP_LNKSTA),
2708                                PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
2709     }
2710
2711     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
2712     if (pos >= 0) {
2713         vdev->pdev.exp.exp_cap = pos;
2714     }
2715
2716     return pos;
2717 }
2718
2719 static void vfio_check_pcie_flr(VFIODevice *vdev, uint8_t pos)
2720 {
2721     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2722
2723     if (cap & PCI_EXP_DEVCAP_FLR) {
2724         DPRINTF("%04x:%02x:%02x.%x Supports FLR via PCIe cap\n",
2725                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2726                 vdev->host.function);
2727         vdev->has_flr = true;
2728     }
2729 }
2730
2731 static void vfio_check_pm_reset(VFIODevice *vdev, uint8_t pos)
2732 {
2733     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2734
2735     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2736         DPRINTF("%04x:%02x:%02x.%x Supports PM reset\n",
2737                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2738                 vdev->host.function);
2739         vdev->has_pm_reset = true;
2740     }
2741 }
2742
2743 static void vfio_check_af_flr(VFIODevice *vdev, uint8_t pos)
2744 {
2745     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2746
2747     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2748         DPRINTF("%04x:%02x:%02x.%x Supports FLR via AF cap\n",
2749                 vdev->host.domain, vdev->host.bus, vdev->host.slot,
2750                 vdev->host.function);
2751         vdev->has_flr = true;
2752     }
2753 }
2754
2755 static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
2756 {
2757     PCIDevice *pdev = &vdev->pdev;
2758     uint8_t cap_id, next, size;
2759     int ret;
2760
2761     cap_id = pdev->config[pos];
2762     next = pdev->config[pos + 1];
2763
2764     /*
2765      * If it becomes important to configure capabilities to their actual
2766      * size, use this as the default when it's something we don't recognize.
2767      * Since QEMU doesn't actually handle many of the config accesses,
2768      * exact size doesn't seem worthwhile.
2769      */
2770     size = vfio_std_cap_max_size(pdev, pos);
2771
2772     /*
2773      * pci_add_capability always inserts the new capability at the head
2774      * of the chain.  Therefore to end up with a chain that matches the
2775      * physical device, we insert from the end by making this recursive.
2776      * This is also why we pre-caclulate size above as cached config space
2777      * will be changed as we unwind the stack.
2778      */
2779     if (next) {
2780         ret = vfio_add_std_cap(vdev, next);
2781         if (ret) {
2782             return ret;
2783         }
2784     } else {
2785         /* Begin the rebuild, use QEMU emulated list bits */
2786         pdev->config[PCI_CAPABILITY_LIST] = 0;
2787         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2788         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2789     }
2790
2791     /* Use emulated next pointer to allow dropping caps */
2792     pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
2793
2794     switch (cap_id) {
2795     case PCI_CAP_ID_MSI:
2796         ret = vfio_setup_msi(vdev, pos);
2797         break;
2798     case PCI_CAP_ID_EXP:
2799         vfio_check_pcie_flr(vdev, pos);
2800         ret = vfio_setup_pcie_cap(vdev, pos, size);
2801         break;
2802     case PCI_CAP_ID_MSIX:
2803         ret = vfio_setup_msix(vdev, pos);
2804         break;
2805     case PCI_CAP_ID_PM:
2806         vfio_check_pm_reset(vdev, pos);
2807         vdev->pm_cap = pos;
2808         ret = pci_add_capability(pdev, cap_id, pos, size);
2809         break;
2810     case PCI_CAP_ID_AF:
2811         vfio_check_af_flr(vdev, pos);
2812         ret = pci_add_capability(pdev, cap_id, pos, size);
2813         break;
2814     default:
2815         ret = pci_add_capability(pdev, cap_id, pos, size);
2816         break;
2817     }
2818
2819     if (ret < 0) {
2820         error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
2821                      "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
2822                      vdev->host.bus, vdev->host.slot, vdev->host.function,
2823                      cap_id, size, pos, ret);
2824         return ret;
2825     }
2826
2827     return 0;
2828 }
2829
2830 static int vfio_add_capabilities(VFIODevice *vdev)
2831 {
2832     PCIDevice *pdev = &vdev->pdev;
2833
2834     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2835         !pdev->config[PCI_CAPABILITY_LIST]) {
2836         return 0; /* Nothing to add */
2837     }
2838
2839     return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
2840 }
2841
2842 static void vfio_pci_pre_reset(VFIODevice *vdev)
2843 {
2844     PCIDevice *pdev = &vdev->pdev;
2845     uint16_t cmd;
2846
2847     vfio_disable_interrupts(vdev);
2848
2849     /* Make sure the device is in D0 */
2850     if (vdev->pm_cap) {
2851         uint16_t pmcsr;
2852         uint8_t state;
2853
2854         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2855         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2856         if (state) {
2857             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2858             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2859             /* vfio handles the necessary delay here */
2860             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2861             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2862             if (state) {
2863                 error_report("vfio: Unable to power on device, stuck in D%d\n",
2864                              state);
2865             }
2866         }
2867     }
2868
2869     /*
2870      * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
2871      * Also put INTx Disable in known state.
2872      */
2873     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2874     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2875              PCI_COMMAND_INTX_DISABLE);
2876     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2877 }
2878
2879 static void vfio_pci_post_reset(VFIODevice *vdev)
2880 {
2881     vfio_enable_intx(vdev);
2882 }
2883
2884 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
2885                                 PCIHostDeviceAddress *host2)
2886 {
2887     return (host1->domain == host2->domain && host1->bus == host2->bus &&
2888             host1->slot == host2->slot && host1->function == host2->function);
2889 }
2890
2891 static int vfio_pci_hot_reset(VFIODevice *vdev, bool single)
2892 {
2893     VFIOGroup *group;
2894     struct vfio_pci_hot_reset_info *info;
2895     struct vfio_pci_dependent_device *devices;
2896     struct vfio_pci_hot_reset *reset;
2897     int32_t *fds;
2898     int ret, i, count;
2899     bool multi = false;
2900
2901     DPRINTF("%s(%04x:%02x:%02x.%x) %s\n", __func__, vdev->host.domain,
2902             vdev->host.bus, vdev->host.slot, vdev->host.function,
2903             single ? "one" : "multi");
2904
2905     vfio_pci_pre_reset(vdev);
2906     vdev->needs_reset = false;
2907
2908     info = g_malloc0(sizeof(*info));
2909     info->argsz = sizeof(*info);
2910
2911     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2912     if (ret && errno != ENOSPC) {
2913         ret = -errno;
2914         if (!vdev->has_pm_reset) {
2915             error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
2916                          "no available reset mechanism.", vdev->host.domain,
2917                          vdev->host.bus, vdev->host.slot, vdev->host.function);
2918         }
2919         goto out_single;
2920     }
2921
2922     count = info->count;
2923     info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
2924     info->argsz = sizeof(*info) + (count * sizeof(*devices));
2925     devices = &info->devices[0];
2926
2927     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2928     if (ret) {
2929         ret = -errno;
2930         error_report("vfio: hot reset info failed: %m");
2931         goto out_single;
2932     }
2933
2934     DPRINTF("%04x:%02x:%02x.%x: hot reset dependent devices:\n",
2935             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2936             vdev->host.function);
2937
2938     /* Verify that we have all the groups required */
2939     for (i = 0; i < info->count; i++) {
2940         PCIHostDeviceAddress host;
2941         VFIODevice *tmp;
2942
2943         host.domain = devices[i].segment;
2944         host.bus = devices[i].bus;
2945         host.slot = PCI_SLOT(devices[i].devfn);
2946         host.function = PCI_FUNC(devices[i].devfn);
2947
2948         DPRINTF("\t%04x:%02x:%02x.%x group %d\n", host.domain,
2949                 host.bus, host.slot, host.function, devices[i].group_id);
2950
2951         if (vfio_pci_host_match(&host, &vdev->host)) {
2952             continue;
2953         }
2954
2955         QLIST_FOREACH(group, &group_list, next) {
2956             if (group->groupid == devices[i].group_id) {
2957                 break;
2958             }
2959         }
2960
2961         if (!group) {
2962             if (!vdev->has_pm_reset) {
2963                 error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
2964                              "depends on group %d which is not owned.",
2965                              vdev->host.domain, vdev->host.bus, vdev->host.slot,
2966                              vdev->host.function, devices[i].group_id);
2967             }
2968             ret = -EPERM;
2969             goto out;
2970         }
2971
2972         /* Prep dependent devices for reset and clear our marker. */
2973         QLIST_FOREACH(tmp, &group->device_list, next) {
2974             if (vfio_pci_host_match(&host, &tmp->host)) {
2975                 if (single) {
2976                     DPRINTF("vfio: found another in-use device "
2977                             "%04x:%02x:%02x.%x\n", host.domain, host.bus,
2978                             host.slot, host.function);
2979                     ret = -EINVAL;
2980                     goto out_single;
2981                 }
2982                 vfio_pci_pre_reset(tmp);
2983                 tmp->needs_reset = false;
2984                 multi = true;
2985                 break;
2986             }
2987         }
2988     }
2989
2990     if (!single && !multi) {
2991         DPRINTF("vfio: No other in-use devices for multi hot reset\n");
2992         ret = -EINVAL;
2993         goto out_single;
2994     }
2995
2996     /* Determine how many group fds need to be passed */
2997     count = 0;
2998     QLIST_FOREACH(group, &group_list, next) {
2999         for (i = 0; i < info->count; i++) {
3000             if (group->groupid == devices[i].group_id) {
3001                 count++;
3002                 break;
3003             }
3004         }
3005     }
3006
3007     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
3008     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
3009     fds = &reset->group_fds[0];
3010
3011     /* Fill in group fds */
3012     QLIST_FOREACH(group, &group_list, next) {
3013         for (i = 0; i < info->count; i++) {
3014             if (group->groupid == devices[i].group_id) {
3015                 fds[reset->count++] = group->fd;
3016                 break;
3017             }
3018         }
3019     }
3020
3021     /* Bus reset! */
3022     ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
3023     g_free(reset);
3024
3025     DPRINTF("%04x:%02x:%02x.%x hot reset: %s\n", vdev->host.domain,
3026             vdev->host.bus, vdev->host.slot, vdev->host.function,
3027             ret ? "%m" : "Success");
3028
3029 out:
3030     /* Re-enable INTx on affected devices */
3031     for (i = 0; i < info->count; i++) {
3032         PCIHostDeviceAddress host;
3033         VFIODevice *tmp;
3034
3035         host.domain = devices[i].segment;
3036         host.bus = devices[i].bus;
3037         host.slot = PCI_SLOT(devices[i].devfn);
3038         host.function = PCI_FUNC(devices[i].devfn);
3039
3040         if (vfio_pci_host_match(&host, &vdev->host)) {
3041             continue;
3042         }
3043
3044         QLIST_FOREACH(group, &group_list, next) {
3045             if (group->groupid == devices[i].group_id) {
3046                 break;
3047             }
3048         }
3049
3050         if (!group) {
3051             break;
3052         }
3053
3054         QLIST_FOREACH(tmp, &group->device_list, next) {
3055             if (vfio_pci_host_match(&host, &tmp->host)) {
3056                 vfio_pci_post_reset(tmp);
3057                 break;
3058             }
3059         }
3060     }
3061 out_single:
3062     vfio_pci_post_reset(vdev);
3063     g_free(info);
3064
3065     return ret;
3066 }
3067
3068 /*
 * We want to differentiate hot reset of multiple in-use devices vs hot reset
3070  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
3071  * of doing hot resets when there is only a single device per bus.  The in-use
3072  * here refers to how many VFIODevices are affected.  A hot reset that affects
3073  * multiple devices, but only a single in-use device, means that we can call
3074  * it from our bus ->reset() callback since the extent is effectively a single
3075  * device.  This allows us to make use of it in the hotplug path.  When there
3076  * are multiple in-use devices, we can only trigger the hot reset during a
3077  * system reset and thus from our reset handler.  We separate _one vs _multi
3078  * here so that we don't overlap and do a double reset on the system reset
3079  * path where both our reset handler and ->reset() callback are used.  Calling
3080  * _one() will only do a hot reset for the one in-use devices case, calling
3081  * _multi() will do nothing if a _one() would have been sufficient.
3082  */
3083 static int vfio_pci_hot_reset_one(VFIODevice *vdev)
3084 {
3085     return vfio_pci_hot_reset(vdev, true);
3086 }
3087
3088 static int vfio_pci_hot_reset_multi(VFIODevice *vdev)
3089 {
3090     return vfio_pci_hot_reset(vdev, false);
3091 }
3092
3093 static void vfio_pci_reset_handler(void *opaque)
3094 {
3095     VFIOGroup *group;
3096     VFIODevice *vdev;
3097
3098     QLIST_FOREACH(group, &group_list, next) {
3099         QLIST_FOREACH(vdev, &group->device_list, next) {
3100             if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
3101                 vdev->needs_reset = true;
3102             }
3103         }
3104     }
3105
3106     QLIST_FOREACH(group, &group_list, next) {
3107         QLIST_FOREACH(vdev, &group->device_list, next) {
3108             if (vdev->needs_reset) {
3109                 vfio_pci_hot_reset_multi(vdev);
3110             }
3111         }
3112     }
3113 }
3114
3115 static void vfio_kvm_device_add_group(VFIOGroup *group)
3116 {
3117 #ifdef CONFIG_KVM
3118     struct kvm_device_attr attr = {
3119         .group = KVM_DEV_VFIO_GROUP,
3120         .attr = KVM_DEV_VFIO_GROUP_ADD,
3121         .addr = (uint64_t)(unsigned long)&group->fd,
3122     };
3123
3124     if (!kvm_enabled()) {
3125         return;
3126     }
3127
3128     if (vfio_kvm_device_fd < 0) {
3129         struct kvm_create_device cd = {
3130             .type = KVM_DEV_TYPE_VFIO,
3131         };
3132
3133         if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
3134             DPRINTF("KVM_CREATE_DEVICE: %m\n");
3135             return;
3136         }
3137
3138         vfio_kvm_device_fd = cd.fd;
3139     }
3140
3141     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
3142         error_report("Failed to add group %d to KVM VFIO device: %m",
3143                      group->groupid);
3144     }
3145 #endif
3146 }
3147
3148 static void vfio_kvm_device_del_group(VFIOGroup *group)
3149 {
3150 #ifdef CONFIG_KVM
3151     struct kvm_device_attr attr = {
3152         .group = KVM_DEV_VFIO_GROUP,
3153         .attr = KVM_DEV_VFIO_GROUP_DEL,
3154         .addr = (uint64_t)(unsigned long)&group->fd,
3155     };
3156
3157     if (vfio_kvm_device_fd < 0) {
3158         return;
3159     }
3160
3161     if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
3162         error_report("Failed to remove group %d to KVM VFIO device: %m",
3163                      group->groupid);
3164     }
3165 #endif
3166 }
3167
3168 static int vfio_connect_container(VFIOGroup *group)
3169 {
3170     VFIOContainer *container;
3171     int ret, fd;
3172
3173     if (group->container) {
3174         return 0;
3175     }
3176
3177     QLIST_FOREACH(container, &container_list, next) {
3178         if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
3179             group->container = container;
3180             QLIST_INSERT_HEAD(&container->group_list, group, container_next);
3181             return 0;
3182         }
3183     }
3184
3185     fd = qemu_open("/dev/vfio/vfio", O_RDWR);
3186     if (fd < 0) {
3187         error_report("vfio: failed to open /dev/vfio/vfio: %m");
3188         return -errno;
3189     }
3190
3191     ret = ioctl(fd, VFIO_GET_API_VERSION);
3192     if (ret != VFIO_API_VERSION) {
3193         error_report("vfio: supported vfio version: %d, "
3194                      "reported version: %d", VFIO_API_VERSION, ret);
3195         close(fd);
3196         return -EINVAL;
3197     }
3198
3199     container = g_malloc0(sizeof(*container));
3200     container->fd = fd;
3201
3202     if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
3203         ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
3204         if (ret) {
3205             error_report("vfio: failed to set group container: %m");
3206             g_free(container);
3207             close(fd);
3208             return -errno;
3209         }
3210
3211         ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
3212         if (ret) {
3213             error_report("vfio: failed to set iommu for container: %m");
3214             g_free(container);
3215             close(fd);
3216             return -errno;
3217         }
3218
3219         container->iommu_data.listener = vfio_memory_listener;
3220         container->iommu_data.release = vfio_listener_release;
3221
3222         memory_listener_register(&container->iommu_data.listener, &address_space_memory);
3223     } else {
3224         error_report("vfio: No available IOMMU models");
3225         g_free(container);
3226         close(fd);
3227         return -EINVAL;
3228     }
3229
3230     QLIST_INIT(&container->group_list);
3231     QLIST_INSERT_HEAD(&container_list, container, next);
3232
3233     group->container = container;
3234     QLIST_INSERT_HEAD(&container->group_list, group, container_next);
3235
3236     return 0;
3237 }
3238
3239 static void vfio_disconnect_container(VFIOGroup *group)
3240 {
3241     VFIOContainer *container = group->container;
3242
3243     if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
3244         error_report("vfio: error disconnecting group %d from container",
3245                      group->groupid);
3246     }
3247
3248     QLIST_REMOVE(group, container_next);
3249     group->container = NULL;
3250
3251     if (QLIST_EMPTY(&container->group_list)) {
3252         if (container->iommu_data.release) {
3253             container->iommu_data.release(container);
3254         }
3255         QLIST_REMOVE(container, next);
3256         DPRINTF("vfio_disconnect_container: close container->fd\n");
3257         close(container->fd);
3258         g_free(container);
3259     }
3260 }
3261
3262 static VFIOGroup *vfio_get_group(int groupid)
3263 {
3264     VFIOGroup *group;
3265     char path[32];
3266     struct vfio_group_status status = { .argsz = sizeof(status) };
3267
3268     QLIST_FOREACH(group, &group_list, next) {
3269         if (group->groupid == groupid) {
3270             return group;
3271         }
3272     }
3273
3274     group = g_malloc0(sizeof(*group));
3275
3276     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
3277     group->fd = qemu_open(path, O_RDWR);
3278     if (group->fd < 0) {
3279         error_report("vfio: error opening %s: %m", path);
3280         g_free(group);
3281         return NULL;
3282     }
3283
3284     if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
3285         error_report("vfio: error getting group status: %m");
3286         close(group->fd);
3287         g_free(group);
3288         return NULL;
3289     }
3290
3291     if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
3292         error_report("vfio: error, group %d is not viable, please ensure "
3293                      "all devices within the iommu_group are bound to their "
3294                      "vfio bus driver.", groupid);
3295         close(group->fd);
3296         g_free(group);
3297         return NULL;
3298     }
3299
3300     group->groupid = groupid;
3301     QLIST_INIT(&group->device_list);
3302
3303     if (vfio_connect_container(group)) {
3304         error_report("vfio: failed to setup container for group %d", groupid);
3305         close(group->fd);
3306         g_free(group);
3307         return NULL;
3308     }
3309
3310     if (QLIST_EMPTY(&group_list)) {
3311         qemu_register_reset(vfio_pci_reset_handler, NULL);
3312     }
3313
3314     QLIST_INSERT_HEAD(&group_list, group, next);
3315
3316     vfio_kvm_device_add_group(group);
3317
3318     return group;
3319 }
3320
3321 static void vfio_put_group(VFIOGroup *group)
3322 {
3323     if (!QLIST_EMPTY(&group->device_list)) {
3324         return;
3325     }
3326
3327     vfio_kvm_device_del_group(group);
3328     vfio_disconnect_container(group);
3329     QLIST_REMOVE(group, next);
3330     DPRINTF("vfio_put_group: close group->fd\n");
3331     close(group->fd);
3332     g_free(group);
3333
3334     if (QLIST_EMPTY(&group_list)) {
3335         qemu_unregister_reset(vfio_pci_reset_handler, NULL);
3336     }
3337 }
3338
3339 static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
3340 {
3341     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
3342     struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
3343     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
3344     int ret, i;
3345
3346     ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
3347     if (ret < 0) {
3348         error_report("vfio: error getting device %s from group %d: %m",
3349                      name, group->groupid);
3350         error_printf("Verify all devices in group %d are bound to vfio-pci "
3351                      "or pci-stub and not already in use\n", group->groupid);
3352         return ret;
3353     }
3354
3355     vdev->fd = ret;
3356     vdev->group = group;
3357     QLIST_INSERT_HEAD(&group->device_list, vdev, next);
3358
3359     /* Sanity check device */
3360     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
3361     if (ret) {
3362         error_report("vfio: error getting device info: %m");
3363         goto error;
3364     }
3365
3366     DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
3367             dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
3368
3369     if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
3370         error_report("vfio: Um, this isn't a PCI device");
3371         goto error;
3372     }
3373
3374     vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
3375
3376     if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
3377         error_report("vfio: unexpected number of io regions %u",
3378                      dev_info.num_regions);
3379         goto error;
3380     }
3381
3382     if (dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
3383         error_report("vfio: unexpected number of irqs %u", dev_info.num_irqs);
3384         goto error;
3385     }
3386
3387     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
3388         reg_info.index = i;
3389
3390         ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3391         if (ret) {
3392             error_report("vfio: Error getting region %d info: %m", i);
3393             goto error;
3394         }
3395
3396         DPRINTF("Device %s region %d:\n", name, i);
3397         DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
3398                 (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
3399                 (unsigned long)reg_info.flags);
3400
3401         vdev->bars[i].flags = reg_info.flags;
3402         vdev->bars[i].size = reg_info.size;
3403         vdev->bars[i].fd_offset = reg_info.offset;
3404         vdev->bars[i].fd = vdev->fd;
3405         vdev->bars[i].nr = i;
3406         QLIST_INIT(&vdev->bars[i].quirks);
3407     }
3408
3409     reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
3410
3411     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3412     if (ret) {
3413         error_report("vfio: Error getting config info: %m");
3414         goto error;
3415     }
3416
3417     DPRINTF("Device %s config:\n", name);
3418     DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
3419             (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
3420             (unsigned long)reg_info.flags);
3421
3422     vdev->config_size = reg_info.size;
3423     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
3424         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
3425     }
3426     vdev->config_offset = reg_info.offset;
3427
3428     if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
3429         dev_info.num_regions > VFIO_PCI_VGA_REGION_INDEX) {
3430         struct vfio_region_info vga_info = {
3431             .argsz = sizeof(vga_info),
3432             .index = VFIO_PCI_VGA_REGION_INDEX,
3433          };
3434
3435         ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
3436         if (ret) {
3437             error_report(
3438                 "vfio: Device does not support requested feature x-vga");
3439             goto error;
3440         }
3441
3442         if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
3443             !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
3444             vga_info.size < 0xbffff + 1) {
3445             error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
3446                          (unsigned long)vga_info.flags,
3447                          (unsigned long)vga_info.size);
3448             goto error;
3449         }
3450
3451         vdev->vga.fd_offset = vga_info.offset;
3452         vdev->vga.fd = vdev->fd;
3453
3454         vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
3455         vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
3456         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
3457
3458         vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
3459         vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
3460         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
3461
3462         vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
3463         vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
3464         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
3465
3466         vdev->has_vga = true;
3467     }
3468     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
3469
3470     ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
3471     if (ret) {
3472         /* This can fail for an old kernel or legacy PCI dev */
3473         DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
3474         ret = 0;
3475     } else if (irq_info.count == 1) {
3476         vdev->pci_aer = true;
3477     } else {
3478         error_report("vfio: %04x:%02x:%02x.%x "
3479                      "Could not enable error recovery for the device",
3480                      vdev->host.domain, vdev->host.bus, vdev->host.slot,
3481                      vdev->host.function);
3482     }
3483
3484 error:
3485     if (ret) {
3486         QLIST_REMOVE(vdev, next);
3487         vdev->group = NULL;
3488         close(vdev->fd);
3489     }
3490     return ret;
3491 }
3492
3493 static void vfio_put_device(VFIODevice *vdev)
3494 {
3495     QLIST_REMOVE(vdev, next);
3496     vdev->group = NULL;
3497     DPRINTF("vfio_put_device: close vdev->fd\n");
3498     close(vdev->fd);
3499     if (vdev->msix) {
3500         g_free(vdev->msix);
3501         vdev->msix = NULL;
3502     }
3503 }
3504
3505 static void vfio_err_notifier_handler(void *opaque)
3506 {
3507     VFIODevice *vdev = opaque;
3508
3509     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3510         return;
3511     }
3512
3513     /*
3514      * TBD. Retrieve the error details and decide what action
3515      * needs to be taken. One of the actions could be to pass
3516      * the error to the guest and have the guest driver recover
3517      * from the error. This requires that PCIe capabilities be
3518      * exposed to the guest. For now, we just terminate the
3519      * guest to contain the error.
3520      */
3521
3522     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
3523                  "Please collect any data possible and then kill the guest",
3524                  __func__, vdev->host.domain, vdev->host.bus,
3525                  vdev->host.slot, vdev->host.function);
3526
3527     vm_stop(RUN_STATE_IO_ERROR);
3528 }
3529
3530 /*
3531  * Registers error notifier for devices supporting error recovery.
3532  * If we encounter a failure in this function, we report an error
3533  * and continue after disabling error recovery support for the
3534  * device.
3535  */
3536 static void vfio_register_err_notifier(VFIODevice *vdev)
3537 {
3538     int ret;
3539     int argsz;
3540     struct vfio_irq_set *irq_set;
3541     int32_t *pfd;
3542
3543     if (!vdev->pci_aer) {
3544         return;
3545     }
3546
3547     if (event_notifier_init(&vdev->err_notifier, 0)) {
3548         error_report("vfio: Unable to init event notifier for error detection");
3549         vdev->pci_aer = false;
3550         return;
3551     }
3552
3553     argsz = sizeof(*irq_set) + sizeof(*pfd);
3554
3555     irq_set = g_malloc0(argsz);
3556     irq_set->argsz = argsz;
3557     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3558                      VFIO_IRQ_SET_ACTION_TRIGGER;
3559     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3560     irq_set->start = 0;
3561     irq_set->count = 1;
3562     pfd = (int32_t *)&irq_set->data;
3563
3564     *pfd = event_notifier_get_fd(&vdev->err_notifier);
3565     qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
3566
3567     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
3568     if (ret) {
3569         error_report("vfio: Failed to set up error notification");
3570         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3571         event_notifier_cleanup(&vdev->err_notifier);
3572         vdev->pci_aer = false;
3573     }
3574     g_free(irq_set);
3575 }
3576
3577 static void vfio_unregister_err_notifier(VFIODevice *vdev)
3578 {
3579     int argsz;
3580     struct vfio_irq_set *irq_set;
3581     int32_t *pfd;
3582     int ret;
3583
3584     if (!vdev->pci_aer) {
3585         return;
3586     }
3587
3588     argsz = sizeof(*irq_set) + sizeof(*pfd);
3589
3590     irq_set = g_malloc0(argsz);
3591     irq_set->argsz = argsz;
3592     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3593                      VFIO_IRQ_SET_ACTION_TRIGGER;
3594     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3595     irq_set->start = 0;
3596     irq_set->count = 1;
3597     pfd = (int32_t *)&irq_set->data;
3598     *pfd = -1;
3599
3600     ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
3601     if (ret) {
3602         error_report("vfio: Failed to de-assign error fd: %m");
3603     }
3604     g_free(irq_set);
3605     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3606                         NULL, NULL, vdev);
3607     event_notifier_cleanup(&vdev->err_notifier);
3608 }
3609
3610 static int vfio_initfn(PCIDevice *pdev)
3611 {
3612     VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
3613     VFIOGroup *group;
3614     char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
3615     ssize_t len;
3616     struct stat st;
3617     int groupid;
3618     int ret;
3619
3620     /* Check that the host device exists */
3621     snprintf(path, sizeof(path),
3622              "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
3623              vdev->host.domain, vdev->host.bus, vdev->host.slot,
3624              vdev->host.function);
3625     if (stat(path, &st) < 0) {
3626         error_report("vfio: error: no such host device: %s", path);
3627         return -errno;
3628     }
3629
3630     strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
3631
3632     len = readlink(path, iommu_group_path, PATH_MAX);
3633     if (len <= 0) {
3634         error_report("vfio: error no iommu_group for device");
3635         return -errno;
3636     }
3637
3638     iommu_group_path[len] = 0;
3639     group_name = basename(iommu_group_path);
3640
3641     if (sscanf(group_name, "%d", &groupid) != 1) {
3642         error_report("vfio: error reading %s: %m", path);
3643         return -errno;
3644     }
3645
3646     DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
3647             vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
3648
3649     group = vfio_get_group(groupid);
3650     if (!group) {
3651         error_report("vfio: failed to get group %d", groupid);
3652         return -ENOENT;
3653     }
3654
3655     snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
3656             vdev->host.domain, vdev->host.bus, vdev->host.slot,
3657             vdev->host.function);
3658
3659     QLIST_FOREACH(pvdev, &group->device_list, next) {
3660         if (pvdev->host.domain == vdev->host.domain &&
3661             pvdev->host.bus == vdev->host.bus &&
3662             pvdev->host.slot == vdev->host.slot &&
3663             pvdev->host.function == vdev->host.function) {
3664
3665             error_report("vfio: error: device %s is already attached", path);
3666             vfio_put_group(group);
3667             return -EBUSY;
3668         }
3669     }
3670
3671     ret = vfio_get_device(group, path, vdev);
3672     if (ret) {
3673         error_report("vfio: failed to get device %s", path);
3674         vfio_put_group(group);
3675         return ret;
3676     }
3677
3678     /* Get a copy of config space */
3679     ret = pread(vdev->fd, vdev->pdev.config,
3680                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3681                 vdev->config_offset);
3682     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3683         ret = ret < 0 ? -errno : -EFAULT;
3684         error_report("vfio: Failed to read device config space");
3685         goto out_put;
3686     }
3687
3688     /* vfio emulates a lot for us, but some bits need extra love */
3689     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3690
3691     /* QEMU can choose to expose the ROM or not */
3692     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3693
3694     /* QEMU can change multi-function devices to single function, or reverse */
3695     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3696                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
3697
3698     /* Restore or clear multifunction, this is always controlled by QEMU */
3699     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3700         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3701     } else {
3702         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3703     }
3704
3705     /*
3706      * Clear host resource mapping info.  If we choose not to register a
3707      * BAR, such as might be the case with the option ROM, we can get
3708      * confusing, unwritable, residual addresses from the host here.
3709      */
3710     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3711     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3712
3713     vfio_pci_size_rom(vdev);
3714
3715     ret = vfio_early_setup_msix(vdev);
3716     if (ret) {
3717         goto out_put;
3718     }
3719
3720     vfio_map_bars(vdev);
3721
3722     ret = vfio_add_capabilities(vdev);
3723     if (ret) {
3724         goto out_teardown;
3725     }
3726
3727     /* QEMU emulates all of MSI & MSIX */
3728     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3729         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3730                MSIX_CAP_LENGTH);
3731     }
3732
3733     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3734         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3735                vdev->msi_cap_size);
3736     }
3737
3738     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3739         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3740                                                   vfio_intx_mmap_enable, vdev);
3741         pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
3742         ret = vfio_enable_intx(vdev);
3743         if (ret) {
3744             goto out_teardown;
3745         }
3746     }
3747
3748     add_boot_device_path(vdev->bootindex, &pdev->qdev, NULL);
3749     vfio_register_err_notifier(vdev);
3750
3751     return 0;
3752
3753 out_teardown:
3754     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3755     vfio_teardown_msi(vdev);
3756     vfio_unmap_bars(vdev);
3757 out_put:
3758     g_free(vdev->emulated_config_bits);
3759     vfio_put_device(vdev);
3760     vfio_put_group(group);
3761     return ret;
3762 }
3763
3764 static void vfio_exitfn(PCIDevice *pdev)
3765 {
3766     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
3767     VFIOGroup *group = vdev->group;
3768
3769     vfio_unregister_err_notifier(vdev);
3770     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3771     vfio_disable_interrupts(vdev);
3772     if (vdev->intx.mmap_timer) {
3773         timer_free(vdev->intx.mmap_timer);
3774     }
3775     vfio_teardown_msi(vdev);
3776     vfio_unmap_bars(vdev);
3777     g_free(vdev->emulated_config_bits);
3778     g_free(vdev->rom);
3779     vfio_put_device(vdev);
3780     vfio_put_group(group);
3781 }
3782
3783 static void vfio_pci_reset(DeviceState *dev)
3784 {
3785     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
3786     VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
3787
3788     DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
3789             vdev->host.bus, vdev->host.slot, vdev->host.function);
3790
3791     vfio_pci_pre_reset(vdev);
3792
3793     if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
3794         !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
3795         DPRINTF("%04x:%02x:%02x.%x FLR/VFIO_DEVICE_RESET\n", vdev->host.domain,
3796             vdev->host.bus, vdev->host.slot, vdev->host.function);
3797         goto post_reset;
3798     }
3799
3800     /* See if we can do our own bus reset */
3801     if (!vfio_pci_hot_reset_one(vdev)) {
3802         goto post_reset;
3803     }
3804
3805     /* If nothing else works and the device supports PM reset, use it */
3806     if (vdev->reset_works && vdev->has_pm_reset &&
3807         !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
3808         DPRINTF("%04x:%02x:%02x.%x PCI PM Reset\n", vdev->host.domain,
3809             vdev->host.bus, vdev->host.slot, vdev->host.function);
3810         goto post_reset;
3811     }
3812
3813 post_reset:
3814     vfio_pci_post_reset(vdev);
3815 }
3816
3817 static Property vfio_pci_dev_properties[] = {
3818     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
3819     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIODevice,
3820                        intx.mmap_timeout, 1100),
3821     DEFINE_PROP_BIT("x-vga", VFIODevice, features,
3822                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3823     DEFINE_PROP_INT32("bootindex", VFIODevice, bootindex, -1),
3824     /*
3825      * TODO - support passed fds... is this necessary?
3826      * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
3827      * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
3828      */
3829     DEFINE_PROP_END_OF_LIST(),
3830 };
3831
3832 static const VMStateDescription vfio_pci_vmstate = {
3833     .name = "vfio-pci",
3834     .unmigratable = 1,
3835 };
3836
3837 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3838 {
3839     DeviceClass *dc = DEVICE_CLASS(klass);
3840     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3841
3842     dc->reset = vfio_pci_reset;
3843     dc->props = vfio_pci_dev_properties;
3844     dc->vmsd = &vfio_pci_vmstate;
3845     dc->desc = "VFIO-based PCI device assignment";
3846     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3847     pdc->init = vfio_initfn;
3848     pdc->exit = vfio_exitfn;
3849     pdc->config_read = vfio_pci_read_config;
3850     pdc->config_write = vfio_pci_write_config;
3851     pdc->is_express = 1; /* We might be */
3852 }
3853
3854 static const TypeInfo vfio_pci_dev_info = {
3855     .name = "vfio-pci",
3856     .parent = TYPE_PCI_DEVICE,
3857     .instance_size = sizeof(VFIODevice),
3858     .class_init = vfio_pci_dev_class_init,
3859 };
3860
/* Register the "vfio-pci" type described by vfio_pci_dev_info. */
static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

/* Hand the registration function above to the type_init() hook */
type_init(register_vfio_pci_dev_type)