hw/vfio/pci.c

   1 /*
   2  * vfio based device assignment support
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <alex.williamson@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19  */
  20
  21 #include <linux/vfio.h>
  22 #include <sys/ioctl.h>
  23 #include <sys/mman.h>
  24 #include <sys/stat.h>
  25 #include <sys/types.h>
  26 #include <unistd.h>
  27
  28 #include "config.h"
  29 #include "exec/address-spaces.h"
  30 #include "exec/memory.h"
  31 #include "hw/pci/msi.h"
  32 #include "hw/pci/msix.h"
  33 #include "hw/pci/pci.h"
  34 #include "qemu-common.h"
  35 #include "qemu/error-report.h"
  36 #include "qemu/event_notifier.h"
  37 #include "qemu/queue.h"
  38 #include "qemu/range.h"
  39 #include "sysemu/kvm.h"
  40 #include "sysemu/sysemu.h"
  41 #include "trace.h"
  42 #include "hw/vfio/vfio.h"
  43 #include "hw/vfio/vfio-common.h"
  44
  45 struct VFIOPCIDevice;
  46
/*
 * One quirk attached to a region; quirks are chained on the per-region
 * lists declared in VFIOBAR and VFIOVGARegion.
 *
 * NOTE(review): the code that fills in and consumes "data" is outside this
 * view, so the per-field notes below are inferred from the field names
 * only and should be confirmed against the quirk handlers.
 */
typedef struct VFIOQuirk {
    MemoryRegion mem;
    struct VFIOPCIDevice *vdev; /* back pointer to device */
    QLIST_ENTRY(VFIOQuirk) next; /* linkage in a region's quirk list */
    struct {
        /* Offsets are bit-fields limited to TARGET_PAGE_BITS bits */
        uint32_t base_offset:TARGET_PAGE_BITS;
        uint32_t address_offset:TARGET_PAGE_BITS;
        uint32_t address_size:3;
        uint32_t bar:3;

        /* presumably a match value/mask pair applied to an address - confirm */
        uint32_t address_match;
        uint32_t address_mask;

        uint32_t address_val:TARGET_PAGE_BITS;
        uint32_t data_offset:TARGET_PAGE_BITS;
        uint32_t data_size:3;

        uint8_t flags;
        uint8_t read_flags;
        uint8_t write_flags;
    } data;
} VFIOQuirk;
  69
/*
 * State kept for one BAR of an assigned device (VFIOPCIDevice.bars[]):
 * the underlying VFIO region plus the quirks attached to it.
 */
typedef struct VFIOBAR {
    VFIORegion region;
    bool ioport; /* presumably set for I/O port BARs - confirm at setup site */
    bool mem64; /* presumably set for 64-bit memory BARs - confirm */
    QLIST_HEAD(, VFIOQuirk) quirks; /* quirks attached to this BAR */
} VFIOBAR;
  76
/* One VGA range of an assigned device, with the quirks attached to it. */
typedef struct VFIOVGARegion {
    MemoryRegion mem;
    off_t offset;
    int nr;
    QLIST_HEAD(, VFIOQuirk) quirks; /* quirks attached to this VGA range */
} VFIOVGARegion;
  83
/*
 * VGA state of an assigned device: a file descriptor/offset pair and one
 * VFIOVGARegion per QEMU VGA range.
 */
typedef struct VFIOVGA {
    off_t fd_offset;
    int fd;
    VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
} VFIOVGA;
  89
/* Legacy INTx interrupt state of an assigned device (VFIOPCIDevice.intx). */
typedef struct VFIOINTx {
    bool pending; /* interrupt pending */
    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
    uint8_t pin; /* which pin to pull for qemu_set_irq */
    EventNotifier interrupt; /* eventfd triggered on interrupt */
    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
    PCIINTxRoute route; /* routing info for QEMU bypass */
    uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
} VFIOINTx;
 100
/* State of a single MSI or MSI-X vector (element of msi_vectors[]). */
typedef struct VFIOMSIVector {
    /*
     * Two interrupt paths are configured per vector.  The first is only used
     * for interrupts injected via QEMU.  This is typically the non-accel path,
     * but may also be used when we want QEMU to handle masking and pending
     * bits.  The KVM path bypasses QEMU and is therefore higher performance,
     * but requires masking at the device.  virq is used to track the MSI route
     * through KVM, thus kvm_interrupt is only available when virq is set to a
     * valid (>= 0) value.
     */
    EventNotifier interrupt;
    EventNotifier kvm_interrupt;
    struct VFIOPCIDevice *vdev; /* back pointer to device */
    int virq; /* KVM MSI route, -1 when the KVM path is not set up */
    bool use; /* vector has been initialized and is in use */
} VFIOMSIVector;
 117
/* Interrupt mode currently configured for a device (VFIOPCIDevice.interrupt) */
enum {
    VFIO_INT_NONE = 0, /* no interrupt mode enabled */
    VFIO_INT_INTx = 1, /* legacy INTx */
    VFIO_INT_MSI  = 2, /* MSI */
    VFIO_INT_MSIX = 3, /* MSI-X */
};
 124
/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
    uint8_t table_bar; /* presumably the BAR holding the vector table - confirm */
    uint8_t pba_bar; /* presumably the BAR holding the PBA - confirm */
    uint16_t entries; /* number of vectors; sizes msi_vectors[] in MSI-X mode */
    uint32_t table_offset;
    uint32_t pba_offset;
    MemoryRegion mmap_mem;
    void *mmap;
} VFIOMSIXInfo;
 135
/*
 * Per-device state of a VFIO-assigned PCI device.  "pdev" is the emulated
 * PCI device (code in this file upcasts from it with DO_UPCAST) and
 * "vbasedev" is the common VFIO device (reached via container_of).
 */
typedef struct VFIOPCIDevice {
    PCIDevice pdev;
    VFIODevice vbasedev;
    VFIOINTx intx;
    unsigned int config_size;
    uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
    off_t config_offset; /* Offset of config space region within device fd */
    unsigned int rom_size;
    off_t rom_offset; /* Offset of ROM region within device fd */
    void *rom;
    int msi_cap_size;
    VFIOMSIVector *msi_vectors; /* array shared by the MSI and MSI-X paths */
    VFIOMSIXInfo *msix;
    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
    int interrupt; /* Current interrupt type */
    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
    VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */
    PCIHostDeviceAddress host;
    EventNotifier err_notifier;
    EventNotifier req_notifier;
    /* NOTE(review): presumably a device-specific reset hook; set elsewhere */
    int (*resetfn)(struct VFIOPCIDevice *);
    uint32_t features; /* presumably a mask of the VFIO_FEATURE_* bits below */
#define VFIO_FEATURE_ENABLE_VGA_BIT 0
#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
#define VFIO_FEATURE_ENABLE_REQ_BIT 1
#define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
    int32_t bootindex;
    uint8_t pm_cap;
    bool has_vga;
    bool pci_aer;
    bool req_enabled;
    bool has_flr;
    bool has_pm_reset;
    bool rom_read_failed;
    bool no_kvm_intx; /* skip the KVM INTx bypass (vfio_intx_enable_kvm) */
    bool no_kvm_msi; /* skip the KVM MSI route (vfio_add_kvm_msi_virq) */
    bool no_kvm_msix; /* skip the KVM MSI-X route (vfio_add_kvm_msi_virq) */
} VFIOPCIDevice;
 174
/* Vendor/device ID pair identifying one entry of the option ROM blacklist. */
typedef struct VFIORomBlacklistEntry {
    uint16_t vendor_id;
    uint16_t device_id;
} VFIORomBlacklistEntry;
 179
/*
 * List of vendor ID/device ID pairs for which option ROM loading is
 * disabled.  This avoids guest hangs during ROM execution, as noticed
 * with the BCM 57810 card, for lack of a better way to handle such
 * issues.
 * The user can still override this by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang.  When adding
 * a new vendor ID/device ID combination below, please also add
 * your card/environment details and any information that could
 * help in debugging to the bug tracking this issue.
 */
static const VFIORomBlacklistEntry romblacklist[] = {
    /* Broadcom BCM 57810 */
    { 0x14e4, 0x168e }
};
 197
/*
 * Length of the MSI-X capability.
 * NOTE(review): presumably in bytes of config space; the users of this
 * macro are outside this view - confirm.
 */
#define MSIX_CAP_LENGTH 12
 199
/* Forward declarations for helpers used before their definitions */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 205
 206 /*
 207  * Disabling BAR mmaping can be slow, but toggling it around INTx can
 208  * also be a huge overhead.  We try to get the best of both worlds by
 209  * waiting until an interrupt to disable mmaps (subsequent transitions
 210  * to the same state are effectively no overhead).  If the interrupt has
 211  * been serviced and the time gap is long enough, we re-enable mmaps for
 212  * performance.  This works well for things like graphics cards, which
 213  * may not use their interrupt at all and are penalized to an unusable
 214  * level by read/write BAR traps.  Other devices, like NICs, have more
 215  * regular interrupts and see much better latency by staying in non-mmap
 216  * mode.  We therefore set the default mmap_timeout such that a ping
 217  * is just enough to keep the mmap disabled.  Users can experiment with
 218  * other options with the x-intx-mmap-timeout-ms parameter (a value of
 219  * zero disables the timer).
 220  */
 221 static void vfio_intx_mmap_enable(void *opaque)
 222 {
 223     VFIOPCIDevice *vdev = opaque;
 224
 225     if (vdev->intx.pending) {
 226         timer_mod(vdev->intx.mmap_timer,
 227                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 228         return;
 229     }
 230
 231     vfio_mmap_set_enabled(vdev, true);
 232 }
 233
 234 static void vfio_intx_interrupt(void *opaque)
 235 {
 236     VFIOPCIDevice *vdev = opaque;
 237
 238     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
 239         return;
 240     }
 241
 242     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
 243
 244     vdev->intx.pending = true;
 245     pci_irq_assert(&vdev->pdev);
 246     vfio_mmap_set_enabled(vdev, false);
 247     if (vdev->intx.mmap_timeout) {
 248         timer_mod(vdev->intx.mmap_timer,
 249                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 250     }
 251 }
 252
 253 static void vfio_intx_eoi(VFIODevice *vbasedev)
 254 {
 255     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 256
 257     if (!vdev->intx.pending) {
 258         return;
 259     }
 260
 261     trace_vfio_intx_eoi(vbasedev->name);
 262
 263     vdev->intx.pending = false;
 264     pci_irq_deassert(&vdev->pdev);
 265     vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 266 }
 267
/*
 * Try to move INTx delivery onto the KVM irqfd bypass.  KVM is handed the
 * INTx trigger eventfd together with a resample eventfd; the same resample
 * eventfd is registered with VFIO as the unmask trigger.  On any failure
 * the steps already taken are unwound, the QEMU handler
 * (vfio_intx_interrupt) is reinstalled and INTx is unmasked, so the device
 * stays on the userspace path.  The body is compiled out without
 * CONFIG_KVM.
 */
static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    /* Bypass disabled for this device, unsupported by KVM, or no route */
    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    /* Header plus room for exactly one eventfd */
    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);

    return;

    /* Error unwinding, in reverse order of setup */
fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    /* Back to QEMU handling of the trigger eventfd */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}
 344
 345 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
 346 {
 347 #ifdef CONFIG_KVM
 348     struct kvm_irqfd irqfd = {
 349         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 350         .gsi = vdev->intx.route.irq,
 351         .flags = KVM_IRQFD_FLAG_DEASSIGN,
 352     };
 353
 354     if (!vdev->intx.kvm_accel) {
 355         return;
 356     }
 357
 358     /*
 359      * Get to a known state, hardware masked, QEMU ready to accept new
 360      * interrupts, QEMU IRQ de-asserted.
 361      */
 362     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 363     vdev->intx.pending = false;
 364     pci_irq_deassert(&vdev->pdev);
 365
 366     /* Tell KVM to stop listening for an INTx irqfd */
 367     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 368         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
 369     }
 370
 371     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
 372     event_notifier_cleanup(&vdev->intx.unmask);
 373
 374     /* QEMU starts listening for interrupt events. */
 375     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 376
 377     vdev->intx.kvm_accel = false;
 378
 379     /* If we've missed an event, let it re-fire through QEMU */
 380     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 381
 382     trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
 383 #endif
 384 }
 385
 386 static void vfio_intx_update(PCIDevice *pdev)
 387 {
 388     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 389     PCIINTxRoute route;
 390
 391     if (vdev->interrupt != VFIO_INT_INTx) {
 392         return;
 393     }
 394
 395     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
 396
 397     if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
 398         return; /* Nothing changed */
 399     }
 400
 401     trace_vfio_intx_update(vdev->vbasedev.name,
 402                            vdev->intx.route.irq, route.irq);
 403
 404     vfio_intx_disable_kvm(vdev);
 405
 406     vdev->intx.route = route;
 407
 408     if (route.mode != PCI_INTX_ENABLED) {
 409         return;
 410     }
 411
 412     vfio_intx_enable_kvm(vdev);
 413
 414     /* Re-enable the interrupt in cased we missed an EOI */
 415     vfio_intx_eoi(&vdev->vbasedev);
 416 }
 417
 418 static int vfio_intx_enable(VFIOPCIDevice *vdev)
 419 {
 420     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
 421     int ret, argsz;
 422     struct vfio_irq_set *irq_set;
 423     int32_t *pfd;
 424
 425     if (!pin) {
 426         return 0;
 427     }
 428
 429     vfio_disable_interrupts(vdev);
 430
 431     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 432     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
 433
 434 #ifdef CONFIG_KVM
 435     /*
 436      * Only conditional to avoid generating error messages on platforms
 437      * where we won't actually use the result anyway.
 438      */
 439     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
 440         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
 441                                                         vdev->intx.pin);
 442     }
 443 #endif
 444
 445     ret = event_notifier_init(&vdev->intx.interrupt, 0);
 446     if (ret) {
 447         error_report("vfio: Error: event_notifier_init failed");
 448         return ret;
 449     }
 450
 451     argsz = sizeof(*irq_set) + sizeof(*pfd);
 452
 453     irq_set = g_malloc0(argsz);
 454     irq_set->argsz = argsz;
 455     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 456     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 457     irq_set->start = 0;
 458     irq_set->count = 1;
 459     pfd = (int32_t *)&irq_set->data;
 460
 461     *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
 462     qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
 463
 464     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 465     g_free(irq_set);
 466     if (ret) {
 467         error_report("vfio: Error: Failed to setup INTx fd: %m");
 468         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
 469         event_notifier_cleanup(&vdev->intx.interrupt);
 470         return -errno;
 471     }
 472
 473     vfio_intx_enable_kvm(vdev);
 474
 475     vdev->interrupt = VFIO_INT_INTx;
 476
 477     trace_vfio_intx_enable(vdev->vbasedev.name);
 478
 479     return 0;
 480 }
 481
 482 static void vfio_intx_disable(VFIOPCIDevice *vdev)
 483 {
 484     int fd;
 485
 486     timer_del(vdev->intx.mmap_timer);
 487     vfio_intx_disable_kvm(vdev);
 488     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 489     vdev->intx.pending = false;
 490     pci_irq_deassert(&vdev->pdev);
 491     vfio_mmap_set_enabled(vdev, true);
 492
 493     fd = event_notifier_get_fd(&vdev->intx.interrupt);
 494     qemu_set_fd_handler(fd, NULL, NULL, vdev);
 495     event_notifier_cleanup(&vdev->intx.interrupt);
 496
 497     vdev->interrupt = VFIO_INT_NONE;
 498
 499     trace_vfio_intx_disable(vdev->vbasedev.name);
 500 }
 501
 502 /*
 503  * MSI/X
 504  */
 505 static void vfio_msi_interrupt(void *opaque)
 506 {
 507     VFIOMSIVector *vector = opaque;
 508     VFIOPCIDevice *vdev = vector->vdev;
 509     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
 510     void (*notify)(PCIDevice *dev, unsigned vector);
 511     MSIMessage msg;
 512     int nr = vector - vdev->msi_vectors;
 513
 514     if (!event_notifier_test_and_clear(&vector->interrupt)) {
 515         return;
 516     }
 517
 518     if (vdev->interrupt == VFIO_INT_MSIX) {
 519         get_msg = msix_get_message;
 520         notify = msix_notify;
 521     } else if (vdev->interrupt == VFIO_INT_MSI) {
 522         get_msg = msi_get_message;
 523         notify = msi_notify;
 524     } else {
 525         abort();
 526     }
 527
 528     msg = get_msg(&vdev->pdev, nr);
 529     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
 530     notify(&vdev->pdev, nr);
 531 }
 532
 533 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 534 {
 535     struct vfio_irq_set *irq_set;
 536     int ret = 0, i, argsz;
 537     int32_t *fds;
 538
 539     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 540
 541     irq_set = g_malloc0(argsz);
 542     irq_set->argsz = argsz;
 543     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 544     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
 545     irq_set->start = 0;
 546     irq_set->count = vdev->nr_vectors;
 547     fds = (int32_t *)&irq_set->data;
 548
 549     for (i = 0; i < vdev->nr_vectors; i++) {
 550         int fd = -1;
 551
 552         /*
 553          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
 554          * bits, therefore we always use the KVM signaling path when setup.
 555          * MSI-X mask and pending bits are emulated, so we want to use the
 556          * KVM signaling path only when configured and unmasked.
 557          */
 558         if (vdev->msi_vectors[i].use) {
 559             if (vdev->msi_vectors[i].virq < 0 ||
 560                 (msix && msix_is_masked(&vdev->pdev, i))) {
 561                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 562             } else {
 563                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
 564             }
 565         }
 566
 567         fds[i] = fd;
 568     }
 569
 570     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 571
 572     g_free(irq_set);
 573
 574     return ret;
 575 }
 576
 577 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
 578                                   MSIMessage *msg, bool msix)
 579 {
 580     int virq;
 581
 582     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi) || !msg) {
 583         return;
 584     }
 585
 586     if (event_notifier_init(&vector->kvm_interrupt, 0)) {
 587         return;
 588     }
 589
 590     virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
 591     if (virq < 0) {
 592         event_notifier_cleanup(&vector->kvm_interrupt);
 593         return;
 594     }
 595
 596     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 597                                        NULL, virq) < 0) {
 598         kvm_irqchip_release_virq(kvm_state, virq);
 599         event_notifier_cleanup(&vector->kvm_interrupt);
 600         return;
 601     }
 602
 603     vector->virq = virq;
 604 }
 605
 606 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 607 {
 608     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 609                                           vector->virq);
 610     kvm_irqchip_release_virq(kvm_state, vector->virq);
 611     vector->virq = -1;
 612     event_notifier_cleanup(&vector->kvm_interrupt);
 613 }
 614
/* Point the vector's existing KVM MSI route (vector->virq) at @msg. */
static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
}
 619
 620 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 621                                    MSIMessage *msg, IOHandler *handler)
 622 {
 623     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 624     VFIOMSIVector *vector;
 625     int ret;
 626
 627     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
 628
 629     vector = &vdev->msi_vectors[nr];
 630
 631     if (!vector->use) {
 632         vector->vdev = vdev;
 633         vector->virq = -1;
 634         if (event_notifier_init(&vector->interrupt, 0)) {
 635             error_report("vfio: Error: event_notifier_init failed");
 636         }
 637         vector->use = true;
 638         msix_vector_use(pdev, nr);
 639     }
 640
 641     qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 642                         handler, NULL, vector);
 643
 644     /*
 645      * Attempt to enable route through KVM irqchip,
 646      * default to userspace handling if unavailable.
 647      */
 648     if (vector->virq >= 0) {
 649         if (!msg) {
 650             vfio_remove_kvm_msi_virq(vector);
 651         } else {
 652             vfio_update_kvm_msi_virq(vector, *msg);
 653         }
 654     } else {
 655         vfio_add_kvm_msi_virq(vdev, vector, msg, true);
 656     }
 657
 658     /*
 659      * We don't want to have the host allocate all possible MSI vectors
 660      * for a device if they're not in use, so we shutdown and incrementally
 661      * increase them as needed.
 662      */
 663     if (vdev->nr_vectors < nr + 1) {
 664         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 665         vdev->nr_vectors = nr + 1;
 666         ret = vfio_enable_vectors(vdev, true);
 667         if (ret) {
 668             error_report("vfio: failed to enable vectors, %d", ret);
 669         }
 670     } else {
 671         int argsz;
 672         struct vfio_irq_set *irq_set;
 673         int32_t *pfd;
 674
 675         argsz = sizeof(*irq_set) + sizeof(*pfd);
 676
 677         irq_set = g_malloc0(argsz);
 678         irq_set->argsz = argsz;
 679         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 680                          VFIO_IRQ_SET_ACTION_TRIGGER;
 681         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 682         irq_set->start = nr;
 683         irq_set->count = 1;
 684         pfd = (int32_t *)&irq_set->data;
 685
 686         if (vector->virq >= 0) {
 687             *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
 688         } else {
 689             *pfd = event_notifier_get_fd(&vector->interrupt);
 690         }
 691
 692         ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 693         g_free(irq_set);
 694         if (ret) {
 695             error_report("vfio: failed to modify vector, %d", ret);
 696         }
 697     }
 698
 699     return 0;
 700 }
 701
/*
 * MSI-X "vector use" notifier (installed in vfio_msix_enable()): enables
 * vector @nr with message @msg, delivered through vfio_msi_interrupt().
 */
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}
 707
 708 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 709 {
 710     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 711     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 712
 713     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
 714
 715     /*
 716      * There are still old guests that mask and unmask vectors on every
 717      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
 718      * the KVM setup in place, simply switch VFIO to use the non-bypass
 719      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
 720      * core will mask the interrupt and set pending bits, allowing it to
 721      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
 722      */
 723     if (vector->virq >= 0) {
 724         int argsz;
 725         struct vfio_irq_set *irq_set;
 726         int32_t *pfd;
 727
 728         argsz = sizeof(*irq_set) + sizeof(*pfd);
 729
 730         irq_set = g_malloc0(argsz);
 731         irq_set->argsz = argsz;
 732         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 733                          VFIO_IRQ_SET_ACTION_TRIGGER;
 734         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 735         irq_set->start = nr;
 736         irq_set->count = 1;
 737         pfd = (int32_t *)&irq_set->data;
 738
 739         *pfd = event_notifier_get_fd(&vector->interrupt);
 740
 741         ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 742
 743         g_free(irq_set);
 744     }
 745 }
 746
 747 static void vfio_msix_enable(VFIOPCIDevice *vdev)
 748 {
 749     vfio_disable_interrupts(vdev);
 750
 751     vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));
 752
 753     vdev->interrupt = VFIO_INT_MSIX;
 754
 755     /*
 756      * Some communication channels between VF & PF or PF & fw rely on the
 757      * physical state of the device and expect that enabling MSI-X from the
 758      * guest enables the same on the host.  When our guest is Linux, the
 759      * guest driver call to pci_enable_msix() sets the enabling bit in the
 760      * MSI-X capability, but leaves the vector table masked.  We therefore
 761      * can't rely on a vector_use callback (from request_irq() in the guest)
 762      * to switch the physical device into MSI-X mode because that may come a
 763      * long time after pci_enable_msix().  This code enables vector 0 with
 764      * triggering to userspace, then immediately release the vector, leaving
 765      * the physical device with no vectors enabled, but MSI-X enabled, just
 766      * like the guest view.
 767      */
 768     vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
 769     vfio_msix_vector_release(&vdev->pdev, 0);
 770
 771     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
 772                                   vfio_msix_vector_release, NULL)) {
 773         error_report("vfio: msix_set_vector_notifiers failed");
 774     }
 775
 776     trace_vfio_msix_enable(vdev->vbasedev.name);
 777 }
 778
 779 static void vfio_msi_enable(VFIOPCIDevice *vdev)
 780 {
 781     int ret, i;
 782
 783     vfio_disable_interrupts(vdev);
 784
 785     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 786 retry:
 787     vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
 788
 789     for (i = 0; i < vdev->nr_vectors; i++) {
 790         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 791         MSIMessage msg = msi_get_message(&vdev->pdev, i);
 792
 793         vector->vdev = vdev;
 794         vector->virq = -1;
 795         vector->use = true;
 796
 797         if (event_notifier_init(&vector->interrupt, 0)) {
 798             error_report("vfio: Error: event_notifier_init failed");
 799         }
 800
 801         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 802                             vfio_msi_interrupt, NULL, vector);
 803
 804         /*
 805          * Attempt to enable route through KVM irqchip,
 806          * default to userspace handling if unavailable.
 807          */
 808         vfio_add_kvm_msi_virq(vdev, vector, &msg, false);
 809     }
 810
 811     /* Set interrupt type prior to possible interrupts */
 812     vdev->interrupt = VFIO_INT_MSI;
 813
 814     ret = vfio_enable_vectors(vdev, false);
 815     if (ret) {
 816         if (ret < 0) {
 817             error_report("vfio: Error: Failed to setup MSI fds: %m");
 818         } else if (ret != vdev->nr_vectors) {
 819             error_report("vfio: Error: Failed to enable %d "
 820                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
 821         }
 822
 823         for (i = 0; i < vdev->nr_vectors; i++) {
 824             VFIOMSIVector *vector = &vdev->msi_vectors[i];
 825             if (vector->virq >= 0) {
 826                 vfio_remove_kvm_msi_virq(vector);
 827             }
 828             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 829                                 NULL, NULL, NULL);
 830             event_notifier_cleanup(&vector->interrupt);
 831         }
 832
 833         g_free(vdev->msi_vectors);
 834
 835         if (ret > 0 && ret != vdev->nr_vectors) {
 836             vdev->nr_vectors = ret;
 837             goto retry;
 838         }
 839         vdev->nr_vectors = 0;
 840
 841         /*
 842          * Failing to setup MSI doesn't really fall within any specification.
 843          * Let's try leaving interrupts disabled and hope the guest figures
 844          * out to fall back to INTx for this device.
 845          */
 846         error_report("vfio: Error: Failed to enable MSI");
 847         vdev->interrupt = VFIO_INT_NONE;
 848
 849         return;
 850     }
 851
 852     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
 853 }
 854
 855 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
 856 {
 857     int i;
 858
 859     for (i = 0; i < vdev->nr_vectors; i++) {
 860         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 861         if (vdev->msi_vectors[i].use) {
 862             if (vector->virq >= 0) {
 863                 vfio_remove_kvm_msi_virq(vector);
 864             }
 865             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 866                                 NULL, NULL, NULL);
 867             event_notifier_cleanup(&vector->interrupt);
 868         }
 869     }
 870
 871     g_free(vdev->msi_vectors);
 872     vdev->msi_vectors = NULL;
 873     vdev->nr_vectors = 0;
 874     vdev->interrupt = VFIO_INT_NONE;
 875
 876     vfio_intx_enable(vdev);
 877 }
 878
 879 static void vfio_msix_disable(VFIOPCIDevice *vdev)
 880 {
 881     int i;
 882
 883     msix_unset_vector_notifiers(&vdev->pdev);
 884
 885     /*
 886      * MSI-X will only release vectors if MSI-X is still enabled on the
 887      * device, check through the rest and release it ourselves if necessary.
 888      */
 889     for (i = 0; i < vdev->nr_vectors; i++) {
 890         if (vdev->msi_vectors[i].use) {
 891             vfio_msix_vector_release(&vdev->pdev, i);
 892             msix_vector_unuse(&vdev->pdev, i);
 893         }
 894     }
 895
 896     if (vdev->nr_vectors) {
 897         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 898     }
 899
 900     vfio_msi_disable_common(vdev);
 901
 902     trace_vfio_msix_disable(vdev->vbasedev.name);
 903 }
 904
 905 static void vfio_msi_disable(VFIOPCIDevice *vdev)
 906 {
 907     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
 908     vfio_msi_disable_common(vdev);
 909
 910     trace_vfio_msi_disable(vdev->vbasedev.name);
 911 }
 912
 913 static void vfio_update_msi(VFIOPCIDevice *vdev)
 914 {
 915     int i;
 916
 917     for (i = 0; i < vdev->nr_vectors; i++) {
 918         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 919         MSIMessage msg;
 920
 921         if (!vector->use || vector->virq < 0) {
 922             continue;
 923         }
 924
 925         msg = msi_get_message(&vdev->pdev, i);
 926         vfio_update_kvm_msi_virq(vector, msg);
 927     }
 928 }
 929
 930 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
 931 {
 932     struct vfio_region_info reg_info = {
 933         .argsz = sizeof(reg_info),
 934         .index = VFIO_PCI_ROM_REGION_INDEX
 935     };
 936     uint64_t size;
 937     off_t off = 0;
 938     ssize_t bytes;
 939
 940     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
 941         error_report("vfio: Error getting ROM info: %m");
 942         return;
 943     }
 944
 945     trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info.size,
 946                             (unsigned long)reg_info.offset,
 947                             (unsigned long)reg_info.flags);
 948
 949     vdev->rom_size = size = reg_info.size;
 950     vdev->rom_offset = reg_info.offset;
 951
 952     if (!vdev->rom_size) {
 953         vdev->rom_read_failed = true;
 954         error_report("vfio-pci: Cannot read device rom at "
 955                     "%s", vdev->vbasedev.name);
 956         error_printf("Device option ROM contents are probably invalid "
 957                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
 958                     "or load from file with romfile=\n");
 959         return;
 960     }
 961
 962     vdev->rom = g_malloc(size);
 963     memset(vdev->rom, 0xff, size);
 964
 965     while (size) {
 966         bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
 967                       size, vdev->rom_offset + off);
 968         if (bytes == 0) {
 969             break;
 970         } else if (bytes > 0) {
 971             off += bytes;
 972             size -= bytes;
 973         } else {
 974             if (errno == EINTR || errno == EAGAIN) {
 975                 continue;
 976             }
 977             error_report("vfio: Error reading device ROM: %m");
 978             break;
 979         }
 980     }
 981 }
 982
 983 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
 984 {
 985     VFIOPCIDevice *vdev = opaque;
 986     union {
 987         uint8_t byte;
 988         uint16_t word;
 989         uint32_t dword;
 990         uint64_t qword;
 991     } val;
 992     uint64_t data = 0;
 993
 994     /* Load the ROM lazily when the guest tries to read it */
 995     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
 996         vfio_pci_load_rom(vdev);
 997     }
 998
 999     memcpy(&val, vdev->rom + addr,
1000            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
1001
1002     switch (size) {
1003     case 1:
1004         data = val.byte;
1005         break;
1006     case 2:
1007         data = le16_to_cpu(val.word);
1008         break;
1009     case 4:
1010         data = le32_to_cpu(val.dword);
1011         break;
1012     default:
1013         hw_error("vfio: unsupported read size, %d bytes\n", size);
1014         break;
1015     }
1016
1017     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
1018
1019     return data;
1020 }
1021
1022 static void vfio_rom_write(void *opaque, hwaddr addr,
1023                            uint64_t data, unsigned size)
1024 {
1025 }
1026
1027 static const MemoryRegionOps vfio_rom_ops = {
1028     .read = vfio_rom_read,
1029     .write = vfio_rom_write,
1030     .endianness = DEVICE_LITTLE_ENDIAN,
1031 };
1032
1033 static bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
1034 {
1035     PCIDevice *pdev = &vdev->pdev;
1036     uint16_t vendor_id, device_id;
1037     int count = 0;
1038
1039     vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
1040     device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
1041
1042     while (count < ARRAY_SIZE(romblacklist)) {
1043         if (romblacklist[count].vendor_id == vendor_id &&
1044             romblacklist[count].device_id == device_id) {
1045                 return true;
1046         }
1047         count++;
1048     }
1049
1050     return false;
1051 }
1052
1053 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
1054 {
1055     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
1056     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
1057     DeviceState *dev = DEVICE(vdev);
1058     char name[32];
1059     int fd = vdev->vbasedev.fd;
1060
1061     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
1062         /* Since pci handles romfile, just print a message and return */
1063         if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
1064             error_printf("Warning : Device at %04x:%02x:%02x.%x "
1065                          "is known to cause system instability issues during "
1066                          "option rom execution. "
1067                          "Proceeding anyway since user specified romfile\n",
1068                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
1069                          vdev->host.function);
1070         }
1071         return;
1072     }
1073
1074     /*
1075      * Use the same size ROM BAR as the physical device.  The contents
1076      * will get filled in later when the guest tries to read it.
1077      */
1078     if (pread(fd, &orig, 4, offset) != 4 ||
1079         pwrite(fd, &size, 4, offset) != 4 ||
1080         pread(fd, &size, 4, offset) != 4 ||
1081         pwrite(fd, &orig, 4, offset) != 4) {
1082         error_report("%s(%04x:%02x:%02x.%x) failed: %m",
1083                      __func__, vdev->host.domain, vdev->host.bus,
1084                      vdev->host.slot, vdev->host.function);
1085         return;
1086     }
1087
1088     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
1089
1090     if (!size) {
1091         return;
1092     }
1093
1094     if (vfio_blacklist_opt_rom(vdev)) {
1095         if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
1096             error_printf("Warning : Device at %04x:%02x:%02x.%x "
1097                          "is known to cause system instability issues during "
1098                          "option rom execution. "
1099                          "Proceeding anyway since user specified non zero value for "
1100                          "rombar\n",
1101                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
1102                          vdev->host.function);
1103         } else {
1104             error_printf("Warning : Rom loading for device at "
1105                          "%04x:%02x:%02x.%x has been disabled due to "
1106                          "system instability issues. "
1107                          "Specify rombar=1 or romfile to force\n",
1108                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
1109                          vdev->host.function);
1110             return;
1111         }
1112     }
1113
1114     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
1115
1116     snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
1117              vdev->host.domain, vdev->host.bus, vdev->host.slot,
1118              vdev->host.function);
1119
1120     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
1121                           &vfio_rom_ops, vdev, name, size);
1122
1123     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
1124                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
1125
1126     vdev->pdev.has_rom = true;
1127     vdev->rom_read_failed = false;
1128 }
1129
1130 static void vfio_vga_write(void *opaque, hwaddr addr,
1131                            uint64_t data, unsigned size)
1132 {
1133     VFIOVGARegion *region = opaque;
1134     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1135     union {
1136         uint8_t byte;
1137         uint16_t word;
1138         uint32_t dword;
1139         uint64_t qword;
1140     } buf;
1141     off_t offset = vga->fd_offset + region->offset + addr;
1142
1143     switch (size) {
1144     case 1:
1145         buf.byte = data;
1146         break;
1147     case 2:
1148         buf.word = cpu_to_le16(data);
1149         break;
1150     case 4:
1151         buf.dword = cpu_to_le32(data);
1152         break;
1153     default:
1154         hw_error("vfio: unsupported write size, %d bytes", size);
1155         break;
1156     }
1157
1158     if (pwrite(vga->fd, &buf, size, offset) != size) {
1159         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1160                      __func__, region->offset + addr, data, size);
1161     }
1162
1163     trace_vfio_vga_write(region->offset + addr, data, size);
1164 }
1165
1166 static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1167 {
1168     VFIOVGARegion *region = opaque;
1169     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1170     union {
1171         uint8_t byte;
1172         uint16_t word;
1173         uint32_t dword;
1174         uint64_t qword;
1175     } buf;
1176     uint64_t data = 0;
1177     off_t offset = vga->fd_offset + region->offset + addr;
1178
1179     if (pread(vga->fd, &buf, size, offset) != size) {
1180         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1181                      __func__, region->offset + addr, size);
1182         return (uint64_t)-1;
1183     }
1184
1185     switch (size) {
1186     case 1:
1187         data = buf.byte;
1188         break;
1189     case 2:
1190         data = le16_to_cpu(buf.word);
1191         break;
1192     case 4:
1193         data = le32_to_cpu(buf.dword);
1194         break;
1195     default:
1196         hw_error("vfio: unsupported read size, %d bytes", size);
1197         break;
1198     }
1199
1200     trace_vfio_vga_read(region->offset + addr, size, data);
1201
1202     return data;
1203 }
1204
1205 static const MemoryRegionOps vfio_vga_ops = {
1206     .read = vfio_vga_read,
1207     .write = vfio_vga_write,
1208     .endianness = DEVICE_LITTLE_ENDIAN,
1209 };
1210
1211 /*
1212  * Device specific quirks
1213  */
1214
/*
 * Is range1 fully contained within range2?
 *
 * Each range is given as a start and a length.  The comparison works on
 * differences instead of end addresses, so a range reaching up to the top
 * of the 64-bit space cannot wrap around and be reported as contained.
 */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    /* range1 must not start before range2 nor be longer than it */
    if (first1 < first2 || len1 > len2) {
        return false;
    }

    /* first1 + len1 <= first2 + len2, rearranged so nothing can overflow */
    return first1 - first2 <= len2 - len1;
}
1220
/*
 * Returns true when @mask is non-zero and every bit set in @mask is also
 * set in @flags.
 */
static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
{
    if (mask == 0) {
        return false;
    }

    /* No bit of the mask may be missing from flags */
    return (mask & ~flags) == 0;
}
1225
1226 static uint64_t vfio_generic_window_quirk_read(void *opaque,
1227                                                hwaddr addr, unsigned size)
1228 {
1229     VFIOQuirk *quirk = opaque;
1230     VFIOPCIDevice *vdev = quirk->vdev;
1231     uint64_t data;
1232
1233     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1234         ranges_overlap(addr, size,
1235                        quirk->data.data_offset, quirk->data.data_size)) {
1236         hwaddr offset = addr - quirk->data.data_offset;
1237
1238         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1239                                   quirk->data.data_size)) {
1240             hw_error("%s: window data read not fully contained: %s",
1241                      __func__, memory_region_name(&quirk->mem));
1242         }
1243
1244         data = vfio_pci_read_config(&vdev->pdev,
1245                                     quirk->data.address_val + offset, size);
1246
1247         trace_vfio_generic_window_quirk_read(memory_region_name(&quirk->mem),
1248                                              vdev->vbasedev.name,
1249                                              quirk->data.bar,
1250                                              addr, size, data);
1251     } else {
1252         data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
1253                                 addr + quirk->data.base_offset, size);
1254     }
1255
1256     return data;
1257 }
1258
1259 static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
1260                                             uint64_t data, unsigned size)
1261 {
1262     VFIOQuirk *quirk = opaque;
1263     VFIOPCIDevice *vdev = quirk->vdev;
1264
1265     if (ranges_overlap(addr, size,
1266                        quirk->data.address_offset, quirk->data.address_size)) {
1267
1268         if (addr != quirk->data.address_offset) {
1269             hw_error("%s: offset write into address window: %s",
1270                      __func__, memory_region_name(&quirk->mem));
1271         }
1272
1273         if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
1274             quirk->data.flags |= quirk->data.write_flags |
1275                                  quirk->data.read_flags;
1276             quirk->data.address_val = data & quirk->data.address_mask;
1277         } else {
1278             quirk->data.flags &= ~(quirk->data.write_flags |
1279                                    quirk->data.read_flags);
1280         }
1281     }
1282
1283     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1284         ranges_overlap(addr, size,
1285                        quirk->data.data_offset, quirk->data.data_size)) {
1286         hwaddr offset = addr - quirk->data.data_offset;
1287
1288         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1289                                   quirk->data.data_size)) {
1290             hw_error("%s: window data write not fully contained: %s",
1291                      __func__, memory_region_name(&quirk->mem));
1292         }
1293
1294         vfio_pci_write_config(&vdev->pdev,
1295                               quirk->data.address_val + offset, data, size);
1296         trace_vfio_generic_window_quirk_write(memory_region_name(&quirk->mem),
1297                                               vdev->vbasedev.name,
1298                                               quirk->data.bar,
1299                                               addr, data, size);
1300         return;
1301     }
1302
1303     vfio_region_write(&vdev->bars[quirk->data.bar].region,
1304                    addr + quirk->data.base_offset, data, size);
1305 }
1306
1307 static const MemoryRegionOps vfio_generic_window_quirk = {
1308     .read = vfio_generic_window_quirk_read,
1309     .write = vfio_generic_window_quirk_write,
1310     .endianness = DEVICE_LITTLE_ENDIAN,
1311 };
1312
1313 static uint64_t vfio_generic_quirk_read(void *opaque,
1314                                         hwaddr addr, unsigned size)
1315 {
1316     VFIOQuirk *quirk = opaque;
1317     VFIOPCIDevice *vdev = quirk->vdev;
1318     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1319     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1320     uint64_t data;
1321
1322     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1323         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1324         if (!vfio_range_contained(addr, size, offset,
1325                                   quirk->data.address_mask + 1)) {
1326             hw_error("%s: read not fully contained: %s",
1327                      __func__, memory_region_name(&quirk->mem));
1328         }
1329
1330         data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
1331
1332         trace_vfio_generic_quirk_read(memory_region_name(&quirk->mem),
1333                                       vdev->vbasedev.name, quirk->data.bar,
1334                                       addr + base, size, data);
1335     } else {
1336         data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
1337                                 addr + base, size);
1338     }
1339
1340     return data;
1341 }
1342
1343 static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
1344                                      uint64_t data, unsigned size)
1345 {
1346     VFIOQuirk *quirk = opaque;
1347     VFIOPCIDevice *vdev = quirk->vdev;
1348     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1349     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1350
1351     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1352         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1353         if (!vfio_range_contained(addr, size, offset,
1354                                   quirk->data.address_mask + 1)) {
1355             hw_error("%s: write not fully contained: %s",
1356                      __func__, memory_region_name(&quirk->mem));
1357         }
1358
1359         vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
1360
1361         trace_vfio_generic_quirk_write(memory_region_name(&quirk->mem),
1362                                        vdev->vbasedev.name, quirk->data.bar,
1363                                        addr + base, data, size);
1364     } else {
1365         vfio_region_write(&vdev->bars[quirk->data.bar].region,
1366                           addr + base, data, size);
1367     }
1368 }
1369
1370 static const MemoryRegionOps vfio_generic_quirk = {
1371     .read = vfio_generic_quirk_read,
1372     .write = vfio_generic_quirk_write,
1373     .endianness = DEVICE_LITTLE_ENDIAN,
1374 };
1375
1376 #define PCI_VENDOR_ID_ATI               0x1002
1377
1378 /*
1379  * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
1380  * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
1381  * BAR4 (older cards like the X550 used BAR1, but we don't care to support
1382  * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
1383  * I/O port BAR address.  Originally this was coded to return the virtual BAR
1384  * address only if the physical register read returns the actual BAR address,
1385  * but users have reported greater success if we return the virtual address
1386  * unconditionally.
1387  */
1388 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
1389                                         hwaddr addr, unsigned size)
1390 {
1391     VFIOQuirk *quirk = opaque;
1392     VFIOPCIDevice *vdev = quirk->vdev;
1393     uint64_t data = vfio_pci_read_config(&vdev->pdev,
1394                                          PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
1395                                          size);
1396     trace_vfio_ati_3c3_quirk_read(data);
1397
1398     return data;
1399 }
1400
1401 static const MemoryRegionOps vfio_ati_3c3_quirk = {
1402     .read = vfio_ati_3c3_quirk_read,
1403     .endianness = DEVICE_LITTLE_ENDIAN,
1404 };
1405
1406 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
1407 {
1408     PCIDevice *pdev = &vdev->pdev;
1409     VFIOQuirk *quirk;
1410
1411     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1412         return;
1413     }
1414
1415     /*
1416      * As long as the BAR is >= 256 bytes it will be aligned such that the
1417      * lower byte is always zero.  Filter out anything else, if it exists.
1418      */
1419     if (!vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
1420         return;
1421     }
1422
1423     quirk = g_malloc0(sizeof(*quirk));
1424     quirk->vdev = vdev;
1425
1426     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
1427                           "vfio-ati-3c3-quirk", 1);
1428     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1429                                 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);
1430
1431     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1432                       quirk, next);
1433
1434     trace_vfio_vga_probe_ati_3c3_quirk(vdev->vbasedev.name);
1435 }
1436
1437 /*
1438  * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
1439  * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
1440  * the MMIO space directly, but a window to this space is provided through
1441  * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
1442  * data register.  When the address is programmed to a range of 0x4000-0x4fff
1443  * PCI configuration space is available.  Experimentation seems to indicate
1444  * that only read-only access is provided, but we drop writes when the window
1445  * is enabled to config space nonetheless.
1446  */
1447 static void vfio_probe_ati_bar4_window_quirk(VFIOPCIDevice *vdev, int nr)
1448 {
1449     PCIDevice *pdev = &vdev->pdev;
1450     VFIOQuirk *quirk;
1451
1452     if (!vdev->has_vga || nr != 4 ||
1453         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1454         return;
1455     }
1456
1457     quirk = g_malloc0(sizeof(*quirk));
1458     quirk->vdev = vdev;
1459     quirk->data.address_size = 4;
1460     quirk->data.data_offset = 4;
1461     quirk->data.data_size = 4;
1462     quirk->data.address_match = 0x4000;
1463     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1464     quirk->data.bar = nr;
1465     quirk->data.read_flags = quirk->data.write_flags = 1;
1466
1467     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1468                           &vfio_generic_window_quirk, quirk,
1469                           "vfio-ati-bar4-window-quirk", 8);
1470     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1471                           quirk->data.base_offset, &quirk->mem, 1);
1472
1473     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1474
1475     trace_vfio_probe_ati_bar4_window_quirk(vdev->vbasedev.name);
1476 }
1477
1478 #define PCI_VENDOR_ID_REALTEK 0x10ec
1479
1480 /*
1481  * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
1482  * offset 0x70 there is a dword data register, offset 0x74 is a dword address
1483  * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1484  * when the "type" portion of the address register is set to 0x1.  This appears
1485  * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1486  * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1487  * ignore because the MSI-X table should always be accessed as a dword (full
1488  * mask).  Bits 0:11 is offset within the type.
1489  *
1490  * Example trace:
1491  *
1492  * Read from MSI-X table offset 0
1493  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1494  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1495  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1496  *
1497  * Write 0xfee00000 to MSI-X table offset 0
1498  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1499  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1500  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1501  */
1502
1503 static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
1504                                                hwaddr addr, unsigned size)
1505 {
1506     VFIOQuirk *quirk = opaque;
1507     VFIOPCIDevice *vdev = quirk->vdev;
1508     uint64_t val = 0;
1509
1510     if (!quirk->data.flags) { /* Non-MSI-X table access */
1511         return vfio_region_read(&vdev->bars[quirk->data.bar].region,
1512                                 addr + 0x70, size);
1513     }
1514
1515     switch (addr) {
1516     case 4: /* address */
1517         val = quirk->data.address_match ^ 0x80000000U; /* latch/complete */
1518         break;
1519     case 0: /* data */
1520         if ((vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1521             memory_region_dispatch_read(&vdev->pdev.msix_table_mmio,
1522                                 (hwaddr)(quirk->data.address_match & 0xfff),
1523                                 &val, size, MEMTXATTRS_UNSPECIFIED);
1524         }
1525         break;
1526     }
1527
1528     trace_vfio_rtl8168_quirk_read(vdev->vbasedev.name,
1529                                   addr ? "address" : "data", val);
1530     return val;
1531 }
1532
1533 static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
1534                                             uint64_t data, unsigned size)
1535 {
1536     VFIOQuirk *quirk = opaque;
1537     VFIOPCIDevice *vdev = quirk->vdev;
1538
1539     switch (addr) {
1540     case 4: /* address */
1541         if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
1542             quirk->data.flags = 1; /* Activate reads */
1543             quirk->data.address_match = data;
1544
1545             trace_vfio_rtl8168_quirk_write(vdev->vbasedev.name, data);
1546
1547             if (data & 0x80000000U) { /* Do write */
1548                 if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
1549                     hwaddr offset = data & 0xfff;
1550                     uint64_t val = quirk->data.address_mask;
1551
1552                     trace_vfio_rtl8168_quirk_msix(vdev->vbasedev.name,
1553                                                   (uint16_t)offset, val);
1554
1555                     /* Write to the proper guest MSI-X table instead */
1556                     memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
1557                                                  offset, val, size,
1558                                                  MEMTXATTRS_UNSPECIFIED);
1559                 }
1560                 return; /* Do not write guest MSI-X data to hardware */
1561             }
1562         } else {
1563             quirk->data.flags = 0; /* De-activate reads, non-MSI-X */
1564         }
1565         break;
1566     case 0: /* data */
1567         quirk->data.address_mask = data;
1568         break;
1569     }
1570
1571     vfio_region_write(&vdev->bars[quirk->data.bar].region,
1572                       addr + 0x70, data, size);
1573 }
1574
1575 static const MemoryRegionOps vfio_rtl8168_window_quirk = {
1576     .read = vfio_rtl8168_window_quirk_read,
1577     .write = vfio_rtl8168_window_quirk_write,
1578     .valid = {
1579         .min_access_size = 4,
1580         .max_access_size = 4,
1581         .unaligned = false,
1582     },
1583     .endianness = DEVICE_LITTLE_ENDIAN,
1584 };
1585
1586 static void vfio_probe_rtl8168_bar2_window_quirk(VFIOPCIDevice *vdev, int nr)
1587 {
1588     PCIDevice *pdev = &vdev->pdev;
1589     VFIOQuirk *quirk;
1590
1591     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK ||
1592         pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) {
1593         return;
1594     }
1595
1596     quirk = g_malloc0(sizeof(*quirk));
1597     quirk->vdev = vdev;
1598     quirk->data.bar = nr;
1599
1600     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk,
1601                           quirk, "vfio-rtl8168-window-quirk", 8);
1602     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1603                                         0x70, &quirk->mem, 1);
1604
1605     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1606
1607     trace_vfio_rtl8168_quirk_enable(vdev->vbasedev.name);
1608 }
1609
1610 /*
1611  * Trap the BAR2 MMIO window to config space as well.
1612  */
1613 static void vfio_probe_ati_bar2_4000_quirk(VFIOPCIDevice *vdev, int nr)
1614 {
1615     PCIDevice *pdev = &vdev->pdev;
1616     VFIOQuirk *quirk;
1617
1618     /* Only enable on newer devices where BAR2 is 64bit */
1619     if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
1620         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1621         return;
1622     }
1623
1624     quirk = g_malloc0(sizeof(*quirk));
1625     quirk->vdev = vdev;
1626     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1627     quirk->data.address_match = 0x4000;
1628     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1629     quirk->data.bar = nr;
1630
1631     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1632                           "vfio-ati-bar2-4000-quirk",
1633                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1634     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1635                           quirk->data.address_match & TARGET_PAGE_MASK,
1636                           &quirk->mem, 1);
1637
1638     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1639
1640     trace_vfio_probe_ati_bar2_4000_quirk(vdev->vbasedev.name);
1641 }
1642
1643 /*
1644  * Older ATI/AMD cards like the X550 have a similar window to that above.
1645  * I/O port BAR1 provides a window to a mirror of PCI config space located
1646  * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
1647  * note it for future reference.
1648  */
1649
#define PCI_VENDOR_ID_NVIDIA                    0x10de

/*
 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 */

/* Progress through the access sequence above, kept in quirk->data.flags */
enum {
    NV_3D0_NONE   = 0,
    NV_3D0_SELECT = 1,
    NV_3D0_WINDOW = 2,
    NV_3D0_READ   = 3,
    NV_3D0_WRITE  = 4,
};
1674
1675 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
1676                                            hwaddr addr, unsigned size)
1677 {
1678     VFIOQuirk *quirk = opaque;
1679     VFIOPCIDevice *vdev = quirk->vdev;
1680     PCIDevice *pdev = &vdev->pdev;
1681     uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1682                                   addr + quirk->data.base_offset, size);
1683
1684     if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
1685         data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
1686         trace_vfio_nvidia_3d0_quirk_read(size, data);
1687     }
1688
1689     quirk->data.flags = NV_3D0_NONE;
1690
1691     return data;
1692 }
1693
1694 static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
1695                                         uint64_t data, unsigned size)
1696 {
1697     VFIOQuirk *quirk = opaque;
1698     VFIOPCIDevice *vdev = quirk->vdev;
1699     PCIDevice *pdev = &vdev->pdev;
1700
1701     switch (quirk->data.flags) {
1702     case NV_3D0_NONE:
1703         if (addr == quirk->data.address_offset && data == 0x338) {
1704             quirk->data.flags = NV_3D0_SELECT;
1705         }
1706         break;
1707     case NV_3D0_SELECT:
1708         quirk->data.flags = NV_3D0_NONE;
1709         if (addr == quirk->data.data_offset &&
1710             (data & ~quirk->data.address_mask) == quirk->data.address_match) {
1711             quirk->data.flags = NV_3D0_WINDOW;
1712             quirk->data.address_val = data & quirk->data.address_mask;
1713         }
1714         break;
1715     case NV_3D0_WINDOW:
1716         quirk->data.flags = NV_3D0_NONE;
1717         if (addr == quirk->data.address_offset) {
1718             if (data == 0x538) {
1719                 quirk->data.flags = NV_3D0_READ;
1720             } else if (data == 0x738) {
1721                 quirk->data.flags = NV_3D0_WRITE;
1722             }
1723         }
1724         break;
1725     case NV_3D0_WRITE:
1726         quirk->data.flags = NV_3D0_NONE;
1727         if (addr == quirk->data.data_offset) {
1728             vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
1729             trace_vfio_nvidia_3d0_quirk_write(data, size);
1730             return;
1731         }
1732         break;
1733     }
1734
1735     vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1736                    addr + quirk->data.base_offset, data, size);
1737 }
1738
1739 static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
1740     .read = vfio_nvidia_3d0_quirk_read,
1741     .write = vfio_nvidia_3d0_quirk_write,
1742     .endianness = DEVICE_LITTLE_ENDIAN,
1743 };
1744
1745 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
1746 {
1747     PCIDevice *pdev = &vdev->pdev;
1748     VFIOQuirk *quirk;
1749
1750     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
1751         !vdev->bars[1].region.size) {
1752         return;
1753     }
1754
1755     quirk = g_malloc0(sizeof(*quirk));
1756     quirk->vdev = vdev;
1757     quirk->data.base_offset = 0x10;
1758     quirk->data.address_offset = 4;
1759     quirk->data.address_size = 2;
1760     quirk->data.address_match = 0x1800;
1761     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1762     quirk->data.data_offset = 0;
1763     quirk->data.data_size = 4;
1764
1765     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
1766                           quirk, "vfio-nvidia-3d0-quirk", 6);
1767     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1768                                 quirk->data.base_offset, &quirk->mem);
1769
1770     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1771                       quirk, next);
1772
1773     trace_vfio_vga_probe_nvidia_3d0_quirk(vdev->vbasedev.name);
1774 }
1775
/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800, but also all of PCI
 * config space, including extended space, which is available in the 4k
 * @0x88000.
 */
/*
 * Flag bits kept in quirk->data.flags by the BAR5 window quirk.  The probe
 * code requires all three (NV_BAR5_VALID) in read_flags/write_flags.
 */
enum {
    NV_BAR5_ADDRESS = 1 << 0, /* address register points into a config mirror */
    NV_BAR5_ENABLE  = 1 << 1, /* bit 0 of the register at offset 0x4 is set */
    NV_BAR5_MASTER  = 1 << 2, /* bit 0 of the register at offset 0x0 is set */
    NV_BAR5_VALID   = NV_BAR5_ADDRESS | NV_BAR5_ENABLE | NV_BAR5_MASTER,
};
1789
1790 static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
1791                                                 uint64_t data, unsigned size)
1792 {
1793     VFIOQuirk *quirk = opaque;
1794
1795     switch (addr) {
1796     case 0x0:
1797         if (data & 0x1) {
1798             quirk->data.flags |= NV_BAR5_MASTER;
1799         } else {
1800             quirk->data.flags &= ~NV_BAR5_MASTER;
1801         }
1802         break;
1803     case 0x4:
1804         if (data & 0x1) {
1805             quirk->data.flags |= NV_BAR5_ENABLE;
1806         } else {
1807             quirk->data.flags &= ~NV_BAR5_ENABLE;
1808         }
1809         break;
1810     case 0x8:
1811         if (quirk->data.flags & NV_BAR5_MASTER) {
1812             if ((data & ~0xfff) == 0x88000) {
1813                 quirk->data.flags |= NV_BAR5_ADDRESS;
1814                 quirk->data.address_val = data & 0xfff;
1815             } else if ((data & ~0xff) == 0x1800) {
1816                 quirk->data.flags |= NV_BAR5_ADDRESS;
1817                 quirk->data.address_val = data & 0xff;
1818             } else {
1819                 quirk->data.flags &= ~NV_BAR5_ADDRESS;
1820             }
1821         }
1822         break;
1823     }
1824
1825     vfio_generic_window_quirk_write(opaque, addr, data, size);
1826 }
1827
1828 static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
1829     .read = vfio_generic_window_quirk_read,
1830     .write = vfio_nvidia_bar5_window_quirk_write,
1831     .valid.min_access_size = 4,
1832     .endianness = DEVICE_LITTLE_ENDIAN,
1833 };
1834
1835 static void vfio_probe_nvidia_bar5_window_quirk(VFIOPCIDevice *vdev, int nr)
1836 {
1837     PCIDevice *pdev = &vdev->pdev;
1838     VFIOQuirk *quirk;
1839
1840     if (!vdev->has_vga || nr != 5 ||
1841         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1842         return;
1843     }
1844
1845     quirk = g_malloc0(sizeof(*quirk));
1846     quirk->vdev = vdev;
1847     quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
1848     quirk->data.address_offset = 0x8;
1849     quirk->data.address_size = 0; /* actually 4, but avoids generic code */
1850     quirk->data.data_offset = 0xc;
1851     quirk->data.data_size = 4;
1852     quirk->data.bar = nr;
1853
1854     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1855                           &vfio_nvidia_bar5_window_quirk, quirk,
1856                           "vfio-nvidia-bar5-window-quirk", 16);
1857     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1858                                         0, &quirk->mem, 1);
1859
1860     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1861
1862     trace_vfio_probe_nvidia_bar5_window_quirk(vdev->vbasedev.name);
1863 }
1864
1865 static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
1866                                           uint64_t data, unsigned size)
1867 {
1868     VFIOQuirk *quirk = opaque;
1869     VFIOPCIDevice *vdev = quirk->vdev;
1870     PCIDevice *pdev = &vdev->pdev;
1871     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1872
1873     vfio_generic_quirk_write(opaque, addr, data, size);
1874
1875     /*
1876      * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
1877      * MSI capability ID register.  Both the ID and next register are
1878      * read-only, so we allow writes covering either of those to real hw.
1879      * NB - only fixed for the 0x88000 MMIO window.
1880      */
1881     if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
1882         vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
1883         vfio_region_write(&vdev->bars[quirk->data.bar].region,
1884                           addr + base, data, size);
1885     }
1886 }
1887
1888 static const MemoryRegionOps vfio_nvidia_88000_quirk = {
1889     .read = vfio_generic_quirk_read,
1890     .write = vfio_nvidia_88000_quirk_write,
1891     .endianness = DEVICE_LITTLE_ENDIAN,
1892 };
1893
1894 /*
1895  * Finally, BAR0 itself.  We want to redirect any accesses to either
1896  * 0x1800 or 0x88000 through the PCI config space access functions.
1897  *
1898  * NB - quirk at a page granularity or else they don't seem to work when
1899  *      BARs are mmap'd
1900  *
1901  * Here's offset 0x88000...
1902  */
1903 static void vfio_probe_nvidia_bar0_88000_quirk(VFIOPCIDevice *vdev, int nr)
1904 {
1905     PCIDevice *pdev = &vdev->pdev;
1906     VFIOQuirk *quirk;
1907     uint16_t vendor, class;
1908
1909     vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
1910     class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
1911
1912     if (nr != 0 || vendor != PCI_VENDOR_ID_NVIDIA ||
1913         class != PCI_CLASS_DISPLAY_VGA) {
1914         return;
1915     }
1916
1917     quirk = g_malloc0(sizeof(*quirk));
1918     quirk->vdev = vdev;
1919     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1920     quirk->data.address_match = 0x88000;
1921     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1922     quirk->data.bar = nr;
1923
1924     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
1925                           quirk, "vfio-nvidia-bar0-88000-quirk",
1926                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1927     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1928                           quirk->data.address_match & TARGET_PAGE_MASK,
1929                           &quirk->mem, 1);
1930
1931     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1932
1933     trace_vfio_probe_nvidia_bar0_88000_quirk(vdev->vbasedev.name);
1934 }
1935
1936 /*
1937  * And here's the same for BAR0 offset 0x1800...
1938  */
1939 static void vfio_probe_nvidia_bar0_1800_quirk(VFIOPCIDevice *vdev, int nr)
1940 {
1941     PCIDevice *pdev = &vdev->pdev;
1942     VFIOQuirk *quirk;
1943
1944     if (!vdev->has_vga || nr != 0 ||
1945         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1946         return;
1947     }
1948
1949     /* Log the chipset ID */
1950     trace_vfio_probe_nvidia_bar0_1800_quirk_id(
1951             (unsigned int)(vfio_region_read(&vdev->bars[0].region, 0, 4) >> 20)
1952             & 0xff);
1953
1954     quirk = g_malloc0(sizeof(*quirk));
1955     quirk->vdev = vdev;
1956     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1957     quirk->data.address_match = 0x1800;
1958     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1959     quirk->data.bar = nr;
1960
1961     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1962                           "vfio-nvidia-bar0-1800-quirk",
1963                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1964     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1965                           quirk->data.address_match & TARGET_PAGE_MASK,
1966                           &quirk->mem, 1);
1967
1968     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1969
1970     trace_vfio_probe_nvidia_bar0_1800_quirk(vdev->vbasedev.name);
1971 }
1972
1973 /*
1974  * TODO - Some Nvidia devices provide config access to their companion HDA
1975  * device and even to their parent bridge via these config space mirrors.
1976  * Add quirks for those regions.
1977  */
1978
/*
 * Common quirk probe entry points.
 */

/*
 * Run every VGA region quirk probe against the device.  Each probe decides
 * for itself whether it applies (the NVIDIA one checks the vendor ID) and
 * returns without effect otherwise.
 */
static void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}
1987
1988 static void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev)
1989 {
1990     VFIOQuirk *quirk;
1991     int i;
1992
1993     for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1994         QLIST_FOREACH(quirk, &vdev->vga.region[i].quirks, next) {
1995             memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
1996         }
1997     }
1998 }
1999
2000 static void vfio_vga_quirk_free(VFIOPCIDevice *vdev)
2001 {
2002     int i;
2003
2004     for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
2005         while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
2006             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
2007             object_unparent(OBJECT(&quirk->mem));
2008             QLIST_REMOVE(quirk, next);
2009             g_free(quirk);
2010         }
2011     }
2012 }
2013
/*
 * Run every BAR quirk probe for BAR @nr.  Each probe checks whether it
 * applies to this device/BAR and returns without effect otherwise; probes
 * that match insert their quirk at the head of the BAR's quirk list.
 */
static void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
{
    vfio_probe_ati_bar4_window_quirk(vdev, nr);
    vfio_probe_ati_bar2_4000_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
    vfio_probe_rtl8168_bar2_window_quirk(vdev, nr);
}
2023
2024 static void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr)
2025 {
2026     VFIOBAR *bar = &vdev->bars[nr];
2027     VFIOQuirk *quirk;
2028
2029     QLIST_FOREACH(quirk, &bar->quirks, next) {
2030         memory_region_del_subregion(&bar->region.mem, &quirk->mem);
2031     }
2032 }
2033
2034 static void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr)
2035 {
2036     VFIOBAR *bar = &vdev->bars[nr];
2037
2038     while (!QLIST_EMPTY(&bar->quirks)) {
2039         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
2040         object_unparent(OBJECT(&quirk->mem));
2041         QLIST_REMOVE(quirk, next);
2042         g_free(quirk);
2043     }
2044 }
2045
2046 /*
2047  * PCI config space
2048  */
2049 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
2050 {
2051     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
2052     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
2053
2054     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
2055     emu_bits = le32_to_cpu(emu_bits);
2056
2057     if (emu_bits) {
2058         emu_val = pci_default_read_config(pdev, addr, len);
2059     }
2060
2061     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
2062         ssize_t ret;
2063
2064         ret = pread(vdev->vbasedev.fd, &phys_val, len,
2065                     vdev->config_offset + addr);
2066         if (ret != len) {
2067             error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
2068                          __func__, vdev->host.domain, vdev->host.bus,
2069                          vdev->host.slot, vdev->host.function, addr, len);
2070             return -errno;
2071         }
2072         phys_val = le32_to_cpu(phys_val);
2073     }
2074
2075     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
2076
2077     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
2078
2079     return val;
2080 }
2081
2082 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
2083                                   uint32_t val, int len)
2084 {
2085     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
2086     uint32_t val_le = cpu_to_le32(val);
2087
2088     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
2089
2090     /* Write everything to VFIO, let it filter out what we can't write */
2091     if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
2092                 != len) {
2093         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
2094                      __func__, vdev->host.domain, vdev->host.bus,
2095                      vdev->host.slot, vdev->host.function, addr, val, len);
2096     }
2097
2098     /* MSI/MSI-X Enabling/Disabling */
2099     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
2100         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
2101         int is_enabled, was_enabled = msi_enabled(pdev);
2102
2103         pci_default_write_config(pdev, addr, val, len);
2104
2105         is_enabled = msi_enabled(pdev);
2106
2107         if (!was_enabled) {
2108             if (is_enabled) {
2109                 vfio_msi_enable(vdev);
2110             }
2111         } else {
2112             if (!is_enabled) {
2113                 vfio_msi_disable(vdev);
2114             } else {
2115                 vfio_update_msi(vdev);
2116             }
2117         }
2118     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
2119         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
2120         int is_enabled, was_enabled = msix_enabled(pdev);
2121
2122         pci_default_write_config(pdev, addr, val, len);
2123
2124         is_enabled = msix_enabled(pdev);
2125
2126         if (!was_enabled && is_enabled) {
2127             vfio_msix_enable(vdev);
2128         } else if (was_enabled && !is_enabled) {
2129             vfio_msix_disable(vdev);
2130         }
2131     } else {
2132         /* Write everything to QEMU to keep emulated bits correct */
2133         pci_default_write_config(pdev, addr, val, len);
2134     }
2135 }
2136
2137 /*
2138  * Interrupt setup
2139  */
2140 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
2141 {
2142     /*
2143      * More complicated than it looks.  Disabling MSI/X transitions the
2144      * device to INTx mode (if supported).  Therefore we need to first
2145      * disable MSI/X and then cleanup by disabling INTx.
2146      */
2147     if (vdev->interrupt == VFIO_INT_MSIX) {
2148         vfio_msix_disable(vdev);
2149     } else if (vdev->interrupt == VFIO_INT_MSI) {
2150         vfio_msi_disable(vdev);
2151     }
2152
2153     if (vdev->interrupt == VFIO_INT_INTx) {
2154         vfio_intx_disable(vdev);
2155     }
2156 }
2157
2158 static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos)
2159 {
2160     uint16_t ctrl;
2161     bool msi_64bit, msi_maskbit;
2162     int ret, entries;
2163
2164     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
2165               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2166         return -errno;
2167     }
2168     ctrl = le16_to_cpu(ctrl);
2169
2170     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
2171     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
2172     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
2173
2174     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
2175
2176     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
2177     if (ret < 0) {
2178         if (ret == -ENOTSUP) {
2179             return 0;
2180         }
2181         error_report("vfio: msi_init failed");
2182         return ret;
2183     }
2184     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
2185
2186     return 0;
2187 }
2188
2189 /*
2190  * We don't have any control over how pci_add_capability() inserts
2191  * capabilities into the chain.  In order to setup MSI-X we need a
2192  * MemoryRegion for the BAR.  In order to setup the BAR and not
2193  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
2194  * need to first look for where the MSI-X table lives.  So we
2195  * unfortunately split MSI-X setup across two functions.
2196  */
2197 static int vfio_msix_early_setup(VFIOPCIDevice *vdev)
2198 {
2199     uint8_t pos;
2200     uint16_t ctrl;
2201     uint32_t table, pba;
2202     int fd = vdev->vbasedev.fd;
2203     VFIOMSIXInfo *msix;
2204
2205     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
2206     if (!pos) {
2207         return 0;
2208     }
2209
2210     if (pread(fd, &ctrl, sizeof(ctrl),
2211               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2212         return -errno;
2213     }
2214
2215     if (pread(fd, &table, sizeof(table),
2216               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
2217         return -errno;
2218     }
2219
2220     if (pread(fd, &pba, sizeof(pba),
2221               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
2222         return -errno;
2223     }
2224
2225     ctrl = le16_to_cpu(ctrl);
2226     table = le32_to_cpu(table);
2227     pba = le32_to_cpu(pba);
2228
2229     msix = g_malloc0(sizeof(*msix));
2230     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
2231     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
2232     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
2233     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
2234     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
2235
2236     /*
2237      * Test the size of the pba_offset variable and catch if it extends outside
2238      * of the specified BAR. If it is the case, we need to apply a hardware
2239      * specific quirk if the device is known or we have a broken configuration.
2240      */
2241     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
2242         PCIDevice *pdev = &vdev->pdev;
2243         uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
2244         uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID);
2245
2246         /*
2247          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
2248          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
2249          * the VF PBA offset while the BAR itself is only 8k. The correct value
2250          * is 0x1000, so we hard code that here.
2251          */
2252         if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) {
2253             msix->pba_offset = 0x1000;
2254         } else {
2255             error_report("vfio: Hardware reports invalid configuration, "
2256                          "MSIX PBA outside of specified BAR");
2257             g_free(msix);
2258             return -EINVAL;
2259         }
2260     }
2261
2262     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
2263                                 msix->table_offset, msix->entries);
2264     vdev->msix = msix;
2265
2266     return 0;
2267 }
2268
2269 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos)
2270 {
2271     int ret;
2272
2273     ret = msix_init(&vdev->pdev, vdev->msix->entries,
2274                     &vdev->bars[vdev->msix->table_bar].region.mem,
2275                     vdev->msix->table_bar, vdev->msix->table_offset,
2276                     &vdev->bars[vdev->msix->pba_bar].region.mem,
2277                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
2278     if (ret < 0) {
2279         if (ret == -ENOTSUP) {
2280             return 0;
2281         }
2282         error_report("vfio: msix_init failed");
2283         return ret;
2284     }
2285
2286     return 0;
2287 }
2288
2289 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
2290 {
2291     msi_uninit(&vdev->pdev);
2292
2293     if (vdev->msix) {
2294         msix_uninit(&vdev->pdev,
2295                     &vdev->bars[vdev->msix->table_bar].region.mem,
2296                     &vdev->bars[vdev->msix->pba_bar].region.mem);
2297     }
2298 }
2299
2300 /*
2301  * Resource setup
2302  */
2303 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
2304 {
2305     int i;
2306
2307     for (i = 0; i < PCI_ROM_SLOT; i++) {
2308         VFIOBAR *bar = &vdev->bars[i];
2309
2310         if (!bar->region.size) {
2311             continue;
2312         }
2313
2314         memory_region_set_enabled(&bar->region.mmap_mem, enabled);
2315         if (vdev->msix && vdev->msix->table_bar == i) {
2316             memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
2317         }
2318     }
2319 }
2320
2321 static void vfio_unregister_bar(VFIOPCIDevice *vdev, int nr)
2322 {
2323     VFIOBAR *bar = &vdev->bars[nr];
2324
2325     if (!bar->region.size) {
2326         return;
2327     }
2328
2329     vfio_bar_quirk_teardown(vdev, nr);
2330
2331     memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem);
2332
2333     if (vdev->msix && vdev->msix->table_bar == nr) {
2334         memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem);
2335     }
2336 }
2337
2338 static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr)
2339 {
2340     VFIOBAR *bar = &vdev->bars[nr];
2341
2342     if (!bar->region.size) {
2343         return;
2344     }
2345
2346     vfio_bar_quirk_free(vdev, nr);
2347
2348     munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem));
2349
2350     if (vdev->msix && vdev->msix->table_bar == nr) {
2351         munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
2352     }
2353 }
2354
/*
 * Register BAR @nr with the PCI core and set up its memory regions.
 *
 * A "slow" I/O region backed by vfio_region_ops covers the whole BAR.  On
 * top of it an mmap subregion is requested; for the BAR that holds the
 * MSI-X table, the mapping is split into a part below the table and a
 * part above it, both host-page aligned.  Finally the BAR quirks are
 * probed.
 *
 * Nothing is done when the BAR has zero size or when the BAR register
 * cannot be read from the device.
 */
static void vfio_map_bar(VFIOPCIDevice *vdev, int nr)
{
    VFIOBAR *bar = &vdev->bars[nr];
    uint64_t size = bar->region.size;
    char name[64];
    uint32_t pci_bar;
    uint8_t type;
    int ret;

    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
    if (!size) {
        return;
    }

    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
             vdev->host.domain, vdev->host.bus, vdev->host.slot,
             vdev->host.function, nr);

    /* Determine what type of BAR this is for registration */
    ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
    if (ret != sizeof(pci_bar)) {
        error_report("vfio: Failed to read BAR %d (%m)", nr);
        return;
    }

    pci_bar = le32_to_cpu(pci_bar);
    bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
    bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
    /* Keep only the flag bits of the BAR register, not the address bits */
    type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
                                    ~PCI_BASE_ADDRESS_MEM_MASK);

    /* A "slow" read/write mapping underlies all BARs */
    memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops,
                          bar, name, size);
    pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem);

    /*
     * We can't mmap areas overlapping the MSIX vector table, so we
     * potentially insert a direct-mapped subregion before and after it.
     */
    if (vdev->msix && vdev->msix->table_bar == nr) {
        /* Lower mapping ends at the host page containing the table start */
        size = vdev->msix->table_offset & qemu_real_host_page_mask;
    }

    /* The region name is extended in place: "<bar name> mmap" */
    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
    if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
                      &bar->region.mmap_mem, &bar->region.mmap,
                      size, 0, name)) {
        error_report("%s unsupported. Performance may be slow", name);
    }

    if (vdev->msix && vdev->msix->table_bar == nr) {
        uint64_t start;

        /* Upper mapping starts at the first host page past the table */
        start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
                                     (vdev->msix->entries *
                                      PCI_MSIX_ENTRY_SIZE));

        size = start < bar->region.size ? bar->region.size - start : 0;
        /* Name becomes "<bar name> mmap msix-hi" */
        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
        /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
        if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
                          &vdev->msix->mmap_mem,
                          &vdev->msix->mmap, size, start, name)) {
            error_report("%s unsupported. Performance may be slow", name);
        }
    }

    vfio_bar_quirk_setup(vdev, nr);
}
2426
2427 static void vfio_map_bars(VFIOPCIDevice *vdev)
2428 {
2429     int i;
2430
2431     for (i = 0; i < PCI_ROM_SLOT; i++) {
2432         vfio_map_bar(vdev, i);
2433     }
2434
2435     if (vdev->has_vga) {
2436         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2437                               OBJECT(vdev), &vfio_vga_ops,
2438                               &vdev->vga.region[QEMU_PCI_VGA_MEM],
2439                               "vfio-vga-mmio@0xa0000",
2440                               QEMU_PCI_VGA_MEM_SIZE);
2441         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2442                               OBJECT(vdev), &vfio_vga_ops,
2443                               &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
2444                               "vfio-vga-io@0x3b0",
2445                               QEMU_PCI_VGA_IO_LO_SIZE);
2446         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
2447                               OBJECT(vdev), &vfio_vga_ops,
2448                               &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
2449                               "vfio-vga-io@0x3c0",
2450                               QEMU_PCI_VGA_IO_HI_SIZE);
2451
2452         pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2453                          &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2454                          &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2455         vfio_vga_quirk_setup(vdev);
2456     }
2457 }
2458
2459 static void vfio_unregister_bars(VFIOPCIDevice *vdev)
2460 {
2461     int i;
2462
2463     for (i = 0; i < PCI_ROM_SLOT; i++) {
2464         vfio_unregister_bar(vdev, i);
2465     }
2466
2467     if (vdev->has_vga) {
2468         vfio_vga_quirk_teardown(vdev);
2469         pci_unregister_vga(&vdev->pdev);
2470     }
2471 }
2472
2473 static void vfio_unmap_bars(VFIOPCIDevice *vdev)
2474 {
2475     int i;
2476
2477     for (i = 0; i < PCI_ROM_SLOT; i++) {
2478         vfio_unmap_bar(vdev, i);
2479     }
2480
2481     if (vdev->has_vga) {
2482         vfio_vga_quirk_free(vdev);
2483     }
2484 }
2485
2486 /*
2487  * General setup
2488  */
2489 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2490 {
2491     uint8_t tmp, next = 0xff;
2492
2493     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2494          tmp = pdev->config[tmp + 1]) {
2495         if (tmp > pos && tmp < next) {
2496             next = tmp;
2497         }
2498     }
2499
2500     return next - pos;
2501 }
2502
/*
 * Update the 16-bit word at @buf: clear the bits selected by @mask, then
 * OR in @val.  @val is not masked before being applied.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t current = pci_get_word(buf);

    pci_set_word(buf, (current & ~mask) | val);
}
2507
2508 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
2509                                    uint16_t val, uint16_t mask)
2510 {
2511     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2512     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2513     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2514 }
2515
/*
 * Update the 32-bit value at @buf: clear the bits selected by @mask, then
 * OR in @val.  @val is not masked before being applied.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t current = pci_get_long(buf);

    pci_set_long(buf, (current & ~mask) | val);
}
2520
2521 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
2522                                    uint32_t val, uint32_t mask)
2523 {
2524     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2525     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2526     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2527 }
2528
/*
 * Add the PCI Express capability found at config offset @pos (of @size
 * bytes) to the emulated device, adjusting it to the bus the device is
 * attached to:
 *  - on a non-express bus the capability is exposed unchanged;
 *  - on an express root bus, endpoints are presented as Root Complex
 *    Integrated Endpoints with the link registers zeroed, and legacy
 *    endpoints get no capability at all;
 *  - on any other express bus, Root Complex Integrated Endpoints are
 *    presented as regular endpoints with made-up link registers, and the
 *    link status width/speed bits are emulated.
 *
 * Returns the value of pci_add_capability() (stored in exp.exp_cap when
 * non-negative), 0 when the capability is deliberately skipped, or -EINVAL
 * for PCIe device types that are not supported.
 */
static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
{
    uint16_t flags;
    uint8_t type;

    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;

    if (type != PCI_EXP_TYPE_ENDPOINT &&
        type != PCI_EXP_TYPE_LEG_END &&
        type != PCI_EXP_TYPE_RC_END) {

        error_report("vfio: Assignment of PCIe type 0x%x "
                     "devices is not currently supported", type);
        return -EINVAL;
    }

    if (!pci_bus_is_express(vdev->pdev.bus)) {
        /*
         * Use express capability as-is on PCI bus.  It doesn't make much
         * sense to even expose, but some drivers (ex. tg3) depend on it
         * and guests don't seem to be particular about it.  We'll need
         * to revisit this or force express devices to express buses if we
         * ever expose an IOMMU to the guest.
         */
    } else if (pci_bus_is_root(vdev->pdev.bus)) {
        /*
         * On a Root Complex bus Endpoints become Root Complex Integrated
         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
         */
        if (type == PCI_EXP_TYPE_ENDPOINT) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_RC_END << 4,
                                   PCI_EXP_FLAGS_TYPE);

            /* Link Capabilities, Status, and Control goes away */
            if (size > PCI_EXP_LNKCTL) {
                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);

/* Fallback register offsets, used only if the headers don't define them */
#ifndef PCI_EXP_LNKCAP2
#define PCI_EXP_LNKCAP2 44
#endif
#ifndef PCI_EXP_LNKSTA2
#define PCI_EXP_LNKSTA2 50
#endif
                /* Link 2 Capabilities, Status, and Control goes away */
                if (size > PCI_EXP_LNKCAP2) {
                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
                }
            }

        } else if (type == PCI_EXP_TYPE_LEG_END) {
            /*
             * Legacy endpoints don't belong on the root complex.  Windows
             * seems to be happier with devices if we skip the capability.
             */
            return 0;
        }

    } else {
        /*
         * Convert Root Complex Integrated Endpoints to regular endpoints.
         * These devices don't support LNK/LNK2 capabilities, so make them up.
         */
        if (type == PCI_EXP_TYPE_RC_END) {
            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
                                   PCI_EXP_TYPE_ENDPOINT << 4,
                                   PCI_EXP_FLAGS_TYPE);
            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
                                   PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
        }

        /* Mark the Link Status bits as emulated to allow virtual negotiation */
        vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
                               pci_get_word(vdev->pdev.config + pos +
                                            PCI_EXP_LNKSTA),
                               PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
    }

    /* From here on, pos holds the result of adding the capability */
    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
    if (pos >= 0) {
        vdev->pdev.exp.exp_cap = pos;
    }

    return pos;
}
2620
2621 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
2622 {
2623     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2624
2625     if (cap & PCI_EXP_DEVCAP_FLR) {
2626         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
2627         vdev->has_flr = true;
2628     }
2629 }
2630
2631 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
2632 {
2633     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2634
2635     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2636         trace_vfio_check_pm_reset(vdev->vbasedev.name);
2637         vdev->has_pm_reset = true;
2638     }
2639 }
2640
2641 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
2642 {
2643     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2644
2645     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2646         trace_vfio_check_af_flr(vdev->vbasedev.name);
2647         vdev->has_flr = true;
2648     }
2649 }
2650
2651 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
2652 {
2653     PCIDevice *pdev = &vdev->pdev;
2654     uint8_t cap_id, next, size;
2655     int ret;
2656
2657     cap_id = pdev->config[pos];
2658     next = pdev->config[pos + 1];
2659
2660     /*
2661      * If it becomes important to configure capabilities to their actual
2662      * size, use this as the default when it's something we don't recognize.
2663      * Since QEMU doesn't actually handle many of the config accesses,
2664      * exact size doesn't seem worthwhile.
2665      */
2666     size = vfio_std_cap_max_size(pdev, pos);
2667
2668     /*
2669      * pci_add_capability always inserts the new capability at the head
2670      * of the chain.  Therefore to end up with a chain that matches the
2671      * physical device, we insert from the end by making this recursive.
2672      * This is also why we pre-caclulate size above as cached config space
2673      * will be changed as we unwind the stack.
2674      */
2675     if (next) {
2676         ret = vfio_add_std_cap(vdev, next);
2677         if (ret) {
2678             return ret;
2679         }
2680     } else {
2681         /* Begin the rebuild, use QEMU emulated list bits */
2682         pdev->config[PCI_CAPABILITY_LIST] = 0;
2683         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2684         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2685     }
2686
2687     /* Use emulated next pointer to allow dropping caps */
2688     pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
2689
2690     switch (cap_id) {
2691     case PCI_CAP_ID_MSI:
2692         ret = vfio_msi_setup(vdev, pos);
2693         break;
2694     case PCI_CAP_ID_EXP:
2695         vfio_check_pcie_flr(vdev, pos);
2696         ret = vfio_setup_pcie_cap(vdev, pos, size);
2697         break;
2698     case PCI_CAP_ID_MSIX:
2699         ret = vfio_msix_setup(vdev, pos);
2700         break;
2701     case PCI_CAP_ID_PM:
2702         vfio_check_pm_reset(vdev, pos);
2703         vdev->pm_cap = pos;
2704         ret = pci_add_capability(pdev, cap_id, pos, size);
2705         break;
2706     case PCI_CAP_ID_AF:
2707         vfio_check_af_flr(vdev, pos);
2708         ret = pci_add_capability(pdev, cap_id, pos, size);
2709         break;
2710     default:
2711         ret = pci_add_capability(pdev, cap_id, pos, size);
2712         break;
2713     }
2714
2715     if (ret < 0) {
2716         error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
2717                      "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
2718                      vdev->host.bus, vdev->host.slot, vdev->host.function,
2719                      cap_id, size, pos, ret);
2720         return ret;
2721     }
2722
2723     return 0;
2724 }
2725
2726 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
2727 {
2728     PCIDevice *pdev = &vdev->pdev;
2729
2730     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2731         !pdev->config[PCI_CAPABILITY_LIST]) {
2732         return 0; /* Nothing to add */
2733     }
2734
2735     return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
2736 }
2737
2738 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2739 {
2740     PCIDevice *pdev = &vdev->pdev;
2741     uint16_t cmd;
2742
2743     vfio_disable_interrupts(vdev);
2744
2745     /* Make sure the device is in D0 */
2746     if (vdev->pm_cap) {
2747         uint16_t pmcsr;
2748         uint8_t state;
2749
2750         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2751         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2752         if (state) {
2753             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2754             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2755             /* vfio handles the necessary delay here */
2756             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2757             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2758             if (state) {
2759                 error_report("vfio: Unable to power on device, stuck in D%d",
2760                              state);
2761             }
2762         }
2763     }
2764
2765     /*
2766      * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
2767      * Also put INTx Disable in known state.
2768      */
2769     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2770     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2771              PCI_COMMAND_INTX_DISABLE);
2772     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2773 }
2774
2775 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2776 {
2777     vfio_intx_enable(vdev);
2778 }
2779
2780 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
2781                                 PCIHostDeviceAddress *host2)
2782 {
2783     return (host1->domain == host2->domain && host1->bus == host2->bus &&
2784             host1->slot == host2->slot && host1->function == host2->function);
2785 }
2786
2787 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2788 {
2789     VFIOGroup *group;
2790     struct vfio_pci_hot_reset_info *info;
2791     struct vfio_pci_dependent_device *devices;
2792     struct vfio_pci_hot_reset *reset;
2793     int32_t *fds;
2794     int ret, i, count;
2795     bool multi = false;
2796
2797     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
2798
2799     vfio_pci_pre_reset(vdev);
2800     vdev->vbasedev.needs_reset = false;
2801
2802     info = g_malloc0(sizeof(*info));
2803     info->argsz = sizeof(*info);
2804
2805     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2806     if (ret && errno != ENOSPC) {
2807         ret = -errno;
2808         if (!vdev->has_pm_reset) {
2809             error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
2810                          "no available reset mechanism.", vdev->host.domain,
2811                          vdev->host.bus, vdev->host.slot, vdev->host.function);
2812         }
2813         goto out_single;
2814     }
2815
2816     count = info->count;
2817     info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
2818     info->argsz = sizeof(*info) + (count * sizeof(*devices));
2819     devices = &info->devices[0];
2820
2821     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2822     if (ret) {
2823         ret = -errno;
2824         error_report("vfio: hot reset info failed: %m");
2825         goto out_single;
2826     }
2827
2828     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
2829
2830     /* Verify that we have all the groups required */
2831     for (i = 0; i < info->count; i++) {
2832         PCIHostDeviceAddress host;
2833         VFIOPCIDevice *tmp;
2834         VFIODevice *vbasedev_iter;
2835
2836         host.domain = devices[i].segment;
2837         host.bus = devices[i].bus;
2838         host.slot = PCI_SLOT(devices[i].devfn);
2839         host.function = PCI_FUNC(devices[i].devfn);
2840
2841         trace_vfio_pci_hot_reset_dep_devices(host.domain,
2842                 host.bus, host.slot, host.function, devices[i].group_id);
2843
2844         if (vfio_pci_host_match(&host, &vdev->host)) {
2845             continue;
2846         }
2847
2848         QLIST_FOREACH(group, &vfio_group_list, next) {
2849             if (group->groupid == devices[i].group_id) {
2850                 break;
2851             }
2852         }
2853
2854         if (!group) {
2855             if (!vdev->has_pm_reset) {
2856                 error_report("vfio: Cannot reset device %s, "
2857                              "depends on group %d which is not owned.",
2858                              vdev->vbasedev.name, devices[i].group_id);
2859             }
2860             ret = -EPERM;
2861             goto out;
2862         }
2863
2864         /* Prep dependent devices for reset and clear our marker. */
2865         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2866             if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2867                 continue;
2868             }
2869             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2870             if (vfio_pci_host_match(&host, &tmp->host)) {
2871                 if (single) {
2872                     ret = -EINVAL;
2873                     goto out_single;
2874                 }
2875                 vfio_pci_pre_reset(tmp);
2876                 tmp->vbasedev.needs_reset = false;
2877                 multi = true;
2878                 break;
2879             }
2880         }
2881     }
2882
2883     if (!single && !multi) {
2884         ret = -EINVAL;
2885         goto out_single;
2886     }
2887
2888     /* Determine how many group fds need to be passed */
2889     count = 0;
2890     QLIST_FOREACH(group, &vfio_group_list, next) {
2891         for (i = 0; i < info->count; i++) {
2892             if (group->groupid == devices[i].group_id) {
2893                 count++;
2894                 break;
2895             }
2896         }
2897     }
2898
2899     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
2900     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
2901     fds = &reset->group_fds[0];
2902
2903     /* Fill in group fds */
2904     QLIST_FOREACH(group, &vfio_group_list, next) {
2905         for (i = 0; i < info->count; i++) {
2906             if (group->groupid == devices[i].group_id) {
2907                 fds[reset->count++] = group->fd;
2908                 break;
2909             }
2910         }
2911     }
2912
2913     /* Bus reset! */
2914     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
2915     g_free(reset);
2916
2917     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
2918                                     ret ? "%m" : "Success");
2919
2920 out:
2921     /* Re-enable INTx on affected devices */
2922     for (i = 0; i < info->count; i++) {
2923         PCIHostDeviceAddress host;
2924         VFIOPCIDevice *tmp;
2925         VFIODevice *vbasedev_iter;
2926
2927         host.domain = devices[i].segment;
2928         host.bus = devices[i].bus;
2929         host.slot = PCI_SLOT(devices[i].devfn);
2930         host.function = PCI_FUNC(devices[i].devfn);
2931
2932         if (vfio_pci_host_match(&host, &vdev->host)) {
2933             continue;
2934         }
2935
2936         QLIST_FOREACH(group, &vfio_group_list, next) {
2937             if (group->groupid == devices[i].group_id) {
2938                 break;
2939             }
2940         }
2941
2942         if (!group) {
2943             break;
2944         }
2945
2946         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2947             if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2948                 continue;
2949             }
2950             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2951             if (vfio_pci_host_match(&host, &tmp->host)) {
2952                 vfio_pci_post_reset(tmp);
2953                 break;
2954             }
2955         }
2956     }
2957 out_single:
2958     vfio_pci_post_reset(vdev);
2959     g_free(info);
2960
2961     return ret;
2962 }
2963
2964 /*
2965  * We want to differentiate hot reset of multiple in-use devices vs hot reset
2966  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2967  * of doing hot resets when there is only a single device per bus.  The in-use
2968  * here refers to how many VFIODevices are affected.  A hot reset that affects
2969  * multiple devices, but only a single in-use device, means that we can call
2970  * it from our bus ->reset() callback since the extent is effectively a single
2971  * device.  This allows us to make use of it in the hotplug path.  When there
2972  * are multiple in-use devices, we can only trigger the hot reset during a
2973  * system reset and thus from our reset handler.  We separate _one vs _multi
2974  * here so that we don't overlap and do a double reset on the system reset
2975  * path where both our reset handler and ->reset() callback are used.  Calling
2976  * _one() will only do a hot reset for the one in-use devices case, calling
2977  * _multi() will do nothing if a _one() would have been sufficient.
2978  */
2979 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2980 {
2981     return vfio_pci_hot_reset(vdev, true);
2982 }
2983
2984 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2985 {
2986     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2987     return vfio_pci_hot_reset(vdev, false);
2988 }
2989
2990 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2991 {
2992     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2993     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2994         vbasedev->needs_reset = true;
2995     }
2996 }
2997
2998 static VFIODeviceOps vfio_pci_ops = {
2999     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
3000     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
3001     .vfio_eoi = vfio_intx_eoi,
3002 };
3003
3004 static int vfio_populate_device(VFIOPCIDevice *vdev)
3005 {
3006     VFIODevice *vbasedev = &vdev->vbasedev;
3007     struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
3008     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
3009     int i, ret = -1;
3010
3011     /* Sanity check device */
3012     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
3013         error_report("vfio: Um, this isn't a PCI device");
3014         goto error;
3015     }
3016
3017     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
3018         error_report("vfio: unexpected number of io regions %u",
3019                      vbasedev->num_regions);
3020         goto error;
3021     }
3022
3023     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
3024         error_report("vfio: unexpected number of irqs %u", vbasedev->num_irqs);
3025         goto error;
3026     }
3027
3028     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
3029         reg_info.index = i;
3030
3031         ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3032         if (ret) {
3033             error_report("vfio: Error getting region %d info: %m", i);
3034             goto error;
3035         }
3036
3037         trace_vfio_populate_device_region(vbasedev->name, i,
3038                                           (unsigned long)reg_info.size,
3039                                           (unsigned long)reg_info.offset,
3040                                           (unsigned long)reg_info.flags);
3041
3042         vdev->bars[i].region.vbasedev = vbasedev;
3043         vdev->bars[i].region.flags = reg_info.flags;
3044         vdev->bars[i].region.size = reg_info.size;
3045         vdev->bars[i].region.fd_offset = reg_info.offset;
3046         vdev->bars[i].region.nr = i;
3047         QLIST_INIT(&vdev->bars[i].quirks);
3048     }
3049
3050     reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
3051
3052     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
3053     if (ret) {
3054         error_report("vfio: Error getting config info: %m");
3055         goto error;
3056     }
3057
3058     trace_vfio_populate_device_config(vdev->vbasedev.name,
3059                                       (unsigned long)reg_info.size,
3060                                       (unsigned long)reg_info.offset,
3061                                       (unsigned long)reg_info.flags);
3062
3063     vdev->config_size = reg_info.size;
3064     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
3065         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
3066     }
3067     vdev->config_offset = reg_info.offset;
3068
3069     if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
3070         vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) {
3071         struct vfio_region_info vga_info = {
3072             .argsz = sizeof(vga_info),
3073             .index = VFIO_PCI_VGA_REGION_INDEX,
3074          };
3075
3076         ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
3077         if (ret) {
3078             error_report(
3079                 "vfio: Device does not support requested feature x-vga");
3080             goto error;
3081         }
3082
3083         if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
3084             !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
3085             vga_info.size < 0xbffff + 1) {
3086             error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
3087                          (unsigned long)vga_info.flags,
3088                          (unsigned long)vga_info.size);
3089             goto error;
3090         }
3091
3092         vdev->vga.fd_offset = vga_info.offset;
3093         vdev->vga.fd = vdev->vbasedev.fd;
3094
3095         vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
3096         vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
3097         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
3098
3099         vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
3100         vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
3101         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
3102
3103         vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
3104         vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
3105         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
3106
3107         vdev->has_vga = true;
3108     }
3109
3110     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
3111
3112     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
3113     if (ret) {
3114         /* This can fail for an old kernel or legacy PCI dev */
3115         trace_vfio_populate_device_get_irq_info_failure();
3116         ret = 0;
3117     } else if (irq_info.count == 1) {
3118         vdev->pci_aer = true;
3119     } else {
3120         error_report("vfio: %s "
3121                      "Could not enable error recovery for the device",
3122                      vbasedev->name);
3123     }
3124
3125 error:
3126     return ret;
3127 }
3128
3129 static void vfio_put_device(VFIOPCIDevice *vdev)
3130 {
3131     g_free(vdev->vbasedev.name);
3132     if (vdev->msix) {
3133         object_unparent(OBJECT(&vdev->msix->mmap_mem));
3134         g_free(vdev->msix);
3135         vdev->msix = NULL;
3136     }
3137     vfio_put_base_device(&vdev->vbasedev);
3138 }
3139
3140 static void vfio_err_notifier_handler(void *opaque)
3141 {
3142     VFIOPCIDevice *vdev = opaque;
3143
3144     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3145         return;
3146     }
3147
3148     /*
3149      * TBD. Retrieve the error details and decide what action
3150      * needs to be taken. One of the actions could be to pass
3151      * the error to the guest and have the guest driver recover
3152      * from the error. This requires that PCIe capabilities be
3153      * exposed to the guest. For now, we just terminate the
3154      * guest to contain the error.
3155      */
3156
3157     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
3158                  "Please collect any data possible and then kill the guest",
3159                  __func__, vdev->host.domain, vdev->host.bus,
3160                  vdev->host.slot, vdev->host.function);
3161
3162     vm_stop(RUN_STATE_INTERNAL_ERROR);
3163 }
3164
3165 /*
3166  * Registers error notifier for devices supporting error recovery.
3167  * If we encounter a failure in this function, we report an error
3168  * and continue after disabling error recovery support for the
3169  * device.
3170  */
3171 static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
3172 {
3173     int ret;
3174     int argsz;
3175     struct vfio_irq_set *irq_set;
3176     int32_t *pfd;
3177
3178     if (!vdev->pci_aer) {
3179         return;
3180     }
3181
3182     if (event_notifier_init(&vdev->err_notifier, 0)) {
3183         error_report("vfio: Unable to init event notifier for error detection");
3184         vdev->pci_aer = false;
3185         return;
3186     }
3187
3188     argsz = sizeof(*irq_set) + sizeof(*pfd);
3189
3190     irq_set = g_malloc0(argsz);
3191     irq_set->argsz = argsz;
3192     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3193                      VFIO_IRQ_SET_ACTION_TRIGGER;
3194     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3195     irq_set->start = 0;
3196     irq_set->count = 1;
3197     pfd = (int32_t *)&irq_set->data;
3198
3199     *pfd = event_notifier_get_fd(&vdev->err_notifier);
3200     qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
3201
3202     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
3203     if (ret) {
3204         error_report("vfio: Failed to set up error notification");
3205         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3206         event_notifier_cleanup(&vdev->err_notifier);
3207         vdev->pci_aer = false;
3208     }
3209     g_free(irq_set);
3210 }
3211
3212 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
3213 {
3214     int argsz;
3215     struct vfio_irq_set *irq_set;
3216     int32_t *pfd;
3217     int ret;
3218
3219     if (!vdev->pci_aer) {
3220         return;
3221     }
3222
3223     argsz = sizeof(*irq_set) + sizeof(*pfd);
3224
3225     irq_set = g_malloc0(argsz);
3226     irq_set->argsz = argsz;
3227     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3228                      VFIO_IRQ_SET_ACTION_TRIGGER;
3229     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3230     irq_set->start = 0;
3231     irq_set->count = 1;
3232     pfd = (int32_t *)&irq_set->data;
3233     *pfd = -1;
3234
3235     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
3236     if (ret) {
3237         error_report("vfio: Failed to de-assign error fd: %m");
3238     }
3239     g_free(irq_set);
3240     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3241                         NULL, NULL, vdev);
3242     event_notifier_cleanup(&vdev->err_notifier);
3243 }
3244
3245 static void vfio_req_notifier_handler(void *opaque)
3246 {
3247     VFIOPCIDevice *vdev = opaque;
3248
3249     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
3250         return;
3251     }
3252
3253     qdev_unplug(&vdev->pdev.qdev, NULL);
3254 }
3255
3256 static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
3257 {
3258     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
3259                                       .index = VFIO_PCI_REQ_IRQ_INDEX };
3260     int argsz;
3261     struct vfio_irq_set *irq_set;
3262     int32_t *pfd;
3263
3264     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
3265         return;
3266     }
3267
3268     if (ioctl(vdev->vbasedev.fd,
3269               VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
3270         return;
3271     }
3272
3273     if (event_notifier_init(&vdev->req_notifier, 0)) {
3274         error_report("vfio: Unable to init event notifier for device request");
3275         return;
3276     }
3277
3278     argsz = sizeof(*irq_set) + sizeof(*pfd);
3279
3280     irq_set = g_malloc0(argsz);
3281     irq_set->argsz = argsz;
3282     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3283                      VFIO_IRQ_SET_ACTION_TRIGGER;
3284     irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
3285     irq_set->start = 0;
3286     irq_set->count = 1;
3287     pfd = (int32_t *)&irq_set->data;
3288
3289     *pfd = event_notifier_get_fd(&vdev->req_notifier);
3290     qemu_set_fd_handler(*pfd, vfio_req_notifier_handler, NULL, vdev);
3291
3292     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
3293         error_report("vfio: Failed to set up device request notification");
3294         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3295         event_notifier_cleanup(&vdev->req_notifier);
3296     } else {
3297         vdev->req_enabled = true;
3298     }
3299
3300     g_free(irq_set);
3301 }
3302
3303 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
3304 {
3305     int argsz;
3306     struct vfio_irq_set *irq_set;
3307     int32_t *pfd;
3308
3309     if (!vdev->req_enabled) {
3310         return;
3311     }
3312
3313     argsz = sizeof(*irq_set) + sizeof(*pfd);
3314
3315     irq_set = g_malloc0(argsz);
3316     irq_set->argsz = argsz;
3317     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3318                      VFIO_IRQ_SET_ACTION_TRIGGER;
3319     irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
3320     irq_set->start = 0;
3321     irq_set->count = 1;
3322     pfd = (int32_t *)&irq_set->data;
3323     *pfd = -1;
3324
3325     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
3326         error_report("vfio: Failed to de-assign device request fd: %m");
3327     }
3328     g_free(irq_set);
3329     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
3330                         NULL, NULL, vdev);
3331     event_notifier_cleanup(&vdev->req_notifier);
3332
3333     vdev->req_enabled = false;
3334 }
3335
3336 /*
3337  * AMD Radeon PCI config reset, based on Linux:
3338  *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
3339  *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
3340  *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
3341  *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
3342  * IDs: include/drm/drm_pciids.h
3343  * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
3344  *
3345  * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
3346  * hardware that should be fixed on future ASICs.  The symptom of this is that
3347  * once the accelerated driver loads, Windows guests will bsod on subsequent
3348  * attempts to load the driver, such as after VM reset or shutdown/restart.  To
3349  * work around this, we do an AMD specific PCI config reset, followed by an SMC
3350  * reset.  The PCI config reset only works if SMC firmware is running, so we
3351  * have a dependency on the state of the device as to whether this reset will
3352  * be effective.  There are still cases where we won't be able to kick the
3353  * device into working, but this greatly improves the usability overall.  The
3354  * config reset magic is relatively common on AMD GPUs, but the setup and SMC
3355  * poking is largely ASIC specific.
3356  */
3357 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
3358 {
3359     uint32_t clk, pc_c;
3360
3361     /*
3362      * Registers 200h and 204h are index and data registers for accessing
3363      * indirect configuration registers within the device.
3364      */
3365     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
3366     clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3367     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
3368     pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3369
3370     return (!(clk & 1) && (0x20100 <= pc_c));
3371 }
3372
3373 /*
3374  * The scope of a config reset is controlled by a mode bit in the misc register
3375  * and a fuse, exposed as a bit in another register.  The fuse is the default
3376  * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the forumula
3377  * scope = !(misc ^ fuse), where the resulting scope is defined the same as
3378  * the fuse.  A truth table therefore tells us that if misc == fuse, we need
3379  * to flip the value of the bit in the misc register.
3380  */
3381 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
3382 {
3383     uint32_t misc, fuse;
3384     bool a, b;
3385
3386     vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
3387     fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3388     b = fuse & 64;
3389
3390     vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
3391     misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3392     a = misc & 2;
3393
3394     if (a == b) {
3395         vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
3396         vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
3397     }
3398 }
3399
3400 static int vfio_radeon_reset(VFIOPCIDevice *vdev)
3401 {
3402     PCIDevice *pdev = &vdev->pdev;
3403     int i, ret = 0;
3404     uint32_t data;
3405
3406     /* Defer to a kernel implemented reset */
3407     if (vdev->vbasedev.reset_works) {
3408         return -ENODEV;
3409     }
3410
3411     /* Enable only memory BAR access */
3412     vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
3413
3414     /* Reset only works if SMC firmware is loaded and running */
3415     if (!vfio_radeon_smc_is_running(vdev)) {
3416         ret = -EINVAL;
3417         goto out;
3418     }
3419
3420     /* Make sure only the GFX function is reset */
3421     vfio_radeon_set_gfx_only_reset(vdev);
3422
3423     /* AMD PCI config reset */
3424     vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
3425     usleep(100);
3426
3427     /* Read back the memory size to make sure we're out of reset */
3428     for (i = 0; i < 100000; i++) {
3429         if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
3430             break;
3431         }
3432         usleep(1);
3433     }
3434
3435     /* Reset SMC */
3436     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
3437     data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3438     data |= 1;
3439     vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
3440
3441     /* Disable SMC clock */
3442     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
3443     data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3444     data |= 1;
3445     vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
3446
3447 out:
3448     /* Restore PCI command register */
3449     vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
3450
3451     return ret;
3452 }
3453
3454 static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
3455 {
3456     PCIDevice *pdev = &vdev->pdev;
3457     uint16_t vendor, device;
3458
3459     vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
3460     device = pci_get_word(pdev->config + PCI_DEVICE_ID);
3461
3462     switch (vendor) {
3463     case 0x1002:
3464         switch (device) {
3465         /* Bonaire */
3466         case 0x6649: /* Bonaire [FirePro W5100] */
3467         case 0x6650:
3468         case 0x6651:
3469         case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
3470         case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
3471         case 0x665d: /* Bonaire [Radeon R7 200 Series] */
3472         /* Hawaii */
3473         case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
3474         case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
3475         case 0x67A2:
3476         case 0x67A8:
3477         case 0x67A9:
3478         case 0x67AA:
3479         case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
3480         case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
3481         case 0x67B8:
3482         case 0x67B9:
3483         case 0x67BA:
3484         case 0x67BE:
3485             vdev->resetfn = vfio_radeon_reset;
3486             break;
3487         }
3488         break;
3489     }
3490 }
3491
3492 static int vfio_initfn(PCIDevice *pdev)
3493 {
3494     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3495     VFIODevice *vbasedev_iter;
3496     VFIOGroup *group;
3497     char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
3498     ssize_t len;
3499     struct stat st;
3500     int groupid;
3501     int ret;
3502
3503     /* Check that the host device exists */
3504     snprintf(path, sizeof(path),
3505              "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
3506              vdev->host.domain, vdev->host.bus, vdev->host.slot,
3507              vdev->host.function);
3508     if (stat(path, &st) < 0) {
3509         error_report("vfio: error: no such host device: %s", path);
3510         return -errno;
3511     }
3512
3513     vdev->vbasedev.ops = &vfio_pci_ops;
3514
3515     vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
3516     vdev->vbasedev.name = g_strdup_printf("%04x:%02x:%02x.%01x",
3517                                           vdev->host.domain, vdev->host.bus,
3518                                           vdev->host.slot, vdev->host.function);
3519
3520     strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
3521
3522     len = readlink(path, iommu_group_path, sizeof(path));
3523     if (len <= 0 || len >= sizeof(path)) {
3524         error_report("vfio: error no iommu_group for device");
3525         return len < 0 ? -errno : -ENAMETOOLONG;
3526     }
3527
3528     iommu_group_path[len] = 0;
3529     group_name = basename(iommu_group_path);
3530
3531     if (sscanf(group_name, "%d", &groupid) != 1) {
3532         error_report("vfio: error reading %s: %m", path);
3533         return -errno;
3534     }
3535
3536     trace_vfio_initfn(vdev->vbasedev.name, groupid);
3537
3538     group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
3539     if (!group) {
3540         error_report("vfio: failed to get group %d", groupid);
3541         return -ENOENT;
3542     }
3543
3544     snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
3545             vdev->host.domain, vdev->host.bus, vdev->host.slot,
3546             vdev->host.function);
3547
3548     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
3549         if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
3550             error_report("vfio: error: device %s is already attached", path);
3551             vfio_put_group(group);
3552             return -EBUSY;
3553         }
3554     }
3555
3556     ret = vfio_get_device(group, path, &vdev->vbasedev);
3557     if (ret) {
3558         error_report("vfio: failed to get device %s", path);
3559         vfio_put_group(group);
3560         return ret;
3561     }
3562
3563     ret = vfio_populate_device(vdev);
3564     if (ret) {
3565         return ret;
3566     }
3567
3568     /* Get a copy of config space */
3569     ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
3570                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3571                 vdev->config_offset);
3572     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3573         ret = ret < 0 ? -errno : -EFAULT;
3574         error_report("vfio: Failed to read device config space");
3575         return ret;
3576     }
3577
3578     /* vfio emulates a lot for us, but some bits need extra love */
3579     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3580
3581     /* QEMU can choose to expose the ROM or not */
3582     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3583
3584     /* QEMU can change multi-function devices to single function, or reverse */
3585     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3586                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
3587
3588     /* Restore or clear multifunction, this is always controlled by QEMU */
3589     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3590         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3591     } else {
3592         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3593     }
3594
3595     /*
3596      * Clear host resource mapping info.  If we choose not to register a
3597      * BAR, such as might be the case with the option ROM, we can get
3598      * confusing, unwritable, residual addresses from the host here.
3599      */
3600     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3601     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3602
3603     vfio_pci_size_rom(vdev);
3604
3605     ret = vfio_msix_early_setup(vdev);
3606     if (ret) {
3607         return ret;
3608     }
3609
3610     vfio_map_bars(vdev);
3611
3612     ret = vfio_add_capabilities(vdev);
3613     if (ret) {
3614         goto out_teardown;
3615     }
3616
3617     /* QEMU emulates all of MSI & MSIX */
3618     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3619         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3620                MSIX_CAP_LENGTH);
3621     }
3622
3623     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3624         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3625                vdev->msi_cap_size);
3626     }
3627
3628     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3629         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3630                                                   vfio_intx_mmap_enable, vdev);
3631         pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update);
3632         ret = vfio_intx_enable(vdev);
3633         if (ret) {
3634             goto out_teardown;
3635         }
3636     }
3637
3638     vfio_register_err_notifier(vdev);
3639     vfio_register_req_notifier(vdev);
3640     vfio_setup_resetfn(vdev);
3641
3642     return 0;
3643
3644 out_teardown:
3645     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3646     vfio_teardown_msi(vdev);
3647     vfio_unregister_bars(vdev);
3648     return ret;
3649 }
3650
3651 static void vfio_instance_finalize(Object *obj)
3652 {
3653     PCIDevice *pci_dev = PCI_DEVICE(obj);
3654     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev);
3655     VFIOGroup *group = vdev->vbasedev.group;
3656
3657     vfio_unmap_bars(vdev);
3658     g_free(vdev->emulated_config_bits);
3659     g_free(vdev->rom);
3660     vfio_put_device(vdev);
3661     vfio_put_group(group);
3662 }
3663
3664 static void vfio_exitfn(PCIDevice *pdev)
3665 {
3666     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3667
3668     vfio_unregister_req_notifier(vdev);
3669     vfio_unregister_err_notifier(vdev);
3670     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3671     vfio_disable_interrupts(vdev);
3672     if (vdev->intx.mmap_timer) {
3673         timer_free(vdev->intx.mmap_timer);
3674     }
3675     vfio_teardown_msi(vdev);
3676     vfio_unregister_bars(vdev);
3677 }
3678
3679 static void vfio_pci_reset(DeviceState *dev)
3680 {
3681     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
3682     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3683
3684     trace_vfio_pci_reset(vdev->vbasedev.name);
3685
3686     vfio_pci_pre_reset(vdev);
3687
3688     if (vdev->resetfn && !vdev->resetfn(vdev)) {
3689         goto post_reset;
3690     }
3691
3692     if (vdev->vbasedev.reset_works &&
3693         (vdev->has_flr || !vdev->has_pm_reset) &&
3694         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3695         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3696         goto post_reset;
3697     }
3698
3699     /* See if we can do our own bus reset */
3700     if (!vfio_pci_hot_reset_one(vdev)) {
3701         goto post_reset;
3702     }
3703
3704     /* If nothing else works and the device supports PM reset, use it */
3705     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3706         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3707         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3708         goto post_reset;
3709     }
3710
3711 post_reset:
3712     vfio_pci_post_reset(vdev);
3713 }
3714
3715 static void vfio_instance_init(Object *obj)
3716 {
3717     PCIDevice *pci_dev = PCI_DEVICE(obj);
3718     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(obj));
3719
3720     device_add_bootindex_property(obj, &vdev->bootindex,
3721                                   "bootindex", NULL,
3722                                   &pci_dev->qdev, NULL);
3723 }
3724
3725 static Property vfio_pci_dev_properties[] = {
3726     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3727     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3728                        intx.mmap_timeout, 1100),
3729     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3730                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3731     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3732                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
3733     DEFINE_PROP_BOOL("x-mmap", VFIOPCIDevice, vbasedev.allow_mmap, true),
3734     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3735     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3736     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3737     /*
3738      * TODO - support passed fds... is this necessary?
3739      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
3740      * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
3741      */
3742     DEFINE_PROP_END_OF_LIST(),
3743 };
3744
3745 static const VMStateDescription vfio_pci_vmstate = {
3746     .name = "vfio-pci",
3747     .unmigratable = 1,
3748 };
3749
3750 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3751 {
3752     DeviceClass *dc = DEVICE_CLASS(klass);
3753     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3754
3755     dc->reset = vfio_pci_reset;
3756     dc->props = vfio_pci_dev_properties;
3757     dc->vmsd = &vfio_pci_vmstate;
3758     dc->desc = "VFIO-based PCI device assignment";
3759     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3760     pdc->init = vfio_initfn;
3761     pdc->exit = vfio_exitfn;
3762     pdc->config_read = vfio_pci_read_config;
3763     pdc->config_write = vfio_pci_write_config;
3764     pdc->is_express = 1; /* We might be */
3765 }
3766
3767 static const TypeInfo vfio_pci_dev_info = {
3768     .name = "vfio-pci",
3769     .parent = TYPE_PCI_DEVICE,
3770     .instance_size = sizeof(VFIOPCIDevice),
3771     .class_init = vfio_pci_dev_class_init,
3772     .instance_init = vfio_instance_init,
3773     .instance_finalize = vfio_instance_finalize,
3774 };
3775
3776 static void register_vfio_pci_dev_type(void)
3777 {
3778     type_register_static(&vfio_pci_dev_info);
3779 }
3780
3781 type_init(register_vfio_pci_dev_type)