hw/vfio/pci.c

   1 /*
   2  * vfio based device assignment support
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <alex.williamson@redhat.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19  */
  20
  21 #include <linux/vfio.h>
  22 #include <sys/ioctl.h>
  23 #include <sys/mman.h>
  24 #include <sys/stat.h>
  25 #include <sys/types.h>
  26 #include <unistd.h>
  27
  28 #include "config.h"
  29 #include "hw/pci/msi.h"
  30 #include "hw/pci/msix.h"
  31 #include "qemu/error-report.h"
  32 #include "qemu/range.h"
  33 #include "sysemu/kvm.h"
  34 #include "sysemu/sysemu.h"
  35 #include "pci.h"
  36 #include "trace.h"
  37
/*
 * List of vendor id/device id pairs for which to disable option ROM
 * loading.  This avoids guest hangs during ROM execution, as noticed
 * with the BCM 57810 card, for lack of a better way to handle such
 * issues.
 * The user can still override this by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang.  When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and any information that could
 * help in debugging to the bug tracking this issue.
 */
static const VFIORomBlacklistEntry romblacklist[] = {
    /* Broadcom BCM 57810 */
    { 0x14e4, 0x168e }
};
  55
/*
 * Presumably the size in bytes of the MSI-X capability structure; it is
 * not used in this part of the file - TODO confirm against its users.
 */
#define MSIX_CAP_LENGTH 12
  57
/* Forward declarations; the interrupt handling code below calls several of these. */
static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
                                  uint32_t val, int len);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
  63
  64 /*
  65  * Disabling BAR mmaping can be slow, but toggling it around INTx can
  66  * also be a huge overhead.  We try to get the best of both worlds by
  67  * waiting until an interrupt to disable mmaps (subsequent transitions
  68  * to the same state are effectively no overhead).  If the interrupt has
  69  * been serviced and the time gap is long enough, we re-enable mmaps for
  70  * performance.  This works well for things like graphics cards, which
  71  * may not use their interrupt at all and are penalized to an unusable
  72  * level by read/write BAR traps.  Other devices, like NICs, have more
  73  * regular interrupts and see much better latency by staying in non-mmap
  74  * mode.  We therefore set the default mmap_timeout such that a ping
  75  * is just enough to keep the mmap disabled.  Users can experiment with
  76  * other options with the x-intx-mmap-timeout-ms parameter (a value of
  77  * zero disables the timer).
  78  */
  79 static void vfio_intx_mmap_enable(void *opaque)
  80 {
  81     VFIOPCIDevice *vdev = opaque;
  82
  83     if (vdev->intx.pending) {
  84         timer_mod(vdev->intx.mmap_timer,
  85                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
  86         return;
  87     }
  88
  89     vfio_mmap_set_enabled(vdev, true);
  90 }
  91
  92 static void vfio_intx_interrupt(void *opaque)
  93 {
  94     VFIOPCIDevice *vdev = opaque;
  95
  96     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
  97         return;
  98     }
  99
 100     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
 101
 102     vdev->intx.pending = true;
 103     pci_irq_assert(&vdev->pdev);
 104     vfio_mmap_set_enabled(vdev, false);
 105     if (vdev->intx.mmap_timeout) {
 106         timer_mod(vdev->intx.mmap_timer,
 107                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
 108     }
 109 }
 110
 111 static void vfio_intx_eoi(VFIODevice *vbasedev)
 112 {
 113     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 114
 115     if (!vdev->intx.pending) {
 116         return;
 117     }
 118
 119     trace_vfio_intx_eoi(vbasedev->name);
 120
 121     vdev->intx.pending = false;
 122     pci_irq_deassert(&vdev->pdev);
 123     vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 124 }
 125
/*
 * Try to hand INTx delivery over to a KVM irqfd with a resample eventfd,
 * instead of going through the QEMU fd handler.
 *
 * Does nothing when vdev->no_kvm_intx is set, when KVM irqfds or
 * resamplefds are not enabled, or when the current INTx route mode is not
 * PCI_INTX_ENABLED.  On success vdev->intx.kvm_accel is set to true.  If
 * any setup step fails, what was set up so far is undone, the QEMU handler
 * (vfio_intx_interrupt) is re-installed on the interrupt eventfd and the
 * INTx index is unmasked again.
 *
 * The body is compiled only with CONFIG_KVM; otherwise this is a no-op.
 */
static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_RESAMPLE,
    };
    struct vfio_irq_set *irq_set;
    int ret, argsz;
    int32_t *pfd;

    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
        vdev->intx.route.mode != PCI_INTX_ENABLED ||
        !kvm_resamplefds_enabled()) {
        return;
    }

    /* Get to a known interrupt state */
    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Get an eventfd for resample/unmask */
    if (event_notifier_init(&vdev->intx.unmask, 0)) {
        error_report("vfio: Error: event_notifier_init failed eoi");
        goto fail;
    }

    /* KVM triggers it, VFIO listens for it */
    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);

    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        error_report("vfio: Error: Failed to setup resample irqfd: %m");
        goto fail_irqfd;
    }

    /* One vfio_irq_set header followed by a single int32_t eventfd */
    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
    irq_set->start = 0;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;

    *pfd = irqfd.resamplefd;

    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (ret) {
        error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
        goto fail_vfio;
    }

    /* Let'em rip */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    vdev->intx.kvm_accel = true;

    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);

    return;

    /* Error unwind: each label undoes one more setup step, newest first */
fail_vfio:
    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
fail_irqfd:
    event_notifier_cleanup(&vdev->intx.unmask);
fail:
    /* Fall back to delivering INTx through the QEMU handler */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
#endif
}
 202
/*
 * Undo vfio_intx_enable_kvm(): detach the INTx irqfd from KVM, release the
 * unmask eventfd and route INTx through the QEMU handler
 * (vfio_intx_interrupt) again.
 *
 * Does nothing unless vdev->intx.kvm_accel is set; clears that flag when
 * done.  The body is compiled only with CONFIG_KVM; otherwise this is a
 * no-op.
 */
static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
{
#ifdef CONFIG_KVM
    struct kvm_irqfd irqfd = {
        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
        .gsi = vdev->intx.route.irq,
        .flags = KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (!vdev->intx.kvm_accel) {
        return;
    }

    /*
     * Get to a known state, hardware masked, QEMU ready to accept new
     * interrupts, QEMU IRQ de-asserted.
     */
    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
    vdev->intx.pending = false;
    pci_irq_deassert(&vdev->pdev);

    /* Tell KVM to stop listening for an INTx irqfd */
    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
        /* Reported only; teardown continues regardless */
        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
    }

    /* We only need to close the eventfd for VFIO to cleanup the kernel side */
    event_notifier_cleanup(&vdev->intx.unmask);

    /* QEMU starts listening for interrupt events. */
    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);

    vdev->intx.kvm_accel = false;

    /* If we've missed an event, let it re-fire through QEMU */
    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);

    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
#endif
}
 243
 244 static void vfio_intx_update(PCIDevice *pdev)
 245 {
 246     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 247     PCIINTxRoute route;
 248
 249     if (vdev->interrupt != VFIO_INT_INTx) {
 250         return;
 251     }
 252
 253     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
 254
 255     if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
 256         return; /* Nothing changed */
 257     }
 258
 259     trace_vfio_intx_update(vdev->vbasedev.name,
 260                            vdev->intx.route.irq, route.irq);
 261
 262     vfio_intx_disable_kvm(vdev);
 263
 264     vdev->intx.route = route;
 265
 266     if (route.mode != PCI_INTX_ENABLED) {
 267         return;
 268     }
 269
 270     vfio_intx_enable_kvm(vdev);
 271
 272     /* Re-enable the interrupt in cased we missed an EOI */
 273     vfio_intx_eoi(&vdev->vbasedev);
 274 }
 275
 276 static int vfio_intx_enable(VFIOPCIDevice *vdev)
 277 {
 278     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
 279     int ret, argsz;
 280     struct vfio_irq_set *irq_set;
 281     int32_t *pfd;
 282
 283     if (!pin) {
 284         return 0;
 285     }
 286
 287     vfio_disable_interrupts(vdev);
 288
 289     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 290     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
 291
 292 #ifdef CONFIG_KVM
 293     /*
 294      * Only conditional to avoid generating error messages on platforms
 295      * where we won't actually use the result anyway.
 296      */
 297     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
 298         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
 299                                                         vdev->intx.pin);
 300     }
 301 #endif
 302
 303     ret = event_notifier_init(&vdev->intx.interrupt, 0);
 304     if (ret) {
 305         error_report("vfio: Error: event_notifier_init failed");
 306         return ret;
 307     }
 308
 309     argsz = sizeof(*irq_set) + sizeof(*pfd);
 310
 311     irq_set = g_malloc0(argsz);
 312     irq_set->argsz = argsz;
 313     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 314     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 315     irq_set->start = 0;
 316     irq_set->count = 1;
 317     pfd = (int32_t *)&irq_set->data;
 318
 319     *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
 320     qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
 321
 322     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 323     g_free(irq_set);
 324     if (ret) {
 325         error_report("vfio: Error: Failed to setup INTx fd: %m");
 326         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
 327         event_notifier_cleanup(&vdev->intx.interrupt);
 328         return -errno;
 329     }
 330
 331     vfio_intx_enable_kvm(vdev);
 332
 333     vdev->interrupt = VFIO_INT_INTx;
 334
 335     trace_vfio_intx_enable(vdev->vbasedev.name);
 336
 337     return 0;
 338 }
 339
 340 static void vfio_intx_disable(VFIOPCIDevice *vdev)
 341 {
 342     int fd;
 343
 344     timer_del(vdev->intx.mmap_timer);
 345     vfio_intx_disable_kvm(vdev);
 346     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 347     vdev->intx.pending = false;
 348     pci_irq_deassert(&vdev->pdev);
 349     vfio_mmap_set_enabled(vdev, true);
 350
 351     fd = event_notifier_get_fd(&vdev->intx.interrupt);
 352     qemu_set_fd_handler(fd, NULL, NULL, vdev);
 353     event_notifier_cleanup(&vdev->intx.interrupt);
 354
 355     vdev->interrupt = VFIO_INT_NONE;
 356
 357     trace_vfio_intx_disable(vdev->vbasedev.name);
 358 }
 359
 360 /*
 361  * MSI/X
 362  */
 363 static void vfio_msi_interrupt(void *opaque)
 364 {
 365     VFIOMSIVector *vector = opaque;
 366     VFIOPCIDevice *vdev = vector->vdev;
 367     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
 368     void (*notify)(PCIDevice *dev, unsigned vector);
 369     MSIMessage msg;
 370     int nr = vector - vdev->msi_vectors;
 371
 372     if (!event_notifier_test_and_clear(&vector->interrupt)) {
 373         return;
 374     }
 375
 376     if (vdev->interrupt == VFIO_INT_MSIX) {
 377         get_msg = msix_get_message;
 378         notify = msix_notify;
 379     } else if (vdev->interrupt == VFIO_INT_MSI) {
 380         get_msg = msi_get_message;
 381         notify = msi_notify;
 382     } else {
 383         abort();
 384     }
 385
 386     msg = get_msg(&vdev->pdev, nr);
 387     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
 388     notify(&vdev->pdev, nr);
 389 }
 390
 391 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 392 {
 393     struct vfio_irq_set *irq_set;
 394     int ret = 0, i, argsz;
 395     int32_t *fds;
 396
 397     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 398
 399     irq_set = g_malloc0(argsz);
 400     irq_set->argsz = argsz;
 401     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 402     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
 403     irq_set->start = 0;
 404     irq_set->count = vdev->nr_vectors;
 405     fds = (int32_t *)&irq_set->data;
 406
 407     for (i = 0; i < vdev->nr_vectors; i++) {
 408         int fd = -1;
 409
 410         /*
 411          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
 412          * bits, therefore we always use the KVM signaling path when setup.
 413          * MSI-X mask and pending bits are emulated, so we want to use the
 414          * KVM signaling path only when configured and unmasked.
 415          */
 416         if (vdev->msi_vectors[i].use) {
 417             if (vdev->msi_vectors[i].virq < 0 ||
 418                 (msix && msix_is_masked(&vdev->pdev, i))) {
 419                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 420             } else {
 421                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
 422             }
 423         }
 424
 425         fds[i] = fd;
 426     }
 427
 428     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 429
 430     g_free(irq_set);
 431
 432     return ret;
 433 }
 434
 435 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
 436                                   MSIMessage *msg, bool msix)
 437 {
 438     int virq;
 439
 440     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi) || !msg) {
 441         return;
 442     }
 443
 444     if (event_notifier_init(&vector->kvm_interrupt, 0)) {
 445         return;
 446     }
 447
 448     virq = kvm_irqchip_add_msi_route(kvm_state, *msg);
 449     if (virq < 0) {
 450         event_notifier_cleanup(&vector->kvm_interrupt);
 451         return;
 452     }
 453
 454     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 455                                        NULL, virq) < 0) {
 456         kvm_irqchip_release_virq(kvm_state, virq);
 457         event_notifier_cleanup(&vector->kvm_interrupt);
 458         return;
 459     }
 460
 461     vector->virq = virq;
 462 }
 463
 464 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 465 {
 466     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 467                                           vector->virq);
 468     kvm_irqchip_release_virq(kvm_state, vector->virq);
 469     vector->virq = -1;
 470     event_notifier_cleanup(&vector->kvm_interrupt);
 471 }
 472
/* Update the existing KVM MSI route of @vector (vector->virq) to @msg. */
static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg)
{
    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg);
}
 477
/*
 * Set up MSI-X vector @nr: initialise it on first use, install @handler on
 * its userspace eventfd, add/update/remove its KVM route according to
 * @msg, and program the vector's eventfd into VFIO.
 *
 * @pdev:    the emulated PCI device embedded in a VFIOPCIDevice
 * @nr:      vector index into vdev->msi_vectors
 * @msg:     MSI message for the KVM route; NULL removes an existing route
 *           and prevents a new one from being added
 * @handler: fd handler for the vector's userspace eventfd (NULL is passed
 *           by vfio_msix_enable())
 *
 * Always returns 0; failures are only reported through error_report().
 */
static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                                   MSIMessage *msg, IOHandler *handler)
{
    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
    VFIOMSIVector *vector;
    int ret;

    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);

    vector = &vdev->msi_vectors[nr];

    /* First use of this vector: create its eventfd and claim it */
    if (!vector->use) {
        vector->vdev = vdev;
        vector->virq = -1;
        if (event_notifier_init(&vector->interrupt, 0)) {
            error_report("vfio: Error: event_notifier_init failed");
        }
        vector->use = true;
        msix_vector_use(pdev, nr);
    }

    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
                        handler, NULL, vector);

    /*
     * Attempt to enable route through KVM irqchip,
     * default to userspace handling if unavailable.
     */
    if (vector->virq >= 0) {
        if (!msg) {
            vfio_remove_kvm_msi_virq(vector);
        } else {
            vfio_update_kvm_msi_virq(vector, *msg);
        }
    } else {
        vfio_add_kvm_msi_virq(vdev, vector, msg, true);
    }

    /*
     * We don't want to have the host allocate all possible MSI vectors
     * for a device if they're not in use, so we shutdown and incrementally
     * increase them as needed.
     */
    if (vdev->nr_vectors < nr + 1) {
        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
        vdev->nr_vectors = nr + 1;
        ret = vfio_enable_vectors(vdev, true);
        if (ret) {
            error_report("vfio: failed to enable vectors, %d", ret);
        }
    } else {
        /* Vector is within the enabled range: re-program only this one */
        int argsz;
        struct vfio_irq_set *irq_set;
        int32_t *pfd;

        argsz = sizeof(*irq_set) + sizeof(*pfd);

        irq_set = g_malloc0(argsz);
        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = nr;
        irq_set->count = 1;
        pfd = (int32_t *)&irq_set->data;

        /* Prefer the KVM eventfd when a route exists */
        if (vector->virq >= 0) {
            *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
        } else {
            *pfd = event_notifier_get_fd(&vector->interrupt);
        }

        ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
        g_free(irq_set);
        if (ret) {
            error_report("vfio: failed to modify vector, %d", ret);
        }
    }

    return 0;
}
 559
/*
 * MSI-X "vector use" notifier (registered in vfio_msix_enable()): set up
 * vector @nr with message @msg and vfio_msi_interrupt() as its handler.
 * Returns the result of vfio_msix_vector_do_use().
 */
static int vfio_msix_vector_use(PCIDevice *pdev,
                                unsigned int nr, MSIMessage msg)
{
    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
}
 565
 566 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 567 {
 568     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 569     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 570
 571     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
 572
 573     /*
 574      * There are still old guests that mask and unmask vectors on every
 575      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
 576      * the KVM setup in place, simply switch VFIO to use the non-bypass
 577      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
 578      * core will mask the interrupt and set pending bits, allowing it to
 579      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
 580      */
 581     if (vector->virq >= 0) {
 582         int argsz;
 583         struct vfio_irq_set *irq_set;
 584         int32_t *pfd;
 585
 586         argsz = sizeof(*irq_set) + sizeof(*pfd);
 587
 588         irq_set = g_malloc0(argsz);
 589         irq_set->argsz = argsz;
 590         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 591                          VFIO_IRQ_SET_ACTION_TRIGGER;
 592         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 593         irq_set->start = nr;
 594         irq_set->count = 1;
 595         pfd = (int32_t *)&irq_set->data;
 596
 597         *pfd = event_notifier_get_fd(&vector->interrupt);
 598
 599         ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 600
 601         g_free(irq_set);
 602     }
 603 }
 604
/*
 * Switch the device to MSI-X: disable whatever interrupt mode is active,
 * allocate one VFIOMSIVector per MSI-X table entry, enable MSI-X on the
 * physical device (see below) and register the vector use/release
 * notifiers with the MSI-X core.  A failure to register the notifiers is
 * only reported.
 */
static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
    vfio_disable_interrupts(vdev);

    /* Zero-initialised, so every vector starts with use == false */
    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));

    vdev->interrupt = VFIO_INT_MSIX;

    /*
     * Some communication channels between VF & PF or PF & fw rely on the
     * physical state of the device and expect that enabling MSI-X from the
     * guest enables the same on the host.  When our guest is Linux, the
     * guest driver call to pci_enable_msix() sets the enabling bit in the
     * MSI-X capability, but leaves the vector table masked.  We therefore
     * can't rely on a vector_use callback (from request_irq() in the guest)
     * to switch the physical device into MSI-X mode because that may come a
     * long time after pci_enable_msix().  This code enables vector 0 with
     * triggering to userspace, then immediately release the vector, leaving
     * the physical device with no vectors enabled, but MSI-X enabled, just
     * like the guest view.
     */
    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
    vfio_msix_vector_release(&vdev->pdev, 0);

    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
                                  vfio_msix_vector_release, NULL)) {
        error_report("vfio: msix_set_vector_notifiers failed");
    }

    trace_vfio_msix_enable(vdev->vbasedev.name);
}
 636
 637 static void vfio_msi_enable(VFIOPCIDevice *vdev)
 638 {
 639     int ret, i;
 640
 641     vfio_disable_interrupts(vdev);
 642
 643     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 644 retry:
 645     vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
 646
 647     for (i = 0; i < vdev->nr_vectors; i++) {
 648         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 649         MSIMessage msg = msi_get_message(&vdev->pdev, i);
 650
 651         vector->vdev = vdev;
 652         vector->virq = -1;
 653         vector->use = true;
 654
 655         if (event_notifier_init(&vector->interrupt, 0)) {
 656             error_report("vfio: Error: event_notifier_init failed");
 657         }
 658
 659         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 660                             vfio_msi_interrupt, NULL, vector);
 661
 662         /*
 663          * Attempt to enable route through KVM irqchip,
 664          * default to userspace handling if unavailable.
 665          */
 666         vfio_add_kvm_msi_virq(vdev, vector, &msg, false);
 667     }
 668
 669     /* Set interrupt type prior to possible interrupts */
 670     vdev->interrupt = VFIO_INT_MSI;
 671
 672     ret = vfio_enable_vectors(vdev, false);
 673     if (ret) {
 674         if (ret < 0) {
 675             error_report("vfio: Error: Failed to setup MSI fds: %m");
 676         } else if (ret != vdev->nr_vectors) {
 677             error_report("vfio: Error: Failed to enable %d "
 678                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
 679         }
 680
 681         for (i = 0; i < vdev->nr_vectors; i++) {
 682             VFIOMSIVector *vector = &vdev->msi_vectors[i];
 683             if (vector->virq >= 0) {
 684                 vfio_remove_kvm_msi_virq(vector);
 685             }
 686             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 687                                 NULL, NULL, NULL);
 688             event_notifier_cleanup(&vector->interrupt);
 689         }
 690
 691         g_free(vdev->msi_vectors);
 692
 693         if (ret > 0 && ret != vdev->nr_vectors) {
 694             vdev->nr_vectors = ret;
 695             goto retry;
 696         }
 697         vdev->nr_vectors = 0;
 698
 699         /*
 700          * Failing to setup MSI doesn't really fall within any specification.
 701          * Let's try leaving interrupts disabled and hope the guest figures
 702          * out to fall back to INTx for this device.
 703          */
 704         error_report("vfio: Error: Failed to enable MSI");
 705         vdev->interrupt = VFIO_INT_NONE;
 706
 707         return;
 708     }
 709
 710     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
 711 }
 712
 713 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
 714 {
 715     int i;
 716
 717     for (i = 0; i < vdev->nr_vectors; i++) {
 718         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 719         if (vdev->msi_vectors[i].use) {
 720             if (vector->virq >= 0) {
 721                 vfio_remove_kvm_msi_virq(vector);
 722             }
 723             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 724                                 NULL, NULL, NULL);
 725             event_notifier_cleanup(&vector->interrupt);
 726         }
 727     }
 728
 729     g_free(vdev->msi_vectors);
 730     vdev->msi_vectors = NULL;
 731     vdev->nr_vectors = 0;
 732     vdev->interrupt = VFIO_INT_NONE;
 733
 734     vfio_intx_enable(vdev);
 735 }
 736
 737 static void vfio_msix_disable(VFIOPCIDevice *vdev)
 738 {
 739     int i;
 740
 741     msix_unset_vector_notifiers(&vdev->pdev);
 742
 743     /*
 744      * MSI-X will only release vectors if MSI-X is still enabled on the
 745      * device, check through the rest and release it ourselves if necessary.
 746      */
 747     for (i = 0; i < vdev->nr_vectors; i++) {
 748         if (vdev->msi_vectors[i].use) {
 749             vfio_msix_vector_release(&vdev->pdev, i);
 750             msix_vector_unuse(&vdev->pdev, i);
 751         }
 752     }
 753
 754     if (vdev->nr_vectors) {
 755         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 756     }
 757
 758     vfio_msi_disable_common(vdev);
 759
 760     trace_vfio_msix_disable(vdev->vbasedev.name);
 761 }
 762
/*
 * Switch the device out of MSI mode: disable the MSI index in VFIO, then
 * run the common MSI teardown.
 */
static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
    vfio_msi_disable_common(vdev);

    trace_vfio_msi_disable(vdev->vbasedev.name);
}
 770
 771 static void vfio_update_msi(VFIOPCIDevice *vdev)
 772 {
 773     int i;
 774
 775     for (i = 0; i < vdev->nr_vectors; i++) {
 776         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 777         MSIMessage msg;
 778
 779         if (!vector->use || vector->virq < 0) {
 780             continue;
 781         }
 782
 783         msg = msi_get_message(&vdev->pdev, i);
 784         vfio_update_kvm_msi_virq(vector, msg);
 785     }
 786 }
 787
 788 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
 789 {
 790     struct vfio_region_info reg_info = {
 791         .argsz = sizeof(reg_info),
 792         .index = VFIO_PCI_ROM_REGION_INDEX
 793     };
 794     uint64_t size;
 795     off_t off = 0;
 796     ssize_t bytes;
 797
 798     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
 799         error_report("vfio: Error getting ROM info: %m");
 800         return;
 801     }
 802
 803     trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info.size,
 804                             (unsigned long)reg_info.offset,
 805                             (unsigned long)reg_info.flags);
 806
 807     vdev->rom_size = size = reg_info.size;
 808     vdev->rom_offset = reg_info.offset;
 809
 810     if (!vdev->rom_size) {
 811         vdev->rom_read_failed = true;
 812         error_report("vfio-pci: Cannot read device rom at "
 813                     "%s", vdev->vbasedev.name);
 814         error_printf("Device option ROM contents are probably invalid "
 815                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
 816                     "or load from file with romfile=\n");
 817         return;
 818     }
 819
 820     vdev->rom = g_malloc(size);
 821     memset(vdev->rom, 0xff, size);
 822
 823     while (size) {
 824         bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
 825                       size, vdev->rom_offset + off);
 826         if (bytes == 0) {
 827             break;
 828         } else if (bytes > 0) {
 829             off += bytes;
 830             size -= bytes;
 831         } else {
 832             if (errno == EINTR || errno == EAGAIN) {
 833                 continue;
 834             }
 835             error_report("vfio: Error reading device ROM: %m");
 836             break;
 837         }
 838     }
 839 }
 840
 841 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
 842 {
 843     VFIOPCIDevice *vdev = opaque;
 844     union {
 845         uint8_t byte;
 846         uint16_t word;
 847         uint32_t dword;
 848         uint64_t qword;
 849     } val;
 850     uint64_t data = 0;
 851
 852     /* Load the ROM lazily when the guest tries to read it */
 853     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
 854         vfio_pci_load_rom(vdev);
 855     }
 856
 857     memcpy(&val, vdev->rom + addr,
 858            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
 859
 860     switch (size) {
 861     case 1:
 862         data = val.byte;
 863         break;
 864     case 2:
 865         data = le16_to_cpu(val.word);
 866         break;
 867     case 4:
 868         data = le32_to_cpu(val.dword);
 869         break;
 870     default:
 871         hw_error("vfio: unsupported read size, %d bytes\n", size);
 872         break;
 873     }
 874
 875     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
 876
 877     return data;
 878 }
 879
/*
 * MemoryRegionOps write callback for the option ROM region: writes are
 * accepted and discarded.
 */
static void vfio_rom_write(void *opaque, hwaddr addr,
                           uint64_t data, unsigned size)
{
}
 884
/* Little-endian access callbacks for the option ROM region. */
static const MemoryRegionOps vfio_rom_ops = {
    .read = vfio_rom_read,
    .write = vfio_rom_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 890
 891 static bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
 892 {
 893     PCIDevice *pdev = &vdev->pdev;
 894     uint16_t vendor_id, device_id;
 895     int count = 0;
 896
 897     vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
 898     device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
 899
 900     while (count < ARRAY_SIZE(romblacklist)) {
 901         if (romblacklist[count].vendor_id == vendor_id &&
 902             romblacklist[count].device_id == device_id) {
 903                 return true;
 904         }
 905         count++;
 906     }
 907
 908     return false;
 909 }
 910
 911 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
 912 {
 913     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
 914     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
 915     DeviceState *dev = DEVICE(vdev);
 916     char name[32];
 917     int fd = vdev->vbasedev.fd;
 918
 919     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
 920         /* Since pci handles romfile, just print a message and return */
 921         if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
 922             error_printf("Warning : Device at %04x:%02x:%02x.%x "
 923                          "is known to cause system instability issues during "
 924                          "option rom execution. "
 925                          "Proceeding anyway since user specified romfile\n",
 926                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
 927                          vdev->host.function);
 928         }
 929         return;
 930     }
 931
 932     /*
 933      * Use the same size ROM BAR as the physical device.  The contents
 934      * will get filled in later when the guest tries to read it.
 935      */
 936     if (pread(fd, &orig, 4, offset) != 4 ||
 937         pwrite(fd, &size, 4, offset) != 4 ||
 938         pread(fd, &size, 4, offset) != 4 ||
 939         pwrite(fd, &orig, 4, offset) != 4) {
 940         error_report("%s(%04x:%02x:%02x.%x) failed: %m",
 941                      __func__, vdev->host.domain, vdev->host.bus,
 942                      vdev->host.slot, vdev->host.function);
 943         return;
 944     }
 945
 946     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
 947
 948     if (!size) {
 949         return;
 950     }
 951
 952     if (vfio_blacklist_opt_rom(vdev)) {
 953         if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
 954             error_printf("Warning : Device at %04x:%02x:%02x.%x "
 955                          "is known to cause system instability issues during "
 956                          "option rom execution. "
 957                          "Proceeding anyway since user specified non zero value for "
 958                          "rombar\n",
 959                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
 960                          vdev->host.function);
 961         } else {
 962             error_printf("Warning : Rom loading for device at "
 963                          "%04x:%02x:%02x.%x has been disabled due to "
 964                          "system instability issues. "
 965                          "Specify rombar=1 or romfile to force\n",
 966                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
 967                          vdev->host.function);
 968             return;
 969         }
 970     }
 971
 972     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
 973
 974     snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
 975              vdev->host.domain, vdev->host.bus, vdev->host.slot,
 976              vdev->host.function);
 977
 978     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
 979                           &vfio_rom_ops, vdev, name, size);
 980
 981     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
 982                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
 983
 984     vdev->pdev.has_rom = true;
 985     vdev->rom_read_failed = false;
 986 }
 987
 988 static void vfio_vga_write(void *opaque, hwaddr addr,
 989                            uint64_t data, unsigned size)
 990 {
 991     VFIOVGARegion *region = opaque;
 992     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
 993     union {
 994         uint8_t byte;
 995         uint16_t word;
 996         uint32_t dword;
 997         uint64_t qword;
 998     } buf;
 999     off_t offset = vga->fd_offset + region->offset + addr;
1000
1001     switch (size) {
1002     case 1:
1003         buf.byte = data;
1004         break;
1005     case 2:
1006         buf.word = cpu_to_le16(data);
1007         break;
1008     case 4:
1009         buf.dword = cpu_to_le32(data);
1010         break;
1011     default:
1012         hw_error("vfio: unsupported write size, %d bytes", size);
1013         break;
1014     }
1015
1016     if (pwrite(vga->fd, &buf, size, offset) != size) {
1017         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1018                      __func__, region->offset + addr, data, size);
1019     }
1020
1021     trace_vfio_vga_write(region->offset + addr, data, size);
1022 }
1023
1024 static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1025 {
1026     VFIOVGARegion *region = opaque;
1027     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1028     union {
1029         uint8_t byte;
1030         uint16_t word;
1031         uint32_t dword;
1032         uint64_t qword;
1033     } buf;
1034     uint64_t data = 0;
1035     off_t offset = vga->fd_offset + region->offset + addr;
1036
1037     if (pread(vga->fd, &buf, size, offset) != size) {
1038         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1039                      __func__, region->offset + addr, size);
1040         return (uint64_t)-1;
1041     }
1042
1043     switch (size) {
1044     case 1:
1045         data = buf.byte;
1046         break;
1047     case 2:
1048         data = le16_to_cpu(buf.word);
1049         break;
1050     case 4:
1051         data = le32_to_cpu(buf.dword);
1052         break;
1053     default:
1054         hw_error("vfio: unsupported read size, %d bytes", size);
1055         break;
1056     }
1057
1058     trace_vfio_vga_read(region->offset + addr, size, data);
1059
1060     return data;
1061 }
1062
/* Access callbacks that pass VGA region reads and writes to the VGA fd */
static const MemoryRegionOps vfio_vga_ops = {
    .read = vfio_vga_read,
    .write = vfio_vga_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1068
1069 /*
1070  * Device specific quirks
1071  */
1072
/*
 * Is range1 fully contained within range2?
 *
 * Each range is given as a start and a length.  The comparison is done on
 * the distance between the starts and the difference of the lengths rather
 * than on the end addresses, so a range whose end would wrap past
 * UINT64_MAX is never mistaken for one that fits.
 *
 * Returns true when [first1, first1 + len1) lies inside
 * [first2, first2 + len2), false otherwise.
 */
static bool vfio_range_contained(uint64_t first1, uint64_t len1,
                                 uint64_t first2, uint64_t len2)
{
    /* Both subtractions are safe: the earlier terms guarantee no underflow */
    return first1 >= first2 && len1 <= len2 &&
           first1 - first2 <= len2 - len1;
}
1078
/*
 * Returns true when 'mask' is non-zero and every bit of 'mask' is set in
 * 'flags'.  An empty mask never counts as enabled.
 */
static bool vfio_flags_enabled(uint8_t flags, uint8_t mask)
{
    if (mask == 0) {
        return false;
    }

    return (flags & mask) == mask;
}
1083
1084 static uint64_t vfio_generic_window_quirk_read(void *opaque,
1085                                                hwaddr addr, unsigned size)
1086 {
1087     VFIOQuirk *quirk = opaque;
1088     VFIOPCIDevice *vdev = quirk->vdev;
1089     uint64_t data;
1090
1091     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1092         ranges_overlap(addr, size,
1093                        quirk->data.data_offset, quirk->data.data_size)) {
1094         hwaddr offset = addr - quirk->data.data_offset;
1095
1096         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1097                                   quirk->data.data_size)) {
1098             hw_error("%s: window data read not fully contained: %s",
1099                      __func__, memory_region_name(&quirk->mem));
1100         }
1101
1102         data = vfio_pci_read_config(&vdev->pdev,
1103                                     quirk->data.address_val + offset, size);
1104
1105         trace_vfio_generic_window_quirk_read(memory_region_name(&quirk->mem),
1106                                              vdev->vbasedev.name,
1107                                              quirk->data.bar,
1108                                              addr, size, data);
1109     } else {
1110         data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
1111                                 addr + quirk->data.base_offset, size);
1112     }
1113
1114     return data;
1115 }
1116
1117 static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr,
1118                                             uint64_t data, unsigned size)
1119 {
1120     VFIOQuirk *quirk = opaque;
1121     VFIOPCIDevice *vdev = quirk->vdev;
1122
1123     if (ranges_overlap(addr, size,
1124                        quirk->data.address_offset, quirk->data.address_size)) {
1125
1126         if (addr != quirk->data.address_offset) {
1127             hw_error("%s: offset write into address window: %s",
1128                      __func__, memory_region_name(&quirk->mem));
1129         }
1130
1131         if ((data & ~quirk->data.address_mask) == quirk->data.address_match) {
1132             quirk->data.flags |= quirk->data.write_flags |
1133                                  quirk->data.read_flags;
1134             quirk->data.address_val = data & quirk->data.address_mask;
1135         } else {
1136             quirk->data.flags &= ~(quirk->data.write_flags |
1137                                    quirk->data.read_flags);
1138         }
1139     }
1140
1141     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1142         ranges_overlap(addr, size,
1143                        quirk->data.data_offset, quirk->data.data_size)) {
1144         hwaddr offset = addr - quirk->data.data_offset;
1145
1146         if (!vfio_range_contained(addr, size, quirk->data.data_offset,
1147                                   quirk->data.data_size)) {
1148             hw_error("%s: window data write not fully contained: %s",
1149                      __func__, memory_region_name(&quirk->mem));
1150         }
1151
1152         vfio_pci_write_config(&vdev->pdev,
1153                               quirk->data.address_val + offset, data, size);
1154         trace_vfio_generic_window_quirk_write(memory_region_name(&quirk->mem),
1155                                               vdev->vbasedev.name,
1156                                               quirk->data.bar,
1157                                               addr, data, size);
1158         return;
1159     }
1160
1161     vfio_region_write(&vdev->bars[quirk->data.bar].region,
1162                    addr + quirk->data.base_offset, data, size);
1163 }
1164
/* Access callbacks for address/data register window quirks */
static const MemoryRegionOps vfio_generic_window_quirk = {
    .read = vfio_generic_window_quirk_read,
    .write = vfio_generic_window_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1170
1171 static uint64_t vfio_generic_quirk_read(void *opaque,
1172                                         hwaddr addr, unsigned size)
1173 {
1174     VFIOQuirk *quirk = opaque;
1175     VFIOPCIDevice *vdev = quirk->vdev;
1176     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1177     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1178     uint64_t data;
1179
1180     if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) &&
1181         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1182         if (!vfio_range_contained(addr, size, offset,
1183                                   quirk->data.address_mask + 1)) {
1184             hw_error("%s: read not fully contained: %s",
1185                      __func__, memory_region_name(&quirk->mem));
1186         }
1187
1188         data = vfio_pci_read_config(&vdev->pdev, addr - offset, size);
1189
1190         trace_vfio_generic_quirk_read(memory_region_name(&quirk->mem),
1191                                       vdev->vbasedev.name, quirk->data.bar,
1192                                       addr + base, size, data);
1193     } else {
1194         data = vfio_region_read(&vdev->bars[quirk->data.bar].region,
1195                                 addr + base, size);
1196     }
1197
1198     return data;
1199 }
1200
1201 static void vfio_generic_quirk_write(void *opaque, hwaddr addr,
1202                                      uint64_t data, unsigned size)
1203 {
1204     VFIOQuirk *quirk = opaque;
1205     VFIOPCIDevice *vdev = quirk->vdev;
1206     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1207     hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK;
1208
1209     if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) &&
1210         ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) {
1211         if (!vfio_range_contained(addr, size, offset,
1212                                   quirk->data.address_mask + 1)) {
1213             hw_error("%s: write not fully contained: %s",
1214                      __func__, memory_region_name(&quirk->mem));
1215         }
1216
1217         vfio_pci_write_config(&vdev->pdev, addr - offset, data, size);
1218
1219         trace_vfio_generic_quirk_write(memory_region_name(&quirk->mem),
1220                                        vdev->vbasedev.name, quirk->data.bar,
1221                                        addr + base, data, size);
1222     } else {
1223         vfio_region_write(&vdev->bars[quirk->data.bar].region,
1224                           addr + base, data, size);
1225     }
1226 }
1227
/* Access callbacks for quirks mirroring PCI config space inside a BAR */
static const MemoryRegionOps vfio_generic_quirk = {
    .read = vfio_generic_quirk_read,
    .write = vfio_generic_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1233
/* Vendor ID compared against PCI_VENDOR_ID by the ATI quirk probes */
#define PCI_VENDOR_ID_ATI               0x1002
1235
1236 /*
1237  * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
1238  * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
1239  * BAR4 (older cards like the X550 used BAR1, but we don't care to support
1240  * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
1241  * I/O port BAR address.  Originally this was coded to return the virtual BAR
1242  * address only if the physical register read returns the actual BAR address,
1243  * but users have reported greater success if we return the virtual address
1244  * unconditionally.
1245  */
1246 static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
1247                                         hwaddr addr, unsigned size)
1248 {
1249     VFIOQuirk *quirk = opaque;
1250     VFIOPCIDevice *vdev = quirk->vdev;
1251     uint64_t data = vfio_pci_read_config(&vdev->pdev,
1252                                          PCI_BASE_ADDRESS_0 + (4 * 4) + 1,
1253                                          size);
1254     trace_vfio_ati_3c3_quirk_read(data);
1255
1256     return data;
1257 }
1258
/* Access callbacks for the ATI 0x3c3 quirk; only a read handler is set */
static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1263
1264 static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
1265 {
1266     PCIDevice *pdev = &vdev->pdev;
1267     VFIOQuirk *quirk;
1268
1269     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1270         return;
1271     }
1272
1273     /*
1274      * As long as the BAR is >= 256 bytes it will be aligned such that the
1275      * lower byte is always zero.  Filter out anything else, if it exists.
1276      */
1277     if (!vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
1278         return;
1279     }
1280
1281     quirk = g_malloc0(sizeof(*quirk));
1282     quirk->vdev = vdev;
1283
1284     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk,
1285                           "vfio-ati-3c3-quirk", 1);
1286     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1287                                 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem);
1288
1289     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1290                       quirk, next);
1291
1292     trace_vfio_vga_probe_ati_3c3_quirk(vdev->vbasedev.name);
1293 }
1294
1295 /*
1296  * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI
1297  * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
1298  * the MMIO space directly, but a window to this space is provided through
1299  * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
1300  * data register.  When the address is programmed to a range of 0x4000-0x4fff
1301  * PCI configuration space is available.  Experimentation seems to indicate
1302  * that only read-only access is provided, but we drop writes when the window
1303  * is enabled to config space nonetheless.
1304  */
1305 static void vfio_probe_ati_bar4_window_quirk(VFIOPCIDevice *vdev, int nr)
1306 {
1307     PCIDevice *pdev = &vdev->pdev;
1308     VFIOQuirk *quirk;
1309
1310     if (!vdev->has_vga || nr != 4 ||
1311         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1312         return;
1313     }
1314
1315     quirk = g_malloc0(sizeof(*quirk));
1316     quirk->vdev = vdev;
1317     quirk->data.address_size = 4;
1318     quirk->data.data_offset = 4;
1319     quirk->data.data_size = 4;
1320     quirk->data.address_match = 0x4000;
1321     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1322     quirk->data.bar = nr;
1323     quirk->data.read_flags = quirk->data.write_flags = 1;
1324
1325     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1326                           &vfio_generic_window_quirk, quirk,
1327                           "vfio-ati-bar4-window-quirk", 8);
1328     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1329                           quirk->data.base_offset, &quirk->mem, 1);
1330
1331     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1332
1333     trace_vfio_probe_ati_bar4_window_quirk(vdev->vbasedev.name);
1334 }
1335
/* Vendor ID compared against PCI_VENDOR_ID by the RTL8168 quirk probe */
#define PCI_VENDOR_ID_REALTEK 0x10ec
1337
1338 /*
1339  * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
1340  * offset 0x70 there is a dword data register, offset 0x74 is a dword address
1341  * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1342  * when the "type" portion of the address register is set to 0x1.  This appears
1343  * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1344  * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1345  * ignore because the MSI-X table should always be accessed as a dword (full
 * mask).  Bits 0:11 are the offset within the type.
1347  *
1348  * Example trace:
1349  *
1350  * Read from MSI-X table offset 0
1351  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1352  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1353  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1354  *
1355  * Write 0xfee00000 to MSI-X table offset 0
1356  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1357  * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1358  * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1359  */
1360
1361 static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
1362                                                hwaddr addr, unsigned size)
1363 {
1364     VFIOQuirk *quirk = opaque;
1365     VFIOPCIDevice *vdev = quirk->vdev;
1366     uint64_t val = 0;
1367
1368     if (!quirk->data.flags) { /* Non-MSI-X table access */
1369         return vfio_region_read(&vdev->bars[quirk->data.bar].region,
1370                                 addr + 0x70, size);
1371     }
1372
1373     switch (addr) {
1374     case 4: /* address */
1375         val = quirk->data.address_match ^ 0x80000000U; /* latch/complete */
1376         break;
1377     case 0: /* data */
1378         if ((vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1379             memory_region_dispatch_read(&vdev->pdev.msix_table_mmio,
1380                                 (hwaddr)(quirk->data.address_match & 0xfff),
1381                                 &val, size, MEMTXATTRS_UNSPECIFIED);
1382         }
1383         break;
1384     }
1385
1386     trace_vfio_rtl8168_quirk_read(vdev->vbasedev.name,
1387                                   addr ? "address" : "data", val);
1388     return val;
1389 }
1390
/*
 * Write handler for the RTL8168 window quirk; addr 0 is the data register
 * and addr 4 the address register.
 *
 * Data writes are remembered in quirk->data.address_mask and also forwarded
 * to the device.  An address write whose bits 16:30 equal 0x1 selects the
 * MSI-X table: it activates the read quirk and records the address.  If
 * bit 31 is set as well, the remembered data is written to the emulated
 * MSI-X table (when the MSI-X capability is present) and nothing is
 * forwarded to the device.  Any other address write de-activates the read
 * quirk and is forwarded to the device.
 */
static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
                                            uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;

    switch (addr) {
    case 4: /* address */
        if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
            quirk->data.flags = 1; /* Activate reads */
            quirk->data.address_match = data;

            trace_vfio_rtl8168_quirk_write(vdev->vbasedev.name, data);

            if (data & 0x80000000U) { /* Do write */
                if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
                    hwaddr offset = data & 0xfff;
                    /* address_mask holds the last value written to data */
                    uint64_t val = quirk->data.address_mask;

                    trace_vfio_rtl8168_quirk_msix(vdev->vbasedev.name,
                                                  (uint16_t)offset, val);

                    /* Write to the proper guest MSI-X table instead */
                    memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
                                                 offset, val, size,
                                                 MEMTXATTRS_UNSPECIFIED);
                }
                return; /* Do not write guest MSI-X data to hardware */
            }
        } else {
            quirk->data.flags = 0; /* De-activate reads, non-MSI-X */
        }
        break;
    case 0: /* data */
        quirk->data.address_mask = data;
        break;
    }

    /* Everything that did not return above is forwarded to the device */
    vfio_region_write(&vdev->bars[quirk->data.bar].region,
                      addr + 0x70, data, size);
}
1432
/*
 * Access callbacks for the RTL8168 window quirk.  The .valid constraints
 * declare only aligned 4-byte accesses as valid.
 */
static const MemoryRegionOps vfio_rtl8168_window_quirk = {
    .read = vfio_rtl8168_window_quirk_read,
    .write = vfio_rtl8168_window_quirk_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1443
1444 static void vfio_probe_rtl8168_bar2_window_quirk(VFIOPCIDevice *vdev, int nr)
1445 {
1446     PCIDevice *pdev = &vdev->pdev;
1447     VFIOQuirk *quirk;
1448
1449     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK ||
1450         pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) {
1451         return;
1452     }
1453
1454     quirk = g_malloc0(sizeof(*quirk));
1455     quirk->vdev = vdev;
1456     quirk->data.bar = nr;
1457
1458     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk,
1459                           quirk, "vfio-rtl8168-window-quirk", 8);
1460     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1461                                         0x70, &quirk->mem, 1);
1462
1463     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1464
1465     trace_vfio_rtl8168_quirk_enable(vdev->vbasedev.name);
1466 }
1467
1468 /*
1469  * Trap the BAR2 MMIO window to config space as well.
1470  */
1471 static void vfio_probe_ati_bar2_4000_quirk(VFIOPCIDevice *vdev, int nr)
1472 {
1473     PCIDevice *pdev = &vdev->pdev;
1474     VFIOQuirk *quirk;
1475
1476     /* Only enable on newer devices where BAR2 is 64bit */
1477     if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 ||
1478         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) {
1479         return;
1480     }
1481
1482     quirk = g_malloc0(sizeof(*quirk));
1483     quirk->vdev = vdev;
1484     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1485     quirk->data.address_match = 0x4000;
1486     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1487     quirk->data.bar = nr;
1488
1489     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1490                           "vfio-ati-bar2-4000-quirk",
1491                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1492     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1493                           quirk->data.address_match & TARGET_PAGE_MASK,
1494                           &quirk->mem, 1);
1495
1496     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1497
1498     trace_vfio_probe_ati_bar2_4000_quirk(vdev->vbasedev.name);
1499 }
1500
1501 /*
1502  * Older ATI/AMD cards like the X550 have a similar window to that above.
1503  * I/O port BAR1 provides a window to a mirror of PCI config space located
1504  * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
1505  * note it for future reference.
1506  */
1507
/* Vendor ID compared against PCI_VENDOR_ID by the Nvidia quirk probes */
#define PCI_VENDOR_ID_NVIDIA                    0x10de
1509
1510 /*
1511  * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
1513  * https://github.com/pathscale/envytools/tree/master/hwdocs
1514  *
1515  * The first quirk is actually not documented in envytools and is found
1516  * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
1517  * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
1518  * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
1519  * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
1520  * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
1521  * is written for a write to 0x3d4.  The BAR0 offset is then accessible
1522  * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
1523  * that use the I/O port BAR5 window but it doesn't hurt to leave it.
1524  */
/* States of the 0x3d4/0x3d0 backdoor sequence, kept in quirk->data.flags */
enum {
    NV_3D0_NONE   = 0, /* idle, no sequence in progress */
    NV_3D0_SELECT = 1, /* 0x338 was written to the address offset */
    NV_3D0_WINDOW = 2, /* a matching target offset has been latched */
    NV_3D0_READ   = 3, /* 0x538 written: next data read is trapped */
    NV_3D0_WRITE  = 4, /* 0x738 written: next data write is trapped */
};
1532
1533 static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
1534                                            hwaddr addr, unsigned size)
1535 {
1536     VFIOQuirk *quirk = opaque;
1537     VFIOPCIDevice *vdev = quirk->vdev;
1538     PCIDevice *pdev = &vdev->pdev;
1539     uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1540                                   addr + quirk->data.base_offset, size);
1541
1542     if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) {
1543         data = vfio_pci_read_config(pdev, quirk->data.address_val, size);
1544         trace_vfio_nvidia_3d0_quirk_read(size, data);
1545     }
1546
1547     quirk->data.flags = NV_3D0_NONE;
1548
1549     return data;
1550 }
1551
/*
 * Write handler for the Nvidia 0x3d0 quirk.  It advances the state kept in
 * quirk->data.flags:
 *
 *   NONE   -> SELECT      on 0x338 written to the address offset
 *   SELECT -> WINDOW      on a data offset write matching address_match
 *                         outside address_mask; the low bits are latched
 *   WINDOW -> READ/WRITE  on 0x538 / 0x738 written to the address offset
 *   WRITE  -> NONE        a data offset write goes to emulated config
 *                         space and is not forwarded to the VGA region
 *
 * Any write that does not continue the sequence resets SELECT, WINDOW and
 * WRITE to NONE.  There is no case for NV_3D0_READ, so a write leaves that
 * state unchanged.  All writes other than the trapped config space write
 * are forwarded to the high VGA I/O region.
 */
static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
                                        uint64_t data, unsigned size)
{
    VFIOQuirk *quirk = opaque;
    VFIOPCIDevice *vdev = quirk->vdev;
    PCIDevice *pdev = &vdev->pdev;

    switch (quirk->data.flags) {
    case NV_3D0_NONE:
        if (addr == quirk->data.address_offset && data == 0x338) {
            quirk->data.flags = NV_3D0_SELECT;
        }
        break;
    case NV_3D0_SELECT:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.data_offset &&
            (data & ~quirk->data.address_mask) == quirk->data.address_match) {
            quirk->data.flags = NV_3D0_WINDOW;
            quirk->data.address_val = data & quirk->data.address_mask;
        }
        break;
    case NV_3D0_WINDOW:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.address_offset) {
            if (data == 0x538) {
                quirk->data.flags = NV_3D0_READ;
            } else if (data == 0x738) {
                quirk->data.flags = NV_3D0_WRITE;
            }
        }
        break;
    case NV_3D0_WRITE:
        quirk->data.flags = NV_3D0_NONE;
        if (addr == quirk->data.data_offset) {
            vfio_pci_write_config(pdev, quirk->data.address_val, data, size);
            trace_vfio_nvidia_3d0_quirk_write(data, size);
            return; /* trapped: not forwarded to the VGA region */
        }
        break;
    }

    vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI],
                   addr + quirk->data.base_offset, data, size);
}
1596
/* Access callbacks for the Nvidia 0x3d0 VGA I/O port quirk */
static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1602
1603 static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
1604 {
1605     PCIDevice *pdev = &vdev->pdev;
1606     VFIOQuirk *quirk;
1607
1608     if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA ||
1609         !vdev->bars[1].region.size) {
1610         return;
1611     }
1612
1613     quirk = g_malloc0(sizeof(*quirk));
1614     quirk->vdev = vdev;
1615     quirk->data.base_offset = 0x10;
1616     quirk->data.address_offset = 4;
1617     quirk->data.address_size = 2;
1618     quirk->data.address_match = 0x1800;
1619     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1620     quirk->data.data_offset = 0;
1621     quirk->data.data_size = 4;
1622
1623     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk,
1624                           quirk, "vfio-nvidia-3d0-quirk", 6);
1625     memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1626                                 quirk->data.base_offset, &quirk->mem);
1627
1628     QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks,
1629                       quirk, next);
1630
1631     trace_vfio_vga_probe_nvidia_3d0_quirk(vdev->vbasedev.name);
1632 }
1633
1634 /*
1635  * The second quirk is documented in envytools.  The I/O port BAR5 is just
1636  * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above
 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
 * space, including extended space, which is available in the 4k @0x88000.
1640  */
/* Flag bits tracked in quirk->data.flags for the BAR5 window quirk */
enum {
    NV_BAR5_ADDRESS = 1 << 0, /* address register points at config space */
    NV_BAR5_ENABLE  = 1 << 1, /* bit 0 of the register at offset 0x4 */
    NV_BAR5_MASTER  = 1 << 2, /* bit 0 of the register at offset 0x0 */
    NV_BAR5_VALID   = NV_BAR5_ADDRESS | NV_BAR5_ENABLE | NV_BAR5_MASTER,
};
1647
1648 static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr,
1649                                                 uint64_t data, unsigned size)
1650 {
1651     VFIOQuirk *quirk = opaque;
1652
1653     switch (addr) {
1654     case 0x0:
1655         if (data & 0x1) {
1656             quirk->data.flags |= NV_BAR5_MASTER;
1657         } else {
1658             quirk->data.flags &= ~NV_BAR5_MASTER;
1659         }
1660         break;
1661     case 0x4:
1662         if (data & 0x1) {
1663             quirk->data.flags |= NV_BAR5_ENABLE;
1664         } else {
1665             quirk->data.flags &= ~NV_BAR5_ENABLE;
1666         }
1667         break;
1668     case 0x8:
1669         if (quirk->data.flags & NV_BAR5_MASTER) {
1670             if ((data & ~0xfff) == 0x88000) {
1671                 quirk->data.flags |= NV_BAR5_ADDRESS;
1672                 quirk->data.address_val = data & 0xfff;
1673             } else if ((data & ~0xff) == 0x1800) {
1674                 quirk->data.flags |= NV_BAR5_ADDRESS;
1675                 quirk->data.address_val = data & 0xff;
1676             } else {
1677                 quirk->data.flags &= ~NV_BAR5_ADDRESS;
1678             }
1679         }
1680         break;
1681     }
1682
1683     vfio_generic_window_quirk_write(opaque, addr, data, size);
1684 }
1685
/*
 * Access callbacks for the Nvidia BAR5 window quirk: reads use the generic
 * window handler, writes go through the BAR5-specific flag tracking first.
 */
static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = {
    .read = vfio_generic_window_quirk_read,
    .write = vfio_nvidia_bar5_window_quirk_write,
    .valid.min_access_size = 4,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1692
1693 static void vfio_probe_nvidia_bar5_window_quirk(VFIOPCIDevice *vdev, int nr)
1694 {
1695     PCIDevice *pdev = &vdev->pdev;
1696     VFIOQuirk *quirk;
1697
1698     if (!vdev->has_vga || nr != 5 ||
1699         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1700         return;
1701     }
1702
1703     quirk = g_malloc0(sizeof(*quirk));
1704     quirk->vdev = vdev;
1705     quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID;
1706     quirk->data.address_offset = 0x8;
1707     quirk->data.address_size = 0; /* actually 4, but avoids generic code */
1708     quirk->data.data_offset = 0xc;
1709     quirk->data.data_size = 4;
1710     quirk->data.bar = nr;
1711
1712     memory_region_init_io(&quirk->mem, OBJECT(vdev),
1713                           &vfio_nvidia_bar5_window_quirk, quirk,
1714                           "vfio-nvidia-bar5-window-quirk", 16);
1715     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1716                                         0, &quirk->mem, 1);
1717
1718     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1719
1720     trace_vfio_probe_nvidia_bar5_window_quirk(vdev->vbasedev.name);
1721 }
1722
1723 static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
1724                                           uint64_t data, unsigned size)
1725 {
1726     VFIOQuirk *quirk = opaque;
1727     VFIOPCIDevice *vdev = quirk->vdev;
1728     PCIDevice *pdev = &vdev->pdev;
1729     hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
1730
1731     vfio_generic_quirk_write(opaque, addr, data, size);
1732
1733     /*
1734      * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
1735      * MSI capability ID register.  Both the ID and next register are
1736      * read-only, so we allow writes covering either of those to real hw.
1737      * NB - only fixed for the 0x88000 MMIO window.
1738      */
1739     if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
1740         vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
1741         vfio_region_write(&vdev->bars[quirk->data.bar].region,
1742                           addr + base, data, size);
1743     }
1744 }
1745
1746 static const MemoryRegionOps vfio_nvidia_88000_quirk = {
1747     .read = vfio_generic_quirk_read,
1748     .write = vfio_nvidia_88000_quirk_write,
1749     .endianness = DEVICE_LITTLE_ENDIAN,
1750 };
1751
1752 /*
1753  * Finally, BAR0 itself.  We want to redirect any accesses to either
1754  * 0x1800 or 0x88000 through the PCI config space access functions.
1755  *
1756  * NB - quirk at a page granularity or else they don't seem to work when
1757  *      BARs are mmap'd
1758  *
1759  * Here's offset 0x88000...
1760  */
1761 static void vfio_probe_nvidia_bar0_88000_quirk(VFIOPCIDevice *vdev, int nr)
1762 {
1763     PCIDevice *pdev = &vdev->pdev;
1764     VFIOQuirk *quirk;
1765     uint16_t vendor, class;
1766
1767     vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
1768     class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
1769
1770     if (nr != 0 || vendor != PCI_VENDOR_ID_NVIDIA ||
1771         class != PCI_CLASS_DISPLAY_VGA) {
1772         return;
1773     }
1774
1775     quirk = g_malloc0(sizeof(*quirk));
1776     quirk->vdev = vdev;
1777     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1778     quirk->data.address_match = 0x88000;
1779     quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
1780     quirk->data.bar = nr;
1781
1782     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
1783                           quirk, "vfio-nvidia-bar0-88000-quirk",
1784                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1785     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1786                           quirk->data.address_match & TARGET_PAGE_MASK,
1787                           &quirk->mem, 1);
1788
1789     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1790
1791     trace_vfio_probe_nvidia_bar0_88000_quirk(vdev->vbasedev.name);
1792 }
1793
1794 /*
1795  * And here's the same for BAR0 offset 0x1800...
1796  */
1797 static void vfio_probe_nvidia_bar0_1800_quirk(VFIOPCIDevice *vdev, int nr)
1798 {
1799     PCIDevice *pdev = &vdev->pdev;
1800     VFIOQuirk *quirk;
1801
1802     if (!vdev->has_vga || nr != 0 ||
1803         pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) {
1804         return;
1805     }
1806
1807     /* Log the chipset ID */
1808     trace_vfio_probe_nvidia_bar0_1800_quirk_id(
1809             (unsigned int)(vfio_region_read(&vdev->bars[0].region, 0, 4) >> 20)
1810             & 0xff);
1811
1812     quirk = g_malloc0(sizeof(*quirk));
1813     quirk->vdev = vdev;
1814     quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1;
1815     quirk->data.address_match = 0x1800;
1816     quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1;
1817     quirk->data.bar = nr;
1818
1819     memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk,
1820                           "vfio-nvidia-bar0-1800-quirk",
1821                           TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
1822     memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem,
1823                           quirk->data.address_match & TARGET_PAGE_MASK,
1824                           &quirk->mem, 1);
1825
1826     QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1827
1828     trace_vfio_probe_nvidia_bar0_1800_quirk(vdev->vbasedev.name);
1829 }
1830
1831 /*
1832  * TODO - Some Nvidia devices provide config access to their companion HDA
1833  * device and even to their parent bridge via these config space mirrors.
1834  * Add quirks for those regions.
1835  */
1836
1837 /*
1838  * Common quirk probe entry points.
1839  */
1840 static void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1841 {
1842     vfio_vga_probe_ati_3c3_quirk(vdev);
1843     vfio_vga_probe_nvidia_3d0_quirk(vdev);
1844 }
1845
1846 static void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev)
1847 {
1848     VFIOQuirk *quirk;
1849     int i;
1850
1851     for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1852         QLIST_FOREACH(quirk, &vdev->vga.region[i].quirks, next) {
1853             memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
1854         }
1855     }
1856 }
1857
1858 static void vfio_vga_quirk_free(VFIOPCIDevice *vdev)
1859 {
1860     int i;
1861
1862     for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) {
1863         while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
1864             VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
1865             object_unparent(OBJECT(&quirk->mem));
1866             QLIST_REMOVE(quirk, next);
1867             g_free(quirk);
1868         }
1869     }
1870 }
1871
1872 static void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1873 {
1874     vfio_probe_ati_bar4_window_quirk(vdev, nr);
1875     vfio_probe_ati_bar2_4000_quirk(vdev, nr);
1876     vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
1877     vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
1878     vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
1879     vfio_probe_rtl8168_bar2_window_quirk(vdev, nr);
1880 }
1881
1882 static void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr)
1883 {
1884     VFIOBAR *bar = &vdev->bars[nr];
1885     VFIOQuirk *quirk;
1886
1887     QLIST_FOREACH(quirk, &bar->quirks, next) {
1888         memory_region_del_subregion(&bar->region.mem, &quirk->mem);
1889     }
1890 }
1891
1892 static void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr)
1893 {
1894     VFIOBAR *bar = &vdev->bars[nr];
1895
1896     while (!QLIST_EMPTY(&bar->quirks)) {
1897         VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1898         object_unparent(OBJECT(&quirk->mem));
1899         QLIST_REMOVE(quirk, next);
1900         g_free(quirk);
1901     }
1902 }
1903
1904 /*
1905  * PCI config space
1906  */
1907 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1908 {
1909     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
1910     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1911
1912     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1913     emu_bits = le32_to_cpu(emu_bits);
1914
1915     if (emu_bits) {
1916         emu_val = pci_default_read_config(pdev, addr, len);
1917     }
1918
1919     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1920         ssize_t ret;
1921
1922         ret = pread(vdev->vbasedev.fd, &phys_val, len,
1923                     vdev->config_offset + addr);
1924         if (ret != len) {
1925             error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
1926                          __func__, vdev->host.domain, vdev->host.bus,
1927                          vdev->host.slot, vdev->host.function, addr, len);
1928             return -errno;
1929         }
1930         phys_val = le32_to_cpu(phys_val);
1931     }
1932
1933     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1934
1935     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1936
1937     return val;
1938 }
1939
1940 static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
1941                                   uint32_t val, int len)
1942 {
1943     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
1944     uint32_t val_le = cpu_to_le32(val);
1945
1946     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1947
1948     /* Write everything to VFIO, let it filter out what we can't write */
1949     if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
1950                 != len) {
1951         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
1952                      __func__, vdev->host.domain, vdev->host.bus,
1953                      vdev->host.slot, vdev->host.function, addr, val, len);
1954     }
1955
1956     /* MSI/MSI-X Enabling/Disabling */
1957     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1958         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1959         int is_enabled, was_enabled = msi_enabled(pdev);
1960
1961         pci_default_write_config(pdev, addr, val, len);
1962
1963         is_enabled = msi_enabled(pdev);
1964
1965         if (!was_enabled) {
1966             if (is_enabled) {
1967                 vfio_msi_enable(vdev);
1968             }
1969         } else {
1970             if (!is_enabled) {
1971                 vfio_msi_disable(vdev);
1972             } else {
1973                 vfio_update_msi(vdev);
1974             }
1975         }
1976     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1977         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1978         int is_enabled, was_enabled = msix_enabled(pdev);
1979
1980         pci_default_write_config(pdev, addr, val, len);
1981
1982         is_enabled = msix_enabled(pdev);
1983
1984         if (!was_enabled && is_enabled) {
1985             vfio_msix_enable(vdev);
1986         } else if (was_enabled && !is_enabled) {
1987             vfio_msix_disable(vdev);
1988         }
1989     } else {
1990         /* Write everything to QEMU to keep emulated bits correct */
1991         pci_default_write_config(pdev, addr, val, len);
1992     }
1993 }
1994
1995 /*
1996  * Interrupt setup
1997  */
1998 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1999 {
2000     /*
2001      * More complicated than it looks.  Disabling MSI/X transitions the
2002      * device to INTx mode (if supported).  Therefore we need to first
2003      * disable MSI/X and then cleanup by disabling INTx.
2004      */
2005     if (vdev->interrupt == VFIO_INT_MSIX) {
2006         vfio_msix_disable(vdev);
2007     } else if (vdev->interrupt == VFIO_INT_MSI) {
2008         vfio_msi_disable(vdev);
2009     }
2010
2011     if (vdev->interrupt == VFIO_INT_INTx) {
2012         vfio_intx_disable(vdev);
2013     }
2014 }
2015
2016 static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos)
2017 {
2018     uint16_t ctrl;
2019     bool msi_64bit, msi_maskbit;
2020     int ret, entries;
2021
2022     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
2023               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2024         return -errno;
2025     }
2026     ctrl = le16_to_cpu(ctrl);
2027
2028     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
2029     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
2030     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
2031
2032     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
2033
2034     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
2035     if (ret < 0) {
2036         if (ret == -ENOTSUP) {
2037             return 0;
2038         }
2039         error_report("vfio: msi_init failed");
2040         return ret;
2041     }
2042     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
2043
2044     return 0;
2045 }
2046
2047 /*
2048  * We don't have any control over how pci_add_capability() inserts
2049  * capabilities into the chain.  In order to setup MSI-X we need a
2050  * MemoryRegion for the BAR.  In order to setup the BAR and not
2051  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
2052  * need to first look for where the MSI-X table lives.  So we
2053  * unfortunately split MSI-X setup across two functions.
2054  */
2055 static int vfio_msix_early_setup(VFIOPCIDevice *vdev)
2056 {
2057     uint8_t pos;
2058     uint16_t ctrl;
2059     uint32_t table, pba;
2060     int fd = vdev->vbasedev.fd;
2061     VFIOMSIXInfo *msix;
2062
2063     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
2064     if (!pos) {
2065         return 0;
2066     }
2067
2068     if (pread(fd, &ctrl, sizeof(ctrl),
2069               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
2070         return -errno;
2071     }
2072
2073     if (pread(fd, &table, sizeof(table),
2074               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
2075         return -errno;
2076     }
2077
2078     if (pread(fd, &pba, sizeof(pba),
2079               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
2080         return -errno;
2081     }
2082
2083     ctrl = le16_to_cpu(ctrl);
2084     table = le32_to_cpu(table);
2085     pba = le32_to_cpu(pba);
2086
2087     msix = g_malloc0(sizeof(*msix));
2088     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
2089     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
2090     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
2091     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
2092     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
2093
2094     /*
2095      * Test the size of the pba_offset variable and catch if it extends outside
2096      * of the specified BAR. If it is the case, we need to apply a hardware
2097      * specific quirk if the device is known or we have a broken configuration.
2098      */
2099     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
2100         PCIDevice *pdev = &vdev->pdev;
2101         uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
2102         uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID);
2103
2104         /*
2105          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
2106          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
2107          * the VF PBA offset while the BAR itself is only 8k. The correct value
2108          * is 0x1000, so we hard code that here.
2109          */
2110         if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) {
2111             msix->pba_offset = 0x1000;
2112         } else {
2113             error_report("vfio: Hardware reports invalid configuration, "
2114                          "MSIX PBA outside of specified BAR");
2115             g_free(msix);
2116             return -EINVAL;
2117         }
2118     }
2119
2120     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
2121                                 msix->table_offset, msix->entries);
2122     vdev->msix = msix;
2123
2124     return 0;
2125 }
2126
2127 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos)
2128 {
2129     int ret;
2130
2131     ret = msix_init(&vdev->pdev, vdev->msix->entries,
2132                     &vdev->bars[vdev->msix->table_bar].region.mem,
2133                     vdev->msix->table_bar, vdev->msix->table_offset,
2134                     &vdev->bars[vdev->msix->pba_bar].region.mem,
2135                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
2136     if (ret < 0) {
2137         if (ret == -ENOTSUP) {
2138             return 0;
2139         }
2140         error_report("vfio: msix_init failed");
2141         return ret;
2142     }
2143
2144     return 0;
2145 }
2146
2147 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
2148 {
2149     msi_uninit(&vdev->pdev);
2150
2151     if (vdev->msix) {
2152         msix_uninit(&vdev->pdev,
2153                     &vdev->bars[vdev->msix->table_bar].region.mem,
2154                     &vdev->bars[vdev->msix->pba_bar].region.mem);
2155     }
2156 }
2157
2158 /*
2159  * Resource setup
2160  */
2161 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
2162 {
2163     int i;
2164
2165     for (i = 0; i < PCI_ROM_SLOT; i++) {
2166         VFIOBAR *bar = &vdev->bars[i];
2167
2168         if (!bar->region.size) {
2169             continue;
2170         }
2171
2172         memory_region_set_enabled(&bar->region.mmap_mem, enabled);
2173         if (vdev->msix && vdev->msix->table_bar == i) {
2174             memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
2175         }
2176     }
2177 }
2178
2179 static void vfio_unregister_bar(VFIOPCIDevice *vdev, int nr)
2180 {
2181     VFIOBAR *bar = &vdev->bars[nr];
2182
2183     if (!bar->region.size) {
2184         return;
2185     }
2186
2187     vfio_bar_quirk_teardown(vdev, nr);
2188
2189     memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem);
2190
2191     if (vdev->msix && vdev->msix->table_bar == nr) {
2192         memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem);
2193     }
2194 }
2195
2196 static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr)
2197 {
2198     VFIOBAR *bar = &vdev->bars[nr];
2199
2200     if (!bar->region.size) {
2201         return;
2202     }
2203
2204     vfio_bar_quirk_free(vdev, nr);
2205
2206     munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem));
2207
2208     if (vdev->msix && vdev->msix->table_bar == nr) {
2209         munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
2210     }
2211 }
2212
2213 static void vfio_map_bar(VFIOPCIDevice *vdev, int nr)
2214 {
2215     VFIOBAR *bar = &vdev->bars[nr];
2216     uint64_t size = bar->region.size;
2217     char name[64];
2218     uint32_t pci_bar;
2219     uint8_t type;
2220     int ret;
2221
2222     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
2223     if (!size) {
2224         return;
2225     }
2226
2227     snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
2228              vdev->host.domain, vdev->host.bus, vdev->host.slot,
2229              vdev->host.function, nr);
2230
2231     /* Determine what type of BAR this is for registration */
2232     ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
2233                 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
2234     if (ret != sizeof(pci_bar)) {
2235         error_report("vfio: Failed to read BAR %d (%m)", nr);
2236         return;
2237     }
2238
2239     pci_bar = le32_to_cpu(pci_bar);
2240     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
2241     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
2242     type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
2243                                     ~PCI_BASE_ADDRESS_MEM_MASK);
2244
2245     /* A "slow" read/write mapping underlies all BARs */
2246     memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops,
2247                           bar, name, size);
2248     pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem);
2249
2250     /*
2251      * We can't mmap areas overlapping the MSIX vector table, so we
2252      * potentially insert a direct-mapped subregion before and after it.
2253      */
2254     if (vdev->msix && vdev->msix->table_bar == nr) {
2255         size = vdev->msix->table_offset & qemu_real_host_page_mask;
2256     }
2257
2258     strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
2259     if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
2260                       &bar->region.mmap_mem, &bar->region.mmap,
2261                       size, 0, name)) {
2262         error_report("%s unsupported. Performance may be slow", name);
2263     }
2264
2265     if (vdev->msix && vdev->msix->table_bar == nr) {
2266         uint64_t start;
2267
2268         start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
2269                                      (vdev->msix->entries *
2270                                       PCI_MSIX_ENTRY_SIZE));
2271
2272         size = start < bar->region.size ? bar->region.size - start : 0;
2273         strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
2274         /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
2275         if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
2276                           &vdev->msix->mmap_mem,
2277                           &vdev->msix->mmap, size, start, name)) {
2278             error_report("%s unsupported. Performance may be slow", name);
2279         }
2280     }
2281
2282     vfio_bar_quirk_setup(vdev, nr);
2283 }
2284
2285 static void vfio_map_bars(VFIOPCIDevice *vdev)
2286 {
2287     int i;
2288
2289     for (i = 0; i < PCI_ROM_SLOT; i++) {
2290         vfio_map_bar(vdev, i);
2291     }
2292
2293     if (vdev->has_vga) {
2294         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2295                               OBJECT(vdev), &vfio_vga_ops,
2296                               &vdev->vga.region[QEMU_PCI_VGA_MEM],
2297                               "vfio-vga-mmio@0xa0000",
2298                               QEMU_PCI_VGA_MEM_SIZE);
2299         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2300                               OBJECT(vdev), &vfio_vga_ops,
2301                               &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
2302                               "vfio-vga-io@0x3b0",
2303                               QEMU_PCI_VGA_IO_LO_SIZE);
2304         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
2305                               OBJECT(vdev), &vfio_vga_ops,
2306                               &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
2307                               "vfio-vga-io@0x3c0",
2308                               QEMU_PCI_VGA_IO_HI_SIZE);
2309
2310         pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
2311                          &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
2312                          &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
2313         vfio_vga_quirk_setup(vdev);
2314     }
2315 }
2316
2317 static void vfio_unregister_bars(VFIOPCIDevice *vdev)
2318 {
2319     int i;
2320
2321     for (i = 0; i < PCI_ROM_SLOT; i++) {
2322         vfio_unregister_bar(vdev, i);
2323     }
2324
2325     if (vdev->has_vga) {
2326         vfio_vga_quirk_teardown(vdev);
2327         pci_unregister_vga(&vdev->pdev);
2328     }
2329 }
2330
2331 static void vfio_unmap_bars(VFIOPCIDevice *vdev)
2332 {
2333     int i;
2334
2335     for (i = 0; i < PCI_ROM_SLOT; i++) {
2336         vfio_unmap_bar(vdev, i);
2337     }
2338
2339     if (vdev->has_vga) {
2340         vfio_vga_quirk_free(vdev);
2341     }
2342 }
2343
2344 /*
2345  * General setup
2346  */
2347 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
2348 {
2349     uint8_t tmp, next = 0xff;
2350
2351     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
2352          tmp = pdev->config[tmp + 1]) {
2353         if (tmp > pos && tmp < next) {
2354             next = tmp;
2355         }
2356     }
2357
2358     return next - pos;
2359 }
2360
/*
 * Update the 16-bit word at @buf: the bits selected by @mask are cleared
 * and @val is then ORed in.  @val is not masked, so any bit set in @val
 * ends up set in the word.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t word = pci_get_word(buf);

    pci_set_word(buf, (word & ~mask) | val);
}
2365
2366 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
2367                                    uint16_t val, uint16_t mask)
2368 {
2369     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
2370     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
2371     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
2372 }
2373
/*
 * Update the 32-bit value at @buf: the bits selected by @mask are cleared
 * and @val is then ORed in.  @val is not masked, so any bit set in @val
 * ends up set in the result.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t dword = pci_get_long(buf);

    pci_set_long(buf, (dword & ~mask) | val);
}
2378
2379 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
2380                                    uint32_t val, uint32_t mask)
2381 {
2382     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
2383     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
2384     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
2385 }
2386
2387 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
2388 {
2389     uint16_t flags;
2390     uint8_t type;
2391
2392     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
2393     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
2394
2395     if (type != PCI_EXP_TYPE_ENDPOINT &&
2396         type != PCI_EXP_TYPE_LEG_END &&
2397         type != PCI_EXP_TYPE_RC_END) {
2398
2399         error_report("vfio: Assignment of PCIe type 0x%x "
2400                      "devices is not currently supported", type);
2401         return -EINVAL;
2402     }
2403
2404     if (!pci_bus_is_express(vdev->pdev.bus)) {
2405         /*
2406          * Use express capability as-is on PCI bus.  It doesn't make much
2407          * sense to even expose, but some drivers (ex. tg3) depend on it
2408          * and guests don't seem to be particular about it.  We'll need
2409          * to revist this or force express devices to express buses if we
2410          * ever expose an IOMMU to the guest.
2411          */
2412     } else if (pci_bus_is_root(vdev->pdev.bus)) {
2413         /*
2414          * On a Root Complex bus Endpoints become Root Complex Integrated
2415          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
2416          */
2417         if (type == PCI_EXP_TYPE_ENDPOINT) {
2418             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2419                                    PCI_EXP_TYPE_RC_END << 4,
2420                                    PCI_EXP_FLAGS_TYPE);
2421
2422             /* Link Capabilities, Status, and Control goes away */
2423             if (size > PCI_EXP_LNKCTL) {
2424                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
2425                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2426                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
2427
2428 #ifndef PCI_EXP_LNKCAP2
2429 #define PCI_EXP_LNKCAP2 44
2430 #endif
2431 #ifndef PCI_EXP_LNKSTA2
2432 #define PCI_EXP_LNKSTA2 50
2433 #endif
2434                 /* Link 2 Capabilities, Status, and Control goes away */
2435                 if (size > PCI_EXP_LNKCAP2) {
2436                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
2437                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
2438                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
2439                 }
2440             }
2441
2442         } else if (type == PCI_EXP_TYPE_LEG_END) {
2443             /*
2444              * Legacy endpoints don't belong on the root complex.  Windows
2445              * seems to be happier with devices if we skip the capability.
2446              */
2447             return 0;
2448         }
2449
2450     } else {
2451         /*
2452          * Convert Root Complex Integrated Endpoints to regular endpoints.
2453          * These devices don't support LNK/LNK2 capabilities, so make them up.
2454          */
2455         if (type == PCI_EXP_TYPE_RC_END) {
2456             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
2457                                    PCI_EXP_TYPE_ENDPOINT << 4,
2458                                    PCI_EXP_FLAGS_TYPE);
2459             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
2460                                    PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
2461             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
2462         }
2463
2464         /* Mark the Link Status bits as emulated to allow virtual negotiation */
2465         vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
2466                                pci_get_word(vdev->pdev.config + pos +
2467                                             PCI_EXP_LNKSTA),
2468                                PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
2469     }
2470
2471     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
2472     if (pos >= 0) {
2473         vdev->pdev.exp.exp_cap = pos;
2474     }
2475
2476     return pos;
2477 }
2478
2479 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
2480 {
2481     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
2482
2483     if (cap & PCI_EXP_DEVCAP_FLR) {
2484         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
2485         vdev->has_flr = true;
2486     }
2487 }
2488
2489 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
2490 {
2491     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
2492
2493     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
2494         trace_vfio_check_pm_reset(vdev->vbasedev.name);
2495         vdev->has_pm_reset = true;
2496     }
2497 }
2498
2499 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
2500 {
2501     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
2502
2503     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
2504         trace_vfio_check_af_flr(vdev->vbasedev.name);
2505         vdev->has_flr = true;
2506     }
2507 }
2508
2509 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
2510 {
2511     PCIDevice *pdev = &vdev->pdev;
2512     uint8_t cap_id, next, size;
2513     int ret;
2514
2515     cap_id = pdev->config[pos];
2516     next = pdev->config[pos + 1];
2517
2518     /*
2519      * If it becomes important to configure capabilities to their actual
2520      * size, use this as the default when it's something we don't recognize.
2521      * Since QEMU doesn't actually handle many of the config accesses,
2522      * exact size doesn't seem worthwhile.
2523      */
2524     size = vfio_std_cap_max_size(pdev, pos);
2525
2526     /*
2527      * pci_add_capability always inserts the new capability at the head
2528      * of the chain.  Therefore to end up with a chain that matches the
2529      * physical device, we insert from the end by making this recursive.
2530      * This is also why we pre-caclulate size above as cached config space
2531      * will be changed as we unwind the stack.
2532      */
2533     if (next) {
2534         ret = vfio_add_std_cap(vdev, next);
2535         if (ret) {
2536             return ret;
2537         }
2538     } else {
2539         /* Begin the rebuild, use QEMU emulated list bits */
2540         pdev->config[PCI_CAPABILITY_LIST] = 0;
2541         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
2542         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
2543     }
2544
2545     /* Use emulated next pointer to allow dropping caps */
2546     pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
2547
2548     switch (cap_id) {
2549     case PCI_CAP_ID_MSI:
2550         ret = vfio_msi_setup(vdev, pos);
2551         break;
2552     case PCI_CAP_ID_EXP:
2553         vfio_check_pcie_flr(vdev, pos);
2554         ret = vfio_setup_pcie_cap(vdev, pos, size);
2555         break;
2556     case PCI_CAP_ID_MSIX:
2557         ret = vfio_msix_setup(vdev, pos);
2558         break;
2559     case PCI_CAP_ID_PM:
2560         vfio_check_pm_reset(vdev, pos);
2561         vdev->pm_cap = pos;
2562         ret = pci_add_capability(pdev, cap_id, pos, size);
2563         break;
2564     case PCI_CAP_ID_AF:
2565         vfio_check_af_flr(vdev, pos);
2566         ret = pci_add_capability(pdev, cap_id, pos, size);
2567         break;
2568     default:
2569         ret = pci_add_capability(pdev, cap_id, pos, size);
2570         break;
2571     }
2572
2573     if (ret < 0) {
2574         error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
2575                      "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
2576                      vdev->host.bus, vdev->host.slot, vdev->host.function,
2577                      cap_id, size, pos, ret);
2578         return ret;
2579     }
2580
2581     return 0;
2582 }
2583
2584 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
2585 {
2586     PCIDevice *pdev = &vdev->pdev;
2587
2588     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2589         !pdev->config[PCI_CAPABILITY_LIST]) {
2590         return 0; /* Nothing to add */
2591     }
2592
2593     return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
2594 }
2595
2596 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2597 {
2598     PCIDevice *pdev = &vdev->pdev;
2599     uint16_t cmd;
2600
2601     vfio_disable_interrupts(vdev);
2602
2603     /* Make sure the device is in D0 */
2604     if (vdev->pm_cap) {
2605         uint16_t pmcsr;
2606         uint8_t state;
2607
2608         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2609         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2610         if (state) {
2611             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2612             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2613             /* vfio handles the necessary delay here */
2614             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2615             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2616             if (state) {
2617                 error_report("vfio: Unable to power on device, stuck in D%d",
2618                              state);
2619             }
2620         }
2621     }
2622
2623     /*
2624      * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
2625      * Also put INTx Disable in known state.
2626      */
2627     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2628     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2629              PCI_COMMAND_INTX_DISABLE);
2630     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2631 }
2632
2633 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2634 {
2635     vfio_intx_enable(vdev);
2636 }
2637
2638 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
2639                                 PCIHostDeviceAddress *host2)
2640 {
2641     return (host1->domain == host2->domain && host1->bus == host2->bus &&
2642             host1->slot == host2->slot && host1->function == host2->function);
2643 }
2644
2645 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2646 {
2647     VFIOGroup *group;
2648     struct vfio_pci_hot_reset_info *info;
2649     struct vfio_pci_dependent_device *devices;
2650     struct vfio_pci_hot_reset *reset;
2651     int32_t *fds;
2652     int ret, i, count;
2653     bool multi = false;
2654
2655     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
2656
2657     vfio_pci_pre_reset(vdev);
2658     vdev->vbasedev.needs_reset = false;
2659
2660     info = g_malloc0(sizeof(*info));
2661     info->argsz = sizeof(*info);
2662
2663     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2664     if (ret && errno != ENOSPC) {
2665         ret = -errno;
2666         if (!vdev->has_pm_reset) {
2667             error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
2668                          "no available reset mechanism.", vdev->host.domain,
2669                          vdev->host.bus, vdev->host.slot, vdev->host.function);
2670         }
2671         goto out_single;
2672     }
2673
2674     count = info->count;
2675     info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
2676     info->argsz = sizeof(*info) + (count * sizeof(*devices));
2677     devices = &info->devices[0];
2678
2679     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2680     if (ret) {
2681         ret = -errno;
2682         error_report("vfio: hot reset info failed: %m");
2683         goto out_single;
2684     }
2685
2686     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
2687
2688     /* Verify that we have all the groups required */
2689     for (i = 0; i < info->count; i++) {
2690         PCIHostDeviceAddress host;
2691         VFIOPCIDevice *tmp;
2692         VFIODevice *vbasedev_iter;
2693
2694         host.domain = devices[i].segment;
2695         host.bus = devices[i].bus;
2696         host.slot = PCI_SLOT(devices[i].devfn);
2697         host.function = PCI_FUNC(devices[i].devfn);
2698
2699         trace_vfio_pci_hot_reset_dep_devices(host.domain,
2700                 host.bus, host.slot, host.function, devices[i].group_id);
2701
2702         if (vfio_pci_host_match(&host, &vdev->host)) {
2703             continue;
2704         }
2705
2706         QLIST_FOREACH(group, &vfio_group_list, next) {
2707             if (group->groupid == devices[i].group_id) {
2708                 break;
2709             }
2710         }
2711
2712         if (!group) {
2713             if (!vdev->has_pm_reset) {
2714                 error_report("vfio: Cannot reset device %s, "
2715                              "depends on group %d which is not owned.",
2716                              vdev->vbasedev.name, devices[i].group_id);
2717             }
2718             ret = -EPERM;
2719             goto out;
2720         }
2721
2722         /* Prep dependent devices for reset and clear our marker. */
2723         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2724             if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2725                 continue;
2726             }
2727             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2728             if (vfio_pci_host_match(&host, &tmp->host)) {
2729                 if (single) {
2730                     ret = -EINVAL;
2731                     goto out_single;
2732                 }
2733                 vfio_pci_pre_reset(tmp);
2734                 tmp->vbasedev.needs_reset = false;
2735                 multi = true;
2736                 break;
2737             }
2738         }
2739     }
2740
2741     if (!single && !multi) {
2742         ret = -EINVAL;
2743         goto out_single;
2744     }
2745
2746     /* Determine how many group fds need to be passed */
2747     count = 0;
2748     QLIST_FOREACH(group, &vfio_group_list, next) {
2749         for (i = 0; i < info->count; i++) {
2750             if (group->groupid == devices[i].group_id) {
2751                 count++;
2752                 break;
2753             }
2754         }
2755     }
2756
2757     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
2758     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
2759     fds = &reset->group_fds[0];
2760
2761     /* Fill in group fds */
2762     QLIST_FOREACH(group, &vfio_group_list, next) {
2763         for (i = 0; i < info->count; i++) {
2764             if (group->groupid == devices[i].group_id) {
2765                 fds[reset->count++] = group->fd;
2766                 break;
2767             }
2768         }
2769     }
2770
2771     /* Bus reset! */
2772     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
2773     g_free(reset);
2774
2775     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
2776                                     ret ? "%m" : "Success");
2777
2778 out:
2779     /* Re-enable INTx on affected devices */
2780     for (i = 0; i < info->count; i++) {
2781         PCIHostDeviceAddress host;
2782         VFIOPCIDevice *tmp;
2783         VFIODevice *vbasedev_iter;
2784
2785         host.domain = devices[i].segment;
2786         host.bus = devices[i].bus;
2787         host.slot = PCI_SLOT(devices[i].devfn);
2788         host.function = PCI_FUNC(devices[i].devfn);
2789
2790         if (vfio_pci_host_match(&host, &vdev->host)) {
2791             continue;
2792         }
2793
2794         QLIST_FOREACH(group, &vfio_group_list, next) {
2795             if (group->groupid == devices[i].group_id) {
2796                 break;
2797             }
2798         }
2799
2800         if (!group) {
2801             break;
2802         }
2803
2804         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2805             if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2806                 continue;
2807             }
2808             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2809             if (vfio_pci_host_match(&host, &tmp->host)) {
2810                 vfio_pci_post_reset(tmp);
2811                 break;
2812             }
2813         }
2814     }
2815 out_single:
2816     vfio_pci_post_reset(vdev);
2817     g_free(info);
2818
2819     return ret;
2820 }
2821
2822 /*
2823  * We want to differentiate hot reset of mulitple in-use devices vs hot reset
2824  * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2825  * of doing hot resets when there is only a single device per bus.  The in-use
2826  * here refers to how many VFIODevices are affected.  A hot reset that affects
2827  * multiple devices, but only a single in-use device, means that we can call
2828  * it from our bus ->reset() callback since the extent is effectively a single
2829  * device.  This allows us to make use of it in the hotplug path.  When there
2830  * are multiple in-use devices, we can only trigger the hot reset during a
2831  * system reset and thus from our reset handler.  We separate _one vs _multi
2832  * here so that we don't overlap and do a double reset on the system reset
2833  * path where both our reset handler and ->reset() callback are used.  Calling
2834  * _one() will only do a hot reset for the one in-use devices case, calling
2835  * _multi() will do nothing if a _one() would have been sufficient.
2836  */
2837 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2838 {
2839     return vfio_pci_hot_reset(vdev, true);
2840 }
2841
2842 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2843 {
2844     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2845     return vfio_pci_hot_reset(vdev, false);
2846 }
2847
2848 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2849 {
2850     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2851     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2852         vbasedev->needs_reset = true;
2853     }
2854 }
2855
2856 static VFIODeviceOps vfio_pci_ops = {
2857     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2858     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2859     .vfio_eoi = vfio_intx_eoi,
2860 };
2861
2862 static int vfio_populate_device(VFIOPCIDevice *vdev)
2863 {
2864     VFIODevice *vbasedev = &vdev->vbasedev;
2865     struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
2866     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
2867     int i, ret = -1;
2868
2869     /* Sanity check device */
2870     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2871         error_report("vfio: Um, this isn't a PCI device");
2872         goto error;
2873     }
2874
2875     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2876         error_report("vfio: unexpected number of io regions %u",
2877                      vbasedev->num_regions);
2878         goto error;
2879     }
2880
2881     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2882         error_report("vfio: unexpected number of irqs %u", vbasedev->num_irqs);
2883         goto error;
2884     }
2885
2886     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2887         reg_info.index = i;
2888
2889         ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2890         if (ret) {
2891             error_report("vfio: Error getting region %d info: %m", i);
2892             goto error;
2893         }
2894
2895         trace_vfio_populate_device_region(vbasedev->name, i,
2896                                           (unsigned long)reg_info.size,
2897                                           (unsigned long)reg_info.offset,
2898                                           (unsigned long)reg_info.flags);
2899
2900         vdev->bars[i].region.vbasedev = vbasedev;
2901         vdev->bars[i].region.flags = reg_info.flags;
2902         vdev->bars[i].region.size = reg_info.size;
2903         vdev->bars[i].region.fd_offset = reg_info.offset;
2904         vdev->bars[i].region.nr = i;
2905         QLIST_INIT(&vdev->bars[i].quirks);
2906     }
2907
2908     reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
2909
2910     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2911     if (ret) {
2912         error_report("vfio: Error getting config info: %m");
2913         goto error;
2914     }
2915
2916     trace_vfio_populate_device_config(vdev->vbasedev.name,
2917                                       (unsigned long)reg_info.size,
2918                                       (unsigned long)reg_info.offset,
2919                                       (unsigned long)reg_info.flags);
2920
2921     vdev->config_size = reg_info.size;
2922     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2923         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2924     }
2925     vdev->config_offset = reg_info.offset;
2926
2927     if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
2928         vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) {
2929         struct vfio_region_info vga_info = {
2930             .argsz = sizeof(vga_info),
2931             .index = VFIO_PCI_VGA_REGION_INDEX,
2932          };
2933
2934         ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
2935         if (ret) {
2936             error_report(
2937                 "vfio: Device does not support requested feature x-vga");
2938             goto error;
2939         }
2940
2941         if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
2942             !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2943             vga_info.size < 0xbffff + 1) {
2944             error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
2945                          (unsigned long)vga_info.flags,
2946                          (unsigned long)vga_info.size);
2947             goto error;
2948         }
2949
2950         vdev->vga.fd_offset = vga_info.offset;
2951         vdev->vga.fd = vdev->vbasedev.fd;
2952
2953         vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2954         vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2955         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
2956
2957         vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2958         vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2959         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
2960
2961         vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2962         vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2963         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
2964
2965         vdev->has_vga = true;
2966     }
2967
2968     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
2969
2970     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
2971     if (ret) {
2972         /* This can fail for an old kernel or legacy PCI dev */
2973         trace_vfio_populate_device_get_irq_info_failure();
2974         ret = 0;
2975     } else if (irq_info.count == 1) {
2976         vdev->pci_aer = true;
2977     } else {
2978         error_report("vfio: %s "
2979                      "Could not enable error recovery for the device",
2980                      vbasedev->name);
2981     }
2982
2983 error:
2984     return ret;
2985 }
2986
2987 static void vfio_put_device(VFIOPCIDevice *vdev)
2988 {
2989     g_free(vdev->vbasedev.name);
2990     if (vdev->msix) {
2991         object_unparent(OBJECT(&vdev->msix->mmap_mem));
2992         g_free(vdev->msix);
2993         vdev->msix = NULL;
2994     }
2995     vfio_put_base_device(&vdev->vbasedev);
2996 }
2997
2998 static void vfio_err_notifier_handler(void *opaque)
2999 {
3000     VFIOPCIDevice *vdev = opaque;
3001
3002     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
3003         return;
3004     }
3005
3006     /*
3007      * TBD. Retrieve the error details and decide what action
3008      * needs to be taken. One of the actions could be to pass
3009      * the error to the guest and have the guest driver recover
3010      * from the error. This requires that PCIe capabilities be
3011      * exposed to the guest. For now, we just terminate the
3012      * guest to contain the error.
3013      */
3014
3015     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
3016                  "Please collect any data possible and then kill the guest",
3017                  __func__, vdev->host.domain, vdev->host.bus,
3018                  vdev->host.slot, vdev->host.function);
3019
3020     vm_stop(RUN_STATE_INTERNAL_ERROR);
3021 }
3022
3023 /*
3024  * Registers error notifier for devices supporting error recovery.
3025  * If we encounter a failure in this function, we report an error
3026  * and continue after disabling error recovery support for the
3027  * device.
3028  */
3029 static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
3030 {
3031     int ret;
3032     int argsz;
3033     struct vfio_irq_set *irq_set;
3034     int32_t *pfd;
3035
3036     if (!vdev->pci_aer) {
3037         return;
3038     }
3039
3040     if (event_notifier_init(&vdev->err_notifier, 0)) {
3041         error_report("vfio: Unable to init event notifier for error detection");
3042         vdev->pci_aer = false;
3043         return;
3044     }
3045
3046     argsz = sizeof(*irq_set) + sizeof(*pfd);
3047
3048     irq_set = g_malloc0(argsz);
3049     irq_set->argsz = argsz;
3050     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3051                      VFIO_IRQ_SET_ACTION_TRIGGER;
3052     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3053     irq_set->start = 0;
3054     irq_set->count = 1;
3055     pfd = (int32_t *)&irq_set->data;
3056
3057     *pfd = event_notifier_get_fd(&vdev->err_notifier);
3058     qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
3059
3060     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
3061     if (ret) {
3062         error_report("vfio: Failed to set up error notification");
3063         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3064         event_notifier_cleanup(&vdev->err_notifier);
3065         vdev->pci_aer = false;
3066     }
3067     g_free(irq_set);
3068 }
3069
3070 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
3071 {
3072     int argsz;
3073     struct vfio_irq_set *irq_set;
3074     int32_t *pfd;
3075     int ret;
3076
3077     if (!vdev->pci_aer) {
3078         return;
3079     }
3080
3081     argsz = sizeof(*irq_set) + sizeof(*pfd);
3082
3083     irq_set = g_malloc0(argsz);
3084     irq_set->argsz = argsz;
3085     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3086                      VFIO_IRQ_SET_ACTION_TRIGGER;
3087     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
3088     irq_set->start = 0;
3089     irq_set->count = 1;
3090     pfd = (int32_t *)&irq_set->data;
3091     *pfd = -1;
3092
3093     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
3094     if (ret) {
3095         error_report("vfio: Failed to de-assign error fd: %m");
3096     }
3097     g_free(irq_set);
3098     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
3099                         NULL, NULL, vdev);
3100     event_notifier_cleanup(&vdev->err_notifier);
3101 }
3102
3103 static void vfio_req_notifier_handler(void *opaque)
3104 {
3105     VFIOPCIDevice *vdev = opaque;
3106
3107     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
3108         return;
3109     }
3110
3111     qdev_unplug(&vdev->pdev.qdev, NULL);
3112 }
3113
3114 static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
3115 {
3116     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
3117                                       .index = VFIO_PCI_REQ_IRQ_INDEX };
3118     int argsz;
3119     struct vfio_irq_set *irq_set;
3120     int32_t *pfd;
3121
3122     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
3123         return;
3124     }
3125
3126     if (ioctl(vdev->vbasedev.fd,
3127               VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
3128         return;
3129     }
3130
3131     if (event_notifier_init(&vdev->req_notifier, 0)) {
3132         error_report("vfio: Unable to init event notifier for device request");
3133         return;
3134     }
3135
3136     argsz = sizeof(*irq_set) + sizeof(*pfd);
3137
3138     irq_set = g_malloc0(argsz);
3139     irq_set->argsz = argsz;
3140     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3141                      VFIO_IRQ_SET_ACTION_TRIGGER;
3142     irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
3143     irq_set->start = 0;
3144     irq_set->count = 1;
3145     pfd = (int32_t *)&irq_set->data;
3146
3147     *pfd = event_notifier_get_fd(&vdev->req_notifier);
3148     qemu_set_fd_handler(*pfd, vfio_req_notifier_handler, NULL, vdev);
3149
3150     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
3151         error_report("vfio: Failed to set up device request notification");
3152         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
3153         event_notifier_cleanup(&vdev->req_notifier);
3154     } else {
3155         vdev->req_enabled = true;
3156     }
3157
3158     g_free(irq_set);
3159 }
3160
3161 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
3162 {
3163     int argsz;
3164     struct vfio_irq_set *irq_set;
3165     int32_t *pfd;
3166
3167     if (!vdev->req_enabled) {
3168         return;
3169     }
3170
3171     argsz = sizeof(*irq_set) + sizeof(*pfd);
3172
3173     irq_set = g_malloc0(argsz);
3174     irq_set->argsz = argsz;
3175     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
3176                      VFIO_IRQ_SET_ACTION_TRIGGER;
3177     irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
3178     irq_set->start = 0;
3179     irq_set->count = 1;
3180     pfd = (int32_t *)&irq_set->data;
3181     *pfd = -1;
3182
3183     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
3184         error_report("vfio: Failed to de-assign device request fd: %m");
3185     }
3186     g_free(irq_set);
3187     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
3188                         NULL, NULL, vdev);
3189     event_notifier_cleanup(&vdev->req_notifier);
3190
3191     vdev->req_enabled = false;
3192 }
3193
3194 /*
3195  * AMD Radeon PCI config reset, based on Linux:
3196  *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
3197  *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
3198  *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
3199  *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
3200  * IDs: include/drm/drm_pciids.h
3201  * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
3202  *
3203  * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
3204  * hardware that should be fixed on future ASICs.  The symptom of this is that
3205  * once the accerlated driver loads, Windows guests will bsod on subsequent
3206  * attmpts to load the driver, such as after VM reset or shutdown/restart.  To
3207  * work around this, we do an AMD specific PCI config reset, followed by an SMC
3208  * reset.  The PCI config reset only works if SMC firmware is running, so we
3209  * have a dependency on the state of the device as to whether this reset will
3210  * be effective.  There are still cases where we won't be able to kick the
3211  * device into working, but this greatly improves the usability overall.  The
3212  * config reset magic is relatively common on AMD GPUs, but the setup and SMC
3213  * poking is largely ASIC specific.
3214  */
3215 static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
3216 {
3217     uint32_t clk, pc_c;
3218
3219     /*
3220      * Registers 200h and 204h are index and data registers for accessing
3221      * indirect configuration registers within the device.
3222      */
3223     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
3224     clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3225     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
3226     pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3227
3228     return (!(clk & 1) && (0x20100 <= pc_c));
3229 }
3230
3231 /*
3232  * The scope of a config reset is controlled by a mode bit in the misc register
3233  * and a fuse, exposed as a bit in another register.  The fuse is the default
3234  * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the forumula
3235  * scope = !(misc ^ fuse), where the resulting scope is defined the same as
3236  * the fuse.  A truth table therefore tells us that if misc == fuse, we need
3237  * to flip the value of the bit in the misc register.
3238  */
3239 static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
3240 {
3241     uint32_t misc, fuse;
3242     bool a, b;
3243
3244     vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
3245     fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3246     b = fuse & 64;
3247
3248     vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
3249     misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3250     a = misc & 2;
3251
3252     if (a == b) {
3253         vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
3254         vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
3255     }
3256 }
3257
3258 static int vfio_radeon_reset(VFIOPCIDevice *vdev)
3259 {
3260     PCIDevice *pdev = &vdev->pdev;
3261     int i, ret = 0;
3262     uint32_t data;
3263
3264     /* Defer to a kernel implemented reset */
3265     if (vdev->vbasedev.reset_works) {
3266         return -ENODEV;
3267     }
3268
3269     /* Enable only memory BAR access */
3270     vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
3271
3272     /* Reset only works if SMC firmware is loaded and running */
3273     if (!vfio_radeon_smc_is_running(vdev)) {
3274         ret = -EINVAL;
3275         goto out;
3276     }
3277
3278     /* Make sure only the GFX function is reset */
3279     vfio_radeon_set_gfx_only_reset(vdev);
3280
3281     /* AMD PCI config reset */
3282     vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
3283     usleep(100);
3284
3285     /* Read back the memory size to make sure we're out of reset */
3286     for (i = 0; i < 100000; i++) {
3287         if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
3288             break;
3289         }
3290         usleep(1);
3291     }
3292
3293     /* Reset SMC */
3294     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
3295     data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3296     data |= 1;
3297     vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
3298
3299     /* Disable SMC clock */
3300     vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
3301     data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
3302     data |= 1;
3303     vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
3304
3305 out:
3306     /* Restore PCI command register */
3307     vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
3308
3309     return ret;
3310 }
3311
3312 static void vfio_setup_resetfn(VFIOPCIDevice *vdev)
3313 {
3314     PCIDevice *pdev = &vdev->pdev;
3315     uint16_t vendor, device;
3316
3317     vendor = pci_get_word(pdev->config + PCI_VENDOR_ID);
3318     device = pci_get_word(pdev->config + PCI_DEVICE_ID);
3319
3320     switch (vendor) {
3321     case 0x1002:
3322         switch (device) {
3323         /* Bonaire */
3324         case 0x6649: /* Bonaire [FirePro W5100] */
3325         case 0x6650:
3326         case 0x6651:
3327         case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
3328         case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
3329         case 0x665d: /* Bonaire [Radeon R7 200 Series] */
3330         /* Hawaii */
3331         case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
3332         case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
3333         case 0x67A2:
3334         case 0x67A8:
3335         case 0x67A9:
3336         case 0x67AA:
3337         case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
3338         case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
3339         case 0x67B8:
3340         case 0x67B9:
3341         case 0x67BA:
3342         case 0x67BE:
3343             vdev->resetfn = vfio_radeon_reset;
3344             break;
3345         }
3346         break;
3347     }
3348 }
3349
3350 static int vfio_initfn(PCIDevice *pdev)
3351 {
3352     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3353     VFIODevice *vbasedev_iter;
3354     VFIOGroup *group;
3355     char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
3356     ssize_t len;
3357     struct stat st;
3358     int groupid;
3359     int ret;
3360
3361     /* Check that the host device exists */
3362     snprintf(path, sizeof(path),
3363              "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
3364              vdev->host.domain, vdev->host.bus, vdev->host.slot,
3365              vdev->host.function);
3366     if (stat(path, &st) < 0) {
3367         error_report("vfio: error: no such host device: %s", path);
3368         return -errno;
3369     }
3370
3371     vdev->vbasedev.ops = &vfio_pci_ops;
3372
3373     vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
3374     vdev->vbasedev.name = g_strdup_printf("%04x:%02x:%02x.%01x",
3375                                           vdev->host.domain, vdev->host.bus,
3376                                           vdev->host.slot, vdev->host.function);
3377
3378     strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
3379
3380     len = readlink(path, iommu_group_path, sizeof(path));
3381     if (len <= 0 || len >= sizeof(path)) {
3382         error_report("vfio: error no iommu_group for device");
3383         return len < 0 ? -errno : -ENAMETOOLONG;
3384     }
3385
3386     iommu_group_path[len] = 0;
3387     group_name = basename(iommu_group_path);
3388
3389     if (sscanf(group_name, "%d", &groupid) != 1) {
3390         error_report("vfio: error reading %s: %m", path);
3391         return -errno;
3392     }
3393
3394     trace_vfio_initfn(vdev->vbasedev.name, groupid);
3395
3396     group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
3397     if (!group) {
3398         error_report("vfio: failed to get group %d", groupid);
3399         return -ENOENT;
3400     }
3401
3402     snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
3403             vdev->host.domain, vdev->host.bus, vdev->host.slot,
3404             vdev->host.function);
3405
3406     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
3407         if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
3408             error_report("vfio: error: device %s is already attached", path);
3409             vfio_put_group(group);
3410             return -EBUSY;
3411         }
3412     }
3413
3414     ret = vfio_get_device(group, path, &vdev->vbasedev);
3415     if (ret) {
3416         error_report("vfio: failed to get device %s", path);
3417         vfio_put_group(group);
3418         return ret;
3419     }
3420
3421     ret = vfio_populate_device(vdev);
3422     if (ret) {
3423         return ret;
3424     }
3425
3426     /* Get a copy of config space */
3427     ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
3428                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
3429                 vdev->config_offset);
3430     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
3431         ret = ret < 0 ? -errno : -EFAULT;
3432         error_report("vfio: Failed to read device config space");
3433         return ret;
3434     }
3435
3436     /* vfio emulates a lot for us, but some bits need extra love */
3437     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
3438
3439     /* QEMU can choose to expose the ROM or not */
3440     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
3441
3442     /* QEMU can change multi-function devices to single function, or reverse */
3443     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
3444                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
3445
3446     /* Restore or clear multifunction, this is always controlled by QEMU */
3447     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
3448         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
3449     } else {
3450         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
3451     }
3452
3453     /*
3454      * Clear host resource mapping info.  If we choose not to register a
3455      * BAR, such as might be the case with the option ROM, we can get
3456      * confusing, unwritable, residual addresses from the host here.
3457      */
3458     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
3459     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
3460
3461     vfio_pci_size_rom(vdev);
3462
3463     ret = vfio_msix_early_setup(vdev);
3464     if (ret) {
3465         return ret;
3466     }
3467
3468     vfio_map_bars(vdev);
3469
3470     ret = vfio_add_capabilities(vdev);
3471     if (ret) {
3472         goto out_teardown;
3473     }
3474
3475     /* QEMU emulates all of MSI & MSIX */
3476     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
3477         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
3478                MSIX_CAP_LENGTH);
3479     }
3480
3481     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
3482         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
3483                vdev->msi_cap_size);
3484     }
3485
3486     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
3487         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
3488                                                   vfio_intx_mmap_enable, vdev);
3489         pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update);
3490         ret = vfio_intx_enable(vdev);
3491         if (ret) {
3492             goto out_teardown;
3493         }
3494     }
3495
3496     vfio_register_err_notifier(vdev);
3497     vfio_register_req_notifier(vdev);
3498     vfio_setup_resetfn(vdev);
3499
3500     return 0;
3501
3502 out_teardown:
3503     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3504     vfio_teardown_msi(vdev);
3505     vfio_unregister_bars(vdev);
3506     return ret;
3507 }
3508
3509 static void vfio_instance_finalize(Object *obj)
3510 {
3511     PCIDevice *pci_dev = PCI_DEVICE(obj);
3512     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev);
3513     VFIOGroup *group = vdev->vbasedev.group;
3514
3515     vfio_unmap_bars(vdev);
3516     g_free(vdev->emulated_config_bits);
3517     g_free(vdev->rom);
3518     vfio_put_device(vdev);
3519     vfio_put_group(group);
3520 }
3521
3522 static void vfio_exitfn(PCIDevice *pdev)
3523 {
3524     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3525
3526     vfio_unregister_req_notifier(vdev);
3527     vfio_unregister_err_notifier(vdev);
3528     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3529     vfio_disable_interrupts(vdev);
3530     if (vdev->intx.mmap_timer) {
3531         timer_free(vdev->intx.mmap_timer);
3532     }
3533     vfio_teardown_msi(vdev);
3534     vfio_unregister_bars(vdev);
3535 }
3536
3537 static void vfio_pci_reset(DeviceState *dev)
3538 {
3539     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
3540     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
3541
3542     trace_vfio_pci_reset(vdev->vbasedev.name);
3543
3544     vfio_pci_pre_reset(vdev);
3545
3546     if (vdev->resetfn && !vdev->resetfn(vdev)) {
3547         goto post_reset;
3548     }
3549
3550     if (vdev->vbasedev.reset_works &&
3551         (vdev->has_flr || !vdev->has_pm_reset) &&
3552         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3553         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3554         goto post_reset;
3555     }
3556
3557     /* See if we can do our own bus reset */
3558     if (!vfio_pci_hot_reset_one(vdev)) {
3559         goto post_reset;
3560     }
3561
3562     /* If nothing else works and the device supports PM reset, use it */
3563     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3564         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3565         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3566         goto post_reset;
3567     }
3568
3569 post_reset:
3570     vfio_pci_post_reset(vdev);
3571 }
3572
3573 static void vfio_instance_init(Object *obj)
3574 {
3575     PCIDevice *pci_dev = PCI_DEVICE(obj);
3576     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(obj));
3577
3578     device_add_bootindex_property(obj, &vdev->bootindex,
3579                                   "bootindex", NULL,
3580                                   &pci_dev->qdev, NULL);
3581 }
3582
3583 static Property vfio_pci_dev_properties[] = {
3584     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3585     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3586                        intx.mmap_timeout, 1100),
3587     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3588                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
3589     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3590                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
3591     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3592     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3593     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3594     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3595     /*
3596      * TODO - support passed fds... is this necessary?
3597      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
3598      * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
3599      */
3600     DEFINE_PROP_END_OF_LIST(),
3601 };
3602
3603 static const VMStateDescription vfio_pci_vmstate = {
3604     .name = "vfio-pci",
3605     .unmigratable = 1,
3606 };
3607
3608 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3609 {
3610     DeviceClass *dc = DEVICE_CLASS(klass);
3611     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3612
3613     dc->reset = vfio_pci_reset;
3614     dc->props = vfio_pci_dev_properties;
3615     dc->vmsd = &vfio_pci_vmstate;
3616     dc->desc = "VFIO-based PCI device assignment";
3617     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3618     pdc->init = vfio_initfn;
3619     pdc->exit = vfio_exitfn;
3620     pdc->config_read = vfio_pci_read_config;
3621     pdc->config_write = vfio_pci_write_config;
3622     pdc->is_express = 1; /* We might be */
3623 }
3624
3625 static const TypeInfo vfio_pci_dev_info = {
3626     .name = "vfio-pci",
3627     .parent = TYPE_PCI_DEVICE,
3628     .instance_size = sizeof(VFIOPCIDevice),
3629     .class_init = vfio_pci_dev_class_init,
3630     .instance_init = vfio_instance_init,
3631     .instance_finalize = vfio_instance_finalize,
3632 };
3633
3634 static void register_vfio_pci_dev_type(void)
3635 {
3636     type_register_static(&vfio_pci_dev_info);
3637 }
3638
3639 type_init(register_vfio_pci_dev_type)