vfio/pci: Add virtual capabilities quirk infrastructure

[mirror_qemu.git] / hw / vfio / pci.c
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c

index d7dbe0e3e04ebd73293e37490d0ace721fa97f01..bfeaaef22d00d4737c9d9db695e81b87f902ae30 100644 (file)
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -257,7 +257,7 @@ static void vfio_intx_update(PCIDevice *pdev)
  static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
  {
      uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
-    int ret, argsz;
+    int ret, argsz, retval = 0;
      struct vfio_irq_set *irq_set;
      int32_t *pfd;
      Error *err = NULL;
@@ -302,12 +302,12 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
      qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
  
      ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
-    g_free(irq_set);
      if (ret) {
          error_setg_errno(errp, -ret, "failed to setup INTx fd");
          qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
          event_notifier_cleanup(&vdev->intx.interrupt);
-        return -errno;
+        retval = -errno;
+        goto cleanup;
      }
  
      vfio_intx_enable_kvm(vdev, &err);
@@ -319,7 +319,10 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
  
      trace_vfio_intx_enable(vdev->vbasedev.name);
  
-    return 0;
+cleanup:
+    g_free(irq_set);
+
+    return retval;
  }
  
  static void vfio_intx_disable(VFIOPCIDevice *vdev)
@@ -1432,6 +1435,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
  static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
  {
      int ret;
+    Error *err = NULL;
  
      vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
                                      sizeof(unsigned long));
@@ -1439,12 +1443,15 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
                      vdev->bars[vdev->msix->table_bar].region.mem,
                      vdev->msix->table_bar, vdev->msix->table_offset,
                      vdev->bars[vdev->msix->pba_bar].region.mem,
-                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
+                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
+                    &err);
      if (ret < 0) {
          if (ret == -ENOTSUP) {
+            error_report_err(err);
              return 0;
          }
-        error_setg(errp, "msix_init failed");
+
+        error_propagate(errp, err);
          return ret;
      }
  
@@ -1739,11 +1746,26 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
      }
  
-    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
-    if (pos >= 0) {
-        vdev->pdev.exp.exp_cap = pos;
+    /*
+     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
+     * (Niantic erratum #35) causing Windows to error with a Code 10 for the
+     * device on Q35.  Fixup any such devices to report version 1.  If we
+     * were to remove the capability entirely the guest would lose extended
+     * config space.
+     */
+    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
+        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
+                               1, PCI_EXP_FLAGS_VERS);
+    }
+
+    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
+                             errp);
+    if (pos < 0) {
+        return pos;
      }
  
+    vdev->pdev.exp.exp_cap = pos;
+
      return pos;
  }
  
@@ -1804,15 +1826,23 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
      if (next) {
          ret = vfio_add_std_cap(vdev, next, errp);
          if (ret) {
-            goto out;
+            return ret;
          }
      } else {
          /* Begin the rebuild, use QEMU emulated list bits */
          pdev->config[PCI_CAPABILITY_LIST] = 0;
          vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
          vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
+
+        ret = vfio_add_virt_caps(vdev, errp);
+        if (ret) {
+            return ret;
+        }
      }
  
+    /* Scale down size, esp in case virt caps were added above */
+    size = MIN(size, vfio_std_cap_max_size(pdev, pos));
+
      /* Use emulated next pointer to allow dropping caps */
      pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);
  
@@ -1830,17 +1860,17 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
      case PCI_CAP_ID_PM:
          vfio_check_pm_reset(vdev, pos);
          vdev->pm_cap = pos;
-        ret = pci_add_capability2(pdev, cap_id, pos, size, errp);
+        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
          break;
      case PCI_CAP_ID_AF:
          vfio_check_af_flr(vdev, pos);
-        ret = pci_add_capability2(pdev, cap_id, pos, size, errp);
+        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
          break;
      default:
-        ret = pci_add_capability2(pdev, cap_id, pos, size, errp);
+        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
          break;
      }
-out:
+
      if (ret < 0) {
          error_prepend(errp,
                        "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
@@ -1876,16 +1906,26 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
      /*
       * Extended capabilities are chained with each pointing to the next, so we
       * can drop anything other than the head of the chain simply by modifying
-     * the previous next pointer.  For the head of the chain, we can modify the
-     * capability ID to something that cannot match a valid capability.  ID
-     * 0 is reserved for this since absence of capabilities is indicated by
-     * 0 for the ID, version, AND next pointer.  However, pcie_add_capability()
-     * uses ID 0 as reserved for list management and will incorrectly match and
-     * assert if we attempt to pre-load the head of the chain with with this
-     * ID.  Use ID 0xFFFF temporarily since it is also seems to be reserved in
-     * part for identifying absence of capabilities in a root complex register
-     * block.  If the ID still exists after adding capabilities, switch back to
-     * zero.  We'll mark this entire first dword as emulated for this purpose.
+     * the previous next pointer.  Seed the head of the chain here such that
+     * we can simply skip any capabilities we want to drop below, regardless
+     * of their position in the chain.  If this stub capability still exists
+     * after we add the capabilities we want to expose, update the capability
+     * ID to zero.  Note that we cannot seed with the capability header being
+     * zero as this conflicts with definition of an absent capability chain
+     * and prevents capabilities beyond the head of the list from being added.
+     * By replacing the dummy capability ID with zero after walking the device
+     * chain, we also transparently mark extended capabilities as absent if
+     * no capabilities were added.  Note that the PCIe spec defines an absence
+     * of extended capabilities to be determined by a value of zero for the
+     * capability ID, version, AND next pointer.  A non-zero next pointer
+     * should be sufficient to indicate additional capabilities are present,
+     * which will occur if we call pcie_add_capability() below.  The entire
+     * first dword is emulated to support this.
+     *
+     * NB. The kernel side does similar masking, so be prepared that our
+     * view of the device may also contain a capability ID zero in the head
+     * of the chain.  Skip it for the same reason that we cannot seed the
+     * chain with a zero capability.
       */
      pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
                   PCI_EXT_CAP(0xFFFF, 0, 0));
@@ -1911,6 +1951,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
                                     PCI_EXT_CAP_NEXT_MASK);
  
          switch (cap_id) {
+        case 0: /* kernel masked capability */
          case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
          case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
              trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
@@ -2098,7 +2139,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
  
          /* Prep dependent devices for reset and clear our marker. */
          QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
-            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+            if (!vbasedev_iter->dev->realized ||
+                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                  continue;
              }
              tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
@@ -2179,7 +2221,8 @@ out:
          }
  
          QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
-            if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+            if (!vbasedev_iter->dev->realized ||
+                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
                  continue;
              }
              tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
@@ -2502,12 +2545,16 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
  static void vfio_req_notifier_handler(void *opaque)
  {
      VFIOPCIDevice *vdev = opaque;
+    Error *err = NULL;
  
      if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
          return;
      }
  
-    qdev_unplug(&vdev->pdev.qdev, NULL);
+    qdev_unplug(&vdev->pdev.qdev, &err);
+    if (err) {
+        error_reportf_err(err, WARN_PREFIX, vdev->vbasedev.name);
+    }
  }
  
  static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
@@ -2606,8 +2653,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
          if (!(~vdev->host.domain || ~vdev->host.bus ||
                ~vdev->host.slot || ~vdev->host.function)) {
              error_setg(errp, "No provided host device");
-            error_append_hint(errp, "Use -vfio-pci,host=DDDD:BB:DD.F "
-                              "or -vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
+            error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
+                              "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
              return;
          }
          vdev->vbasedev.sysfsdev =
@@ -2625,6 +2672,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
      vdev->vbasedev.name = g_strdup(basename(vdev->vbasedev.sysfsdev));
      vdev->vbasedev.ops = &vfio_pci_ops;
      vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
+    vdev->vbasedev.dev = &vdev->pdev.qdev;
  
      tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
      len = readlink(tmp, group_path, sizeof(group_path));