#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "sysemu/dma.h"
+#include "sysemu/tcg.h"
#include "trace.h"
/* enabled until disconnected backend stabilizes */
}
    /* Data must be read atomically. We don't really need barrier semantics
     * but it's easier to use qatomic_* than to roll our own. */
- log = atomic_xchg(from, 0);
+ log = qatomic_xchg(from, 0);
while (log) {
int bit = ctzl(log);
hwaddr page_addr;
reg->memory_size);
log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
}
- for (i = 0; i < dev->nvqs; ++i) {
- struct vhost_virtqueue *vq = dev->vqs + i;
-
- if (!vq->used_phys && !vq->used_size) {
- continue;
- }
-
- uint64_t last = vq->used_phys + vq->used_size - 1;
- log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
- }
return log_size;
}
{
VirtIODevice *vdev = dev->vdev;
- return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
+    /*
+     * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports
+     * the incremental memory mapping (IOTLB) API. On platforms that
+     * do not have an IOMMU there is no need to enable the feature,
+     * which would only cause unnecessary IOTLB miss/update
+     * transactions.
+     */
+ return vdev->dma_as != &address_space_memory &&
+ virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
}
static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
- hwaddr *plen, int is_write)
+ hwaddr *plen, bool is_write)
{
if (!vhost_dev_has_iommu(dev)) {
return cpu_physical_memory_map(addr, plen, is_write);
return r;
}
+/*
+ * vhost_section: identify sections needed for vhost access
+ *
+ * We only care about RAM sections here (where the virtqueue and guest
+ * internals accessed by virtio might live). If we find one, we still
+ * allow the backend to potentially filter it out of our list.
+ */
static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
{
- bool result;
- bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &
- ~(1 << DIRTY_MEMORY_MIGRATION);
- result = memory_region_is_ram(section->mr) &&
- !memory_region_is_rom(section->mr);
+ MemoryRegion *mr = section->mr;
+
+ if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
+ uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
+ uint8_t handled_dirty;
+
+        /*
+         * Kernel-based vhost doesn't handle any block which is doing
+         * dirty-tracking other than migration, for which it has
+         * specific logging support. For TCG the kernel never gets
+         * involved anyway, so we can also ignore its self-modifying
+         * code detection flags. However, a vhost-user client could
+         * still confuse a TCG guest if it re-writes executable memory
+         * that has already been translated.
+         */
+ handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
+ (1 << DIRTY_MEMORY_CODE);
- /* Vhost doesn't handle any block which is doing dirty-tracking other
- * than migration; this typically fires on VGA areas.
- */
- result &= !log_dirty;
+ if (dirty_mask & ~handled_dirty) {
+ trace_vhost_reject_section(mr->name, 1);
+ return false;
+ }
- if (result && dev->vhost_ops->vhost_backend_mem_section_filter) {
- result &=
- dev->vhost_ops->vhost_backend_mem_section_filter(dev, section);
- }
+ if (dev->vhost_ops->vhost_backend_mem_section_filter &&
+ !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
+ trace_vhost_reject_section(mr->name, 2);
+ return false;
+ }
- trace_vhost_section(section->mr->name, result);
- return result;
+ trace_vhost_section(mr->name);
+ return true;
+ } else {
+ trace_vhost_reject_section(mr->name, 3);
+ return false;
+ }
}
static void vhost_begin(MemoryListener *listener)
uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
section->offset_within_region;
RAMBlock *mrs_rb = section->mr->ram_block;
- size_t mrs_page = qemu_ram_pagesize(mrs_rb);
trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
mrs_host);
- /* Round the section to it's page size */
- /* First align the start down to a page boundary */
- uint64_t alignage = mrs_host & (mrs_page - 1);
- if (alignage) {
- mrs_host -= alignage;
- mrs_size += alignage;
- mrs_gpa -= alignage;
- }
- /* Now align the size up to a page boundary */
- alignage = mrs_size & (mrs_page - 1);
- if (alignage) {
- mrs_size += mrs_page - alignage;
+ if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
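+        /*
+         * Only vhost-user needs this: regions are passed to the
+         * backend as mmap()able file descriptors, so they are
+         * expanded here to host page boundaries; kernel backends
+         * work with the unaligned layout as-is.
+         */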
+        /* Round the section to its page size */
+ /* First align the start down to a page boundary */
+ size_t mrs_page = qemu_ram_pagesize(mrs_rb);
+ uint64_t alignage = mrs_host & (mrs_page - 1);
+ if (alignage) {
+ mrs_host -= alignage;
+ mrs_size += alignage;
+ mrs_gpa -= alignage;
+ }
+ /* Now align the size up to a page boundary */
+ alignage = mrs_size & (mrs_page - 1);
+ if (alignage) {
+ mrs_size += mrs_page - alignage;
+ }
+ trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
+ mrs_size, mrs_host);
}
- trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size,
- mrs_host);
if (dev->n_tmp_sections) {
        /* Since we already have at least one section, let's see if
         * the new one lines up with the previous one in the same
         * RAMBlock, and merge them if it does.
         */
if (mrs_gpa < prev_gpa_start) {
- error_report("%s:Section rounded to %"PRIx64
- " prior to previous %"PRIx64,
- __func__, mrs_gpa, prev_gpa_start);
+ error_report("%s:Section '%s' rounded to %"PRIx64
+ " prior to previous '%s' %"PRIx64,
+ __func__, section->mr->name, mrs_gpa,
+ prev_sec->mr->name, prev_gpa_start);
/* A way to cleanly fail here would be better */
return;
}
iommu->iommu_offset = section->offset_within_address_space -
section->offset_within_region;
iommu->hdev = dev;
- memory_region_register_iommu_notifier(section->mr, &iommu->n);
+ memory_region_register_iommu_notifier(section->mr, &iommu->n,
+ &error_fatal);
QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
/* TODO: can replay help performance here? */
}
struct vhost_virtqueue *vq,
unsigned idx, bool enable_log)
{
- struct vhost_vring_addr addr = {
- .index = idx,
- .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
- .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
- .used_user_addr = (uint64_t)(unsigned long)vq->used,
- .log_guest_addr = vq->used_phys,
- .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
- };
- int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
+ struct vhost_vring_addr addr;
+ int r;
+ memset(&addr, 0, sizeof(struct vhost_vring_addr));
+
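+    /*
+     * A backend can supply its own vring addresses (vhost-vdpa, for
+     * example, uses guest physical addresses); otherwise fall back
+     * to QEMU's userspace mappings of the rings.
+     */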
+ if (dev->vhost_ops->vhost_vq_get_addr) {
+ r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
+ if (r < 0) {
+ VHOST_OPS_DEBUG("vhost_vq_get_addr failed");
+ return -errno;
+ }
+ } else {
+ addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
+ addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
+ addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
+ }
+ addr.index = idx;
+ addr.log_guest_addr = vq->used_phys;
+ addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
+ r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
if (r < 0) {
VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
return -errno;
if (enable_log) {
features |= 0x1ULL << VHOST_F_LOG_ALL;
}
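+    /*
+     * Only expose VIRTIO_F_IOMMU_PLATFORM to the backend when the
+     * device actually sits behind an IOMMU (see vhost_dev_has_iommu()),
+     * to avoid needless IOTLB traffic.
+     */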
+ if (!vhost_dev_has_iommu(dev)) {
+ features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
+ }
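+    /*
+     * Conversely, a backend that always relies on the IOTLB API
+     * (vhost-vdpa, for instance) may force the feature back on.
+     */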
+    if (dev->vhost_ops->vhost_force_iommu &&
+        dev->vhost_ops->vhost_force_iommu(dev)) {
+        features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
+    }
r = dev->vhost_ops->vhost_set_features(dev, features);
if (r < 0) {
VHOST_OPS_DEBUG("vhost_set_features failed");
+ goto out;
+ }
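+    /*
+     * Give the backend a chance to negotiate additional protocol
+     * capabilities (IOTLB-related features, for instance) once the
+     * virtio features are set.
+     */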
+ if (dev->vhost_ops->vhost_set_backend_cap) {
+ r = dev->vhost_ops->vhost_set_backend_cap(dev);
+ if (r < 0) {
+ VHOST_OPS_DEBUG("vhost_set_backend_cap failed");
+ goto out;
+ }
}
+
+out:
return r < 0 ? -errno : 0;
}
static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
int r, i, idx;
+ hwaddr addr;
+
r = vhost_dev_set_features(dev, enable_log);
if (r < 0) {
goto err_features;
}
for (i = 0; i < dev->nvqs; ++i) {
idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
+ addr = virtio_queue_get_desc_addr(dev->vdev, idx);
+ if (!addr) {
+            /*
+             * The queue might not be ready to start yet. If that is
+             * the case there is no reason to continue; the same logic
+             * is used by the vhost_virtqueue_start() routine.
+             */
+ continue;
+ }
r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
enable_log);
if (r < 0) {
return r;
}
-static int vhost_migration_log(MemoryListener *listener, int enable)
+static int vhost_migration_log(MemoryListener *listener, bool enable)
{
struct vhost_dev *dev = container_of(listener, struct vhost_dev,
memory_listener);
int r;
- if (!!enable == dev->log_enabled) {
+ if (enable == dev->log_enabled) {
return 0;
}
if (!dev->started) {
dev->log_enabled = enable;
return 0;
}
+
+ r = 0;
if (!enable) {
r = vhost_dev_set_log(dev, false);
if (r < 0) {
- return r;
+ goto check_dev_state;
}
vhost_log_put(dev, false);
} else {
vhost_dev_log_resize(dev, vhost_get_log_size(dev));
r = vhost_dev_set_log(dev, true);
if (r < 0) {
- return r;
+ goto check_dev_state;
}
}
+
+check_dev_state:
dev->log_enabled = enable;
- return 0;
+    /*
+     * vhost-user-* devices can change their state (e.g. disconnect)
+     * while the log is being initialized, so re-check the device
+     * state after communicating with the backend.
+     */
+ if (!dev->started) {
+        /*
+         * The device has been stopped, which is fine as far as
+         * migration is concerned; return success.
+         */
+ r = 0;
+ }
+ if (r) {
+        /* An error occurred. */
+ dev->log_enabled = false;
+ }
+
+ return r;
}
static void vhost_log_global_start(MemoryListener *listener)
uint64_t uaddr, len;
int ret = -EFAULT;
- rcu_read_lock();
+ RCU_READ_LOCK_GUARD();
trace_vhost_iotlb_miss(dev, 1);
trace_vhost_iotlb_miss(dev, 2);
out:
- rcu_read_unlock();
-
return ret;
}
vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
vq->desc_phys = a;
- vq->desc = vhost_memory_map(dev, a, &l, 0);
+ vq->desc = vhost_memory_map(dev, a, &l, false);
if (!vq->desc || l != s) {
r = -ENOMEM;
goto fail_alloc_desc;
}
vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
- vq->avail = vhost_memory_map(dev, a, &l, 0);
+ vq->avail = vhost_memory_map(dev, a, &l, false);
if (!vq->avail || l != s) {
r = -ENOMEM;
goto fail_alloc_avail;
}
vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
- vq->used = vhost_memory_map(dev, a, &l, 1);
+ vq->used = vhost_memory_map(dev, a, &l, true);
if (!vq->used || l != s) {
r = -ENOMEM;
goto fail_alloc_used;
void vhost_dev_free_inflight(struct vhost_inflight *inflight)
{
- if (inflight->addr) {
+ if (inflight && inflight->addr) {
qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
inflight->addr = NULL;
inflight->fd = -1;
return 0;
}
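+/*
+ * vhost_dev_prepare_inflight: negotiate features with the backend
+ * before the inflight region is fetched or restored, as its layout
+ * may depend on them; a no-op for backends without inflight support.
+ */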
+int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+ int r;
+
+ if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
+ hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
+ return 0;
+ }
+
+ hdev->vdev = vdev;
+
+ r = vhost_dev_set_features(hdev, hdev->log_enabled);
+ if (r < 0) {
+ VHOST_OPS_DEBUG("vhost_dev_prepare_inflight failed");
+ return r;
+ }
+
+ return 0;
+}
+
int vhost_dev_set_inflight(struct vhost_dev *dev,
struct vhost_inflight *inflight)
{
goto fail_log;
}
}
-
- if (vhost_dev_has_iommu(hdev)) {
- hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
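+    /* Let backends that implement it know the device is starting. */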
+ if (hdev->vhost_ops->vhost_dev_start) {
+ r = hdev->vhost_ops->vhost_dev_start(hdev, true);
+ if (r) {
+ goto fail_log;
+ }
+ }
+ if (vhost_dev_has_iommu(hdev) &&
+ hdev->vhost_ops->vhost_set_iotlb_callback) {
+ hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
    /* Update used ring information for IOTLB to work correctly,
     * vhost-kernel code requires this. */
/* should only be called after backend is connected */
assert(hdev->vhost_ops);
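+    /* Let backends that implement it know the device is stopping. */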
+ if (hdev->vhost_ops->vhost_dev_start) {
+ hdev->vhost_ops->vhost_dev_start(hdev, false);
+ }
for (i = 0; i < hdev->nvqs; ++i) {
vhost_virtqueue_stop(hdev,
vdev,
}
if (vhost_dev_has_iommu(hdev)) {
- hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
+ if (hdev->vhost_ops->vhost_set_iotlb_callback) {
+ hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
+ }
memory_listener_unregister(&hdev->iommu_listener);
}
vhost_log_put(hdev, true);