]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/spdk/lib/vhost/vhost.c
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / lib / vhost / vhost.c
index 7ccf19d32ed811a444f4415ede2cd87eccc30e44..b904d8bf907067a807ae239cb206001054c00a4d 100644 (file)
 #include "spdk/likely.h"
 #include "spdk/string.h"
 #include "spdk/util.h"
+#include "spdk/memory.h"
 #include "spdk/barrier.h"
 #include "spdk/vhost.h"
 #include "vhost_internal.h"
 
-#include "spdk_internal/memory.h"
-
-struct vhost_poll_group {
-       struct spdk_thread *thread;
-       TAILQ_ENTRY(vhost_poll_group) tailq;
-};
-
-static TAILQ_HEAD(, vhost_poll_group) g_poll_groups = TAILQ_HEAD_INITIALIZER(g_poll_groups);
-
-static uint32_t *g_num_ctrlrs;
+static struct spdk_cpuset g_vhost_core_mask;
 
 /* Path to folder where character device will be created. Can be set by user. */
 static char dev_dirname[PATH_MAX] = "";
 
-static struct spdk_thread *g_fini_thread;
+/* Thread performing all vhost management operations */
+static struct spdk_thread *g_vhost_init_thread;
+
 static spdk_vhost_fini_cb g_fini_cpl_cb;
 
-struct spdk_vhost_session_fn_ctx {
+/**
+ * DPDK calls our callbacks synchronously but the work those callbacks
+ * perform needs to be async. Luckily, all DPDK callbacks are called on
+ * a DPDK-internal pthread, so we'll just wait on a semaphore in there.
+ */
+static sem_t g_dpdk_sem;
+
+/** Return code for the current DPDK callback */
+static int g_dpdk_response;
+
+struct vhost_session_fn_ctx {
        /** Device pointer obtained before enqueuing the event */
        struct spdk_vhost_dev *vdev;
 
        /** ID of the session to send event to. */
        uint32_t vsession_id;
 
-       /** User callback function to be executed on given lcore. */
+       /** User provided function to be executed on session's thread. */
        spdk_vhost_session_fn cb_fn;
 
-       /** Semaphore used to signal that event is done. */
-       sem_t sem;
-
-       /** Response to be written by enqueued event. */
-       int response;
+       /**
+        * User provided function to be called on the init thread
+        * after iterating through all sessions.
+        */
+       spdk_vhost_dev_fn cpl_fn;
 
        /** Custom user context */
        void *user_ctx;
 };
 
-static int new_connection(int vid);
-static int start_device(int vid);
-static void stop_device(int vid);
-static void destroy_connection(int vid);
+static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER(
+                       g_vhost_devices);
+static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
-static int get_config(int vid, uint8_t *config, uint32_t len);
-static int set_config(int vid, uint8_t *config, uint32_t offset,
-                     uint32_t size, uint32_t flags);
-#endif
-
-const struct vhost_device_ops g_spdk_vhost_ops = {
-       .new_device =  start_device,
-       .destroy_device = stop_device,
-       .new_connection = new_connection,
-       .destroy_connection = destroy_connection,
-#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
-       .get_config = get_config,
-       .set_config = set_config,
-       .vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough,
-       .vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call,
-       .vhost_nvme_get_cap = spdk_vhost_nvme_get_cap,
-       .vhost_nvme_set_bar_mr = spdk_vhost_nvme_set_bar_mr,
-#endif
-};
-
-static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER(
-                       g_spdk_vhost_devices);
-static pthread_mutex_t g_spdk_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-void *spdk_vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
+void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
 {
        void *vva;
        uint64_t newlen;
@@ -124,62 +102,65 @@ void *spdk_vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr,
 }
 
 static void
-spdk_vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
-                       uint16_t req_id)
+vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
+                  uint16_t req_id)
 {
        struct vring_desc *desc, *desc_table;
        uint32_t desc_table_size;
        int rc;
 
-       if (spdk_likely(!spdk_vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+       if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
                return;
        }
 
-       rc = spdk_vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
+       rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
        if (spdk_unlikely(rc != 0)) {
                SPDK_ERRLOG("Can't log used ring descriptors!\n");
                return;
        }
 
        do {
-               if (spdk_vhost_vring_desc_is_wr(desc)) {
+               if (vhost_vring_desc_is_wr(desc)) {
                        /* To be honest, only pages realy touched should be logged, but
                         * doing so would require tracking those changes in each backed.
                         * Also backend most likely will touch all/most of those pages so
                         * for lets assume we touched all pages passed to as writeable buffers. */
                        rte_vhost_log_write(vsession->vid, desc->addr, desc->len);
                }
-               spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+               vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
        } while (desc);
 }
 
 static void
-spdk_vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
-                              struct spdk_vhost_virtqueue *virtqueue,
-                              uint16_t idx)
+vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
+                         struct spdk_vhost_virtqueue *virtqueue,
+                         uint16_t idx)
 {
        uint64_t offset, len;
-       uint16_t vq_idx;
 
-       if (spdk_likely(!spdk_vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+       if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
                return;
        }
 
-       offset = offsetof(struct vring_used, ring[idx]);
-       len = sizeof(virtqueue->vring.used->ring[idx]);
-       vq_idx = virtqueue - vsession->virtqueue;
+       if (spdk_unlikely(virtqueue->packed.packed_ring)) {
+               offset = idx * sizeof(struct vring_packed_desc);
+               len = sizeof(struct vring_packed_desc);
+       } else {
+               offset = offsetof(struct vring_used, ring[idx]);
+               len = sizeof(virtqueue->vring.used->ring[idx]);
+       }
 
-       rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
+       rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, offset, len);
 }
 
 static void
-spdk_vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
-                             struct spdk_vhost_virtqueue *virtqueue)
+vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
+                        struct spdk_vhost_virtqueue *virtqueue)
 {
        uint64_t offset, len;
        uint16_t vq_idx;
 
-       if (spdk_likely(!spdk_vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+       if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
                return;
        }
 
@@ -194,8 +175,8 @@ spdk_vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
  * Get available requests from avail ring.
  */
 uint16_t
-spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
-                            uint16_t reqs_len)
+vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
+                       uint16_t reqs_len)
 {
        struct rte_vhost_vring *vring = &virtqueue->vring;
        struct vring_avail *avail = vring->avail;
@@ -229,15 +210,21 @@ spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *r
 }
 
 static bool
-spdk_vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
+vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
 {
        return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
 }
 
+static bool
+vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc)
+{
+       return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0;
+}
+
 int
-spdk_vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
-                      uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
-                      uint32_t *desc_table_size)
+vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
+                 uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
+                 uint32_t *desc_table_size)
 {
        if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
                return -1;
@@ -245,10 +232,10 @@ spdk_vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_vi
 
        *desc = &virtqueue->vring.desc[req_idx];
 
-       if (spdk_vhost_vring_desc_is_indirect(*desc)) {
+       if (vhost_vring_desc_is_indirect(*desc)) {
                *desc_table_size = (*desc)->len / sizeof(**desc);
-               *desc_table = spdk_vhost_gpa_to_vva(vsession, (*desc)->addr,
-                                                   sizeof(**desc) * *desc_table_size);
+               *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
+                                              sizeof(**desc) * *desc_table_size);
                *desc = *desc_table;
                if (*desc == NULL) {
                        return -1;
@@ -264,8 +251,37 @@ spdk_vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_vi
 }
 
 int
-spdk_vhost_vq_used_signal(struct spdk_vhost_session *vsession,
-                         struct spdk_vhost_virtqueue *virtqueue)
+vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession,
+                        struct spdk_vhost_virtqueue *virtqueue,
+                        uint16_t req_idx, struct vring_packed_desc **desc,
+                        struct vring_packed_desc **desc_table, uint32_t *desc_table_size)
+{
+       *desc =  &virtqueue->vring.desc_packed[req_idx];
+
+       /* In a packed ring, when the desc is non-indirect we get the next
+        * desc by checking (desc->flags & VRING_DESC_F_NEXT) != 0. When the
+        * desc is indirect we get the next desc by idx and desc_table_size.
+        * This differs from the split ring.
+        */
+       if (vhost_vring_packed_desc_is_indirect(*desc)) {
+               *desc_table_size = (*desc)->len / sizeof(struct vring_packed_desc);
+               *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
+                                              (*desc)->len);
+               *desc = *desc_table;
+               if (spdk_unlikely(*desc == NULL)) {
+                       return -1;
+               }
+       } else {
+               *desc_table = NULL;
+               *desc_table_size  = 0;
+       }
+
+       return 0;
+}
+
+int
+vhost_vq_used_signal(struct spdk_vhost_session *vsession,
+                    struct spdk_vhost_virtqueue *virtqueue)
 {
        if (virtqueue->used_req_cnt == 0) {
                return 0;
@@ -319,8 +335,24 @@ check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now)
        }
 }
 
+static inline bool
+vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq)
+{
+       if (spdk_unlikely(vq->packed.packed_ring)) {
+               if (vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) {
+                       return true;
+               }
+       } else {
+               if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
 void
-spdk_vhost_session_used_signal(struct spdk_vhost_session *vsession)
+vhost_session_used_signal(struct spdk_vhost_session *vsession)
 {
        struct spdk_vhost_virtqueue *virtqueue;
        uint64_t now;
@@ -330,12 +362,15 @@ spdk_vhost_session_used_signal(struct spdk_vhost_session *vsession)
                for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
                        virtqueue = &vsession->virtqueue[q_idx];
 
-                       if (virtqueue->vring.desc == NULL ||
-                           (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+                       if (virtqueue->vring.desc == NULL) {
+                               continue;
+                       }
+
+                       if (vhost_vq_event_is_suppressed(virtqueue)) {
                                continue;
                        }
 
-                       spdk_vhost_vq_used_signal(vsession, virtqueue);
+                       vhost_vq_used_signal(vsession, virtqueue);
                }
        } else {
                now = spdk_get_ticks();
@@ -345,12 +380,15 @@ spdk_vhost_session_used_signal(struct spdk_vhost_session *vsession)
                        virtqueue = &vsession->virtqueue[q_idx];
 
                        /* No need for event right now */
-                       if (now < virtqueue->next_event_time ||
-                           (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+                       if (now < virtqueue->next_event_time) {
                                continue;
                        }
 
-                       if (!spdk_vhost_vq_used_signal(vsession, virtqueue)) {
+                       if (vhost_vq_event_is_suppressed(virtqueue)) {
+                               continue;
+                       }
+
+                       if (!vhost_vq_used_signal(vsession, virtqueue)) {
                                continue;
                        }
 
@@ -362,14 +400,9 @@ spdk_vhost_session_used_signal(struct spdk_vhost_session *vsession)
 }
 
 static int
-spdk_vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
-                                 struct spdk_vhost_session *vsession, void *ctx)
+vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
+                            struct spdk_vhost_session *vsession, void *ctx)
 {
-       if (vdev == NULL || vsession == NULL) {
-               /* nothing to do */
-               return 0;
-       }
-
        vsession->coalescing_delay_time_base =
                vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL;
        vsession->coalescing_io_rate_threshold =
@@ -377,9 +410,9 @@ spdk_vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
        return 0;
 }
 
-int
-spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
-                         uint32_t iops_threshold)
+static int
+vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
+                        uint32_t iops_threshold)
 {
        uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
        uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
@@ -395,8 +428,21 @@ spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
 
        vdev->coalescing_delay_us = delay_base_us;
        vdev->coalescing_iops_threshold = iops_threshold;
+       return 0;
+}
 
-       spdk_vhost_dev_foreach_session(vdev, spdk_vhost_session_set_coalescing, NULL);
+int
+spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
+                         uint32_t iops_threshold)
+{
+       int rc;
+
+       rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold);
+       if (rc != 0) {
+               return rc;
+       }
+
+       vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL);
        return 0;
 }
 
@@ -417,19 +463,20 @@ spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
  * Enqueue id and len to used ring.
  */
 void
-spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
-                               struct spdk_vhost_virtqueue *virtqueue,
-                               uint16_t id, uint32_t len)
+vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
+                          struct spdk_vhost_virtqueue *virtqueue,
+                          uint16_t id, uint32_t len)
 {
        struct rte_vhost_vring *vring = &virtqueue->vring;
        struct vring_used *used = vring->used;
        uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1);
+       uint16_t vq_idx = virtqueue->vring_idx;
 
        SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
                      "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
                      virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len);
 
-       spdk_vhost_log_req_desc(vsession, virtqueue, id);
+       vhost_log_req_desc(vsession, virtqueue, id);
 
        virtqueue->last_used_idx++;
        used->ring[last_idx].id = id;
@@ -438,56 +485,141 @@ spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
        /* Ensure the used ring is updated before we log it or increment used->idx. */
        spdk_smp_wmb();
 
-       spdk_vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
+       rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id);
+
+       vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
        * (volatile uint16_t *) &used->idx = virtqueue->last_used_idx;
-       spdk_vhost_log_used_vring_idx(vsession, virtqueue);
+       vhost_log_used_vring_idx(vsession, virtqueue);
 
-       /* Ensure all our used ring changes are visible to the guest at the time
-        * of interrupt.
-        * TODO: this is currently an sfence on x86. For other architectures we
-        * will most likely need an smp_mb(), but smp_mb() is an overkill for x86.
-        */
-       spdk_wmb();
+       rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id);
 
        virtqueue->used_req_cnt++;
 }
 
-int
-spdk_vhost_vring_desc_get_next(struct vring_desc **desc,
-                              struct vring_desc *desc_table, uint32_t desc_table_size)
+void
+vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
+                            struct spdk_vhost_virtqueue *virtqueue,
+                            uint16_t num_descs, uint16_t buffer_id,
+                            uint32_t length)
 {
-       struct vring_desc *old_desc = *desc;
-       uint16_t next_idx;
+       struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx];
+       bool used, avail;
 
-       if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
-               *desc = NULL;
-               return 0;
+       SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+                     "Queue %td - RING: buffer_id=%"PRIu16"\n",
+                     virtqueue - vsession->virtqueue, buffer_id);
+
+       /* When the descriptor is used, its avail flag and used flag
+        * are set equal to each other, and the used flag value
+        * matches the used_wrap_counter.
+        */
+       used = !!(desc->flags & VRING_DESC_F_USED);
+       avail = !!(desc->flags & VRING_DESC_F_AVAIL);
+       if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) {
+               SPDK_ERRLOG("descriptor has been used before\n");
+               return;
        }
 
-       next_idx = old_desc->next;
-       if (spdk_unlikely(next_idx >= desc_table_size)) {
-               *desc = NULL;
-               return -1;
+       /* In used desc addr is unused and len specifies the buffer length
+        * that has been written to by the device.
+        */
+       desc->addr = 0;
+       desc->len = length;
+
+       /* This bit specifies whether any data has been written by the device */
+       if (length != 0) {
+               desc->flags |= VRING_DESC_F_WRITE;
        }
 
-       *desc = &desc_table[next_idx];
-       return 0;
+       /* Buffer ID is included in the last descriptor in the list.
+        * The driver needs to keep track of the size of the list corresponding
+        * to each buffer ID.
+        */
+       desc->id = buffer_id;
+
+       /* A device MUST NOT make the descriptor used before buffer_id is
+        * written to the descriptor.
+        */
+       spdk_smp_wmb();
+       /* To mark a desc as used, the device sets the F_USED bit in flags to match
+        * the internal Device ring wrap counter. It also sets the F_AVAIL bit to
+        * match the same value.
+        */
+       if (virtqueue->packed.used_phase) {
+               desc->flags |= VRING_DESC_F_AVAIL_USED;
+       } else {
+               desc->flags &= ~VRING_DESC_F_AVAIL_USED;
+       }
+
+       vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx);
+       virtqueue->last_used_idx += num_descs;
+       if (virtqueue->last_used_idx >= virtqueue->vring.size) {
+               virtqueue->last_used_idx -= virtqueue->vring.size;
+               virtqueue->packed.used_phase = !virtqueue->packed.used_phase;
+       }
+
+       virtqueue->used_req_cnt++;
+}
+
+bool
+vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue)
+{
+       uint16_t flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags;
+
+       /* To mark a desc as available, the driver sets the F_AVAIL bit in flags
+        * to match the internal avail wrap counter. It also sets the F_USED bit to
+        * match the inverse value but it's not mandatory.
+        */
+       return (!!(flags & VRING_DESC_F_AVAIL) == virtqueue->packed.avail_phase);
 }
 
 bool
-spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
+vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc)
 {
-       return !!(cur_desc->flags & VRING_DESC_F_WRITE);
+       return (cur_desc->flags & VRING_DESC_F_WRITE) != 0;
 }
 
 int
-spdk_vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
-                            uint16_t *iov_index, const struct vring_desc *desc)
+vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx,
+                                struct spdk_vhost_virtqueue *vq,
+                                struct vring_packed_desc *desc_table,
+                                uint32_t desc_table_size)
+{
+       if (desc_table != NULL) {
+               /* A non-NULL desc_table means the chain is indirect and we get
+                * the next desc by req_idx and desc_table_size. A NULL return
+                * value means we reached the last desc of this request.
+                */
+               (*req_idx)++;
+               if (*req_idx < desc_table_size) {
+                       *desc = &desc_table[*req_idx];
+               } else {
+                       *desc = NULL;
+               }
+       } else {
+               /* A NULL desc_table means the chain is non-indirect and we get
+                * the next desc by req_idx and F_NEXT in flags. A NULL return
+                * value means we reached the last desc of this request. When a
+                * new desc is returned, req_idx is updated as well.
+                */
+               if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) {
+                       *desc = NULL;
+                       return 0;
+               }
+
+               *req_idx = (*req_idx + 1) % vq->vring.size;
+               *desc = &vq->vring.desc_packed[*req_idx];
+       }
+
+       return 0;
+}
+
+static int
+vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+                               uint16_t *iov_index, uintptr_t payload, uint64_t remaining)
 {
-       uint64_t len;
-       uint64_t remaining = desc->len;
-       uintptr_t payload = desc->addr;
        uintptr_t vva;
+       uint64_t len;
 
        do {
                if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
@@ -510,8 +642,80 @@ spdk_vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *
        return 0;
 }
 
+int
+vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+                              uint16_t *iov_index, const struct vring_packed_desc *desc)
+{
+       return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
+                                              desc->addr, desc->len);
+}
+
+/* 1, Traverse the desc chain to get the buffer_id and return buffer_id as task_idx.
+ * 2, Update the vq->last_avail_idx to point next available desc chain.
+ * 3, Update the avail_wrap_counter if last_avail_idx overturn.
+ */
+uint16_t
+vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
+                                     uint16_t *num_descs)
+{
+       struct vring_packed_desc *desc;
+       uint16_t desc_head = req_idx;
+
+       *num_descs = 1;
+
+       desc =  &vq->vring.desc_packed[req_idx];
+       if (!vhost_vring_packed_desc_is_indirect(desc)) {
+               while ((desc->flags & VRING_DESC_F_NEXT) != 0) {
+                       req_idx = (req_idx + 1) % vq->vring.size;
+                       desc = &vq->vring.desc_packed[req_idx];
+                       (*num_descs)++;
+               }
+       }
+
+       /* The queue size doesn't have to be a power of 2.
+        * The device maintains last_avail_idx so we can make sure
+        * the value stays valid (0 ~ vring.size - 1).
+        */
+       vq->last_avail_idx = (req_idx + 1) % vq->vring.size;
+       if (vq->last_avail_idx < desc_head) {
+               vq->packed.avail_phase = !vq->packed.avail_phase;
+       }
+
+       return desc->id;
+}
+
+int
+vhost_vring_desc_get_next(struct vring_desc **desc,
+                         struct vring_desc *desc_table, uint32_t desc_table_size)
+{
+       struct vring_desc *old_desc = *desc;
+       uint16_t next_idx;
+
+       if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
+               *desc = NULL;
+               return 0;
+       }
+
+       next_idx = old_desc->next;
+       if (spdk_unlikely(next_idx >= desc_table_size)) {
+               *desc = NULL;
+               return -1;
+       }
+
+       *desc = &desc_table[next_idx];
+       return 0;
+}
+
+int
+vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+                       uint16_t *iov_index, const struct vring_desc *desc)
+{
+       return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
+                                              desc->addr, desc->len);
+}
+
 static struct spdk_vhost_session *
-spdk_vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
+vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
 {
        struct spdk_vhost_session *vsession;
 
@@ -525,12 +729,12 @@ spdk_vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
 }
 
 struct spdk_vhost_session *
-spdk_vhost_session_find_by_vid(int vid)
+vhost_session_find_by_vid(int vid)
 {
        struct spdk_vhost_dev *vdev;
        struct spdk_vhost_session *vsession;
 
-       TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
+       TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
                TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
                        if (vsession->vid == vid) {
                                return vsession;
@@ -541,79 +745,11 @@ spdk_vhost_session_find_by_vid(int vid)
        return NULL;
 }
 
-#define SHIFT_2MB      21
-#define SIZE_2MB       (1ULL << SHIFT_2MB)
-#define FLOOR_2MB(x)   (((uintptr_t)x) / SIZE_2MB) << SHIFT_2MB
-#define CEIL_2MB(x)    ((((uintptr_t)x) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB
-
-static void
-spdk_vhost_session_mem_register(struct spdk_vhost_session *vsession)
-{
-       struct rte_vhost_mem_region *region;
-       uint32_t i;
-       uint64_t previous_start = UINT64_MAX;
-
-       for (i = 0; i < vsession->mem->nregions; i++) {
-               uint64_t start, end, len;
-               region = &vsession->mem->regions[i];
-               start = FLOOR_2MB(region->mmap_addr);
-               end = CEIL_2MB(region->mmap_addr + region->mmap_size);
-               if (start == previous_start) {
-                       start += (size_t) SIZE_2MB;
-               }
-               previous_start = start;
-               len = end - start;
-               SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
-                            start, len);
-
-               if (spdk_mem_register((void *)start, len) != 0) {
-                       SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n",
-                                    i);
-                       continue;
-               }
-       }
-}
-
-static void
-spdk_vhost_session_mem_unregister(struct spdk_vhost_session *vsession)
-{
-       struct rte_vhost_mem_region *region;
-       uint32_t i;
-       uint64_t previous_start = UINT64_MAX;
-
-       for (i = 0; i < vsession->mem->nregions; i++) {
-               uint64_t start, end, len;
-               region = &vsession->mem->regions[i];
-               start = FLOOR_2MB(region->mmap_addr);
-               end = CEIL_2MB(region->mmap_addr + region->mmap_size);
-               if (start == previous_start) {
-                       start += (size_t) SIZE_2MB;
-               }
-               previous_start = start;
-               len = end - start;
-
-               if (spdk_vtophys((void *) start, NULL) == SPDK_VTOPHYS_ERROR) {
-                       continue; /* region has not been registered */
-               }
-
-               if (spdk_mem_unregister((void *)start, len) != 0) {
-                       assert(false);
-               }
-       }
-
-}
-
-void
-spdk_vhost_free_reactor(uint32_t lcore)
-{
-       g_num_ctrlrs[lcore]--;
-}
-
 struct spdk_vhost_dev *
 spdk_vhost_dev_next(struct spdk_vhost_dev *vdev)
 {
        if (vdev == NULL) {
-               return TAILQ_FIRST(&g_spdk_vhost_devices);
+               return TAILQ_FIRST(&g_vhost_devices);
        }
 
        return TAILQ_NEXT(vdev, tailq);
@@ -629,7 +765,7 @@ spdk_vhost_dev_find(const char *ctrlr_name)
                ctrlr_name += dev_dirname_len;
        }
 
-       TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
+       TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
                if (strcmp(vdev->name, ctrlr_name) == 0) {
                        return vdev;
                }
@@ -639,7 +775,7 @@ spdk_vhost_dev_find(const char *ctrlr_name)
 }
 
 static int
-spdk_vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
+vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
 {
        int rc;
 
@@ -648,44 +784,59 @@ spdk_vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
        }
 
        if (mask == NULL) {
-               spdk_cpuset_copy(cpumask, spdk_app_get_core_mask());
+               spdk_cpuset_copy(cpumask, &g_vhost_core_mask);
                return 0;
        }
 
-       rc = spdk_app_parse_core_mask(mask, cpumask);
+       rc = spdk_cpuset_parse(cpumask, mask);
        if (rc < 0) {
                SPDK_ERRLOG("invalid cpumask %s\n", mask);
                return -1;
        }
 
+       spdk_cpuset_and(cpumask, &g_vhost_core_mask);
+
        if (spdk_cpuset_count(cpumask) == 0) {
-               SPDK_ERRLOG("no cpu is selected among reactor mask(=%s)\n",
-                           spdk_cpuset_fmt(spdk_app_get_core_mask()));
+               SPDK_ERRLOG("no cpu is selected among core mask(=%s)\n",
+                           spdk_cpuset_fmt(&g_vhost_core_mask));
                return -1;
        }
 
        return 0;
 }
 
-static void *
-_start_rte_driver(void *arg)
+static void
+vhost_setup_core_mask(void *ctx)
 {
-       char *path = arg;
+       struct spdk_thread *thread = spdk_get_thread();
+       spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(thread));
+}
 
-       if (rte_vhost_driver_start(path) != 0) {
-               return NULL;
+static void
+vhost_setup_core_mask_done(void *ctx)
+{
+       spdk_vhost_init_cb init_cb = ctx;
+
+       if (spdk_cpuset_count(&g_vhost_core_mask) == 0) {
+               init_cb(-ECHILD);
+               return;
        }
 
-       return path;
+       init_cb(0);
+}
+
+static void
+vhost_dev_thread_exit(void *arg1)
+{
+       spdk_thread_exit(spdk_get_thread());
 }
 
 int
-spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
-                       const struct spdk_vhost_dev_backend *backend)
+vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
+                  const struct spdk_vhost_dev_backend *backend)
 {
        char path[PATH_MAX];
-       struct stat file_stat;
-       struct spdk_cpuset *cpumask;
+       struct spdk_cpuset cpumask = {};
        int rc;
 
        assert(vdev);
@@ -694,125 +845,71 @@ spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const cha
                return -EINVAL;
        }
 
-       cpumask = spdk_cpuset_alloc();
-       if (!cpumask) {
-               SPDK_ERRLOG("spdk_cpuset_alloc failed\n");
-               return -ENOMEM;
-       }
-
-       if (spdk_vhost_parse_core_mask(mask_str, cpumask) != 0) {
-               SPDK_ERRLOG("cpumask %s is invalid (app mask is 0x%s)\n",
-                           mask_str, spdk_cpuset_fmt(spdk_app_get_core_mask()));
-               rc = -EINVAL;
-               goto out;
+       if (vhost_parse_core_mask(mask_str, &cpumask) != 0) {
+               SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n",
+                           mask_str, spdk_cpuset_fmt(&g_vhost_core_mask));
+               return -EINVAL;
        }
 
        if (spdk_vhost_dev_find(name)) {
                SPDK_ERRLOG("vhost controller %s already exists.\n", name);
-               rc = -EEXIST;
-               goto out;
+               return -EEXIST;
        }
 
        if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
                SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
                            name);
-               rc = -EINVAL;
-               goto out;
-       }
-
-       /* Register vhost driver to handle vhost messages. */
-       if (stat(path, &file_stat) != -1) {
-               if (!S_ISSOCK(file_stat.st_mode)) {
-                       SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
-                                   "The file already exists and is not a socket.\n",
-                                   path);
-                       rc = -EIO;
-                       goto out;
-               } else if (unlink(path) != 0) {
-                       SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
-                                   "The socket already exists and failed to unlink.\n",
-                                   path);
-                       rc = -EIO;
-                       goto out;
-               }
-       }
-
-       if (rte_vhost_driver_register(path, 0) != 0) {
-               SPDK_ERRLOG("Could not register controller %s with vhost library\n", name);
-               SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
-               rc = -EIO;
-               goto out;
-       }
-       if (rte_vhost_driver_set_features(path, backend->virtio_features) ||
-           rte_vhost_driver_disable_features(path, backend->disabled_features)) {
-               SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name);
-
-               rte_vhost_driver_unregister(path);
-               rc = -EIO;
-               goto out;
+               return -EINVAL;
        }
 
-       if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) {
-               rte_vhost_driver_unregister(path);
-               SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name);
+       vdev->name = strdup(name);
+       vdev->path = strdup(path);
+       if (vdev->name == NULL || vdev->path == NULL) {
                rc = -EIO;
                goto out;
        }
 
-       vdev->name = strdup(name);
-       vdev->path = strdup(path);
-       if (vdev->name == NULL || vdev->path == NULL) {
-               free(vdev->name);
-               free(vdev->path);
-               rte_vhost_driver_unregister(path);
+       vdev->thread = spdk_thread_create(vdev->name, &cpumask);
+       if (vdev->thread == NULL) {
+               SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name);
                rc = -EIO;
                goto out;
        }
 
-       vdev->cpumask = cpumask;
        vdev->registered = true;
        vdev->backend = backend;
        TAILQ_INIT(&vdev->vsessions);
-       TAILQ_INSERT_TAIL(&g_spdk_vhost_devices, vdev, tailq);
 
-       spdk_vhost_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
-                                 SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
+       vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
+                                SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
 
-       spdk_vhost_dev_install_rte_compat_hooks(vdev);
-
-       /* The following might start a POSIX thread that polls for incoming
-        * socket connections and calls backend->start/stop_device. These backend
-        * callbacks are also protected by the global SPDK vhost mutex, so we're
-        * safe with not initializing the vdev just yet.
-        */
-       if (spdk_call_unaffinitized(_start_rte_driver, path) == NULL) {
-               SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n",
-                           name, errno, spdk_strerror(errno));
-               rte_vhost_driver_unregister(path);
-               TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq);
-               free(vdev->name);
-               free(vdev->path);
+       if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features,
+                                      vdev->protocol_features)) {
+               spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
                rc = -EIO;
                goto out;
        }
 
+       TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq);
+
        SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name);
        return 0;
 
 out:
-       spdk_cpuset_free(cpumask);
+       free(vdev->name);
+       free(vdev->path);
        return rc;
 }
 
 int
-spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev)
+vhost_dev_unregister(struct spdk_vhost_dev *vdev)
 {
        if (!TAILQ_EMPTY(&vdev->vsessions)) {
                SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name);
                return -EBUSY;
        }
 
-       if (vdev->registered && rte_vhost_driver_unregister(vdev->path) != 0) {
+       if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) {
                SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
                            "Check if domain socket %s still exists\n",
                            vdev->name, vdev->path);
@@ -821,27 +918,14 @@ spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev)
 
        SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name);
 
+       spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
+
        free(vdev->name);
        free(vdev->path);
-       spdk_cpuset_free(vdev->cpumask);
-       TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq);
+       TAILQ_REMOVE(&g_vhost_devices, vdev, tailq);
        return 0;
 }
 
-static struct spdk_vhost_session *
-spdk_vhost_session_next(struct spdk_vhost_dev *vdev, unsigned prev_id)
-{
-       struct spdk_vhost_session *vsession;
-
-       TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
-               if (vsession->id > prev_id) {
-                       return vsession;
-               }
-       }
-
-       return NULL;
-}
-
 const char *
 spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
 {
@@ -853,209 +937,171 @@ const struct spdk_cpuset *
 spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
 {
        assert(vdev != NULL);
-       return vdev->cpumask;
+       return spdk_thread_get_cpumask(vdev->thread);
 }
 
-uint32_t
-spdk_vhost_allocate_reactor(struct spdk_cpuset *cpumask)
+static void
+wait_for_semaphore(int timeout_sec, const char *errmsg)
 {
-       uint32_t i, selected_core;
-       uint32_t min_ctrlrs;
-
-       min_ctrlrs = INT_MAX;
-       selected_core = spdk_env_get_first_core();
-
-       SPDK_ENV_FOREACH_CORE(i) {
-               if (!spdk_cpuset_get_cpu(cpumask, i)) {
-                       continue;
-               }
+       struct timespec timeout;
+       int rc;
 
-               if (g_num_ctrlrs[i] < min_ctrlrs) {
-                       selected_core = i;
-                       min_ctrlrs = g_num_ctrlrs[i];
-               }
+       clock_gettime(CLOCK_REALTIME, &timeout);
+       timeout.tv_sec += timeout_sec;
+       rc = sem_timedwait(&g_dpdk_sem, &timeout);
+       if (rc != 0) {
+               SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
+               sem_wait(&g_dpdk_sem);
        }
-
-       g_num_ctrlrs[selected_core]++;
-       return selected_core;
 }
 
 static void
-complete_session_event(struct spdk_vhost_session *vsession, int response)
+vhost_session_cb_done(int rc)
 {
-       struct spdk_vhost_session_fn_ctx *ctx = vsession->event_ctx;
-
-       ctx->response = response;
-       sem_post(&ctx->sem);
+       g_dpdk_response = rc;
+       sem_post(&g_dpdk_sem);
 }
 
 void
-spdk_vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
+vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
 {
        if (response == 0) {
                vsession->started = true;
+
                assert(vsession->vdev->active_session_num < UINT32_MAX);
                vsession->vdev->active_session_num++;
        }
-       complete_session_event(vsession, response);
+
+       vhost_session_cb_done(response);
 }
 
 void
-spdk_vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
+vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
 {
        if (response == 0) {
                vsession->started = false;
+
                assert(vsession->vdev->active_session_num > 0);
                vsession->vdev->active_session_num--;
        }
-       complete_session_event(vsession, response);
+
+       vhost_session_cb_done(response);
 }
 
 static void
-spdk_vhost_event_cb(void *arg1, void *arg2)
+vhost_event_cb(void *arg1)
 {
-       struct spdk_vhost_session_fn_ctx *ctx = arg1;
+       struct vhost_session_fn_ctx *ctx = arg1;
        struct spdk_vhost_session *vsession;
-       struct spdk_event *ev;
 
-       if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
-               ev = spdk_event_allocate(spdk_env_get_current_core(),
-                                        spdk_vhost_event_cb, arg1, NULL);
-               spdk_event_call(ev);
+       if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+               spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1);
                return;
        }
 
-       vsession = spdk_vhost_session_find_by_id(ctx->vdev, ctx->vsession_id);
+       vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id);
        ctx->cb_fn(ctx->vdev, vsession, NULL);
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       pthread_mutex_unlock(&g_vhost_mutex);
 }
 
-static void spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
-               struct spdk_vhost_session *vsession,
-               spdk_vhost_session_fn fn, void *arg);
-
-static void
-spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2)
+int
+vhost_session_send_event(struct spdk_vhost_session *vsession,
+                        spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
+                        const char *errmsg)
 {
-       struct spdk_vhost_session_fn_ctx *ctx = arg1;
-       struct spdk_vhost_session *vsession = NULL;
-       struct spdk_vhost_dev *vdev = ctx->vdev;
-       struct spdk_event *ev;
-       int rc;
+       struct vhost_session_fn_ctx ev_ctx = {0};
+       struct spdk_vhost_dev *vdev = vsession->vdev;
 
-       if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
-               ev = spdk_event_allocate(spdk_env_get_current_core(),
-                                        spdk_vhost_event_async_foreach_fn, arg1, NULL);
-               spdk_event_call(ev);
-               return;
-       }
+       ev_ctx.vdev = vdev;
+       ev_ctx.vsession_id = vsession->id;
+       ev_ctx.cb_fn = cb_fn;
 
-       vsession = spdk_vhost_session_find_by_id(vdev, ctx->vsession_id);
-       if (vsession == NULL || !vsession->initialized) {
-               /* The session must have been removed in the meantime, so we
-                * just skip it in our foreach chain
-                */
-               goto out_unlock_continue;
-       }
+       spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx);
 
-       if (vsession->started &&
-           (uint32_t)vsession->lcore != spdk_env_get_current_core()) {
-               /* if session has been relocated to other core, it is no longer thread-safe
-                * to access its contents here. Even though we're running under the global
-                * vhost mutex, the session itself (and its pollers) are not. We need to chase
-                * the session thread as many times as necessary.
-                */
-               ev = spdk_event_allocate(vsession->lcore,
-                                        spdk_vhost_event_async_foreach_fn, arg1, NULL);
-               spdk_event_call(ev);
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       pthread_mutex_unlock(&g_vhost_mutex);
+       wait_for_semaphore(timeout_sec, errmsg);
+       pthread_mutex_lock(&g_vhost_mutex);
+
+       return g_dpdk_response;
+}
+
+static void
+foreach_session_finish_cb(void *arg1)
+{
+       struct vhost_session_fn_ctx *ev_ctx = arg1;
+       struct spdk_vhost_dev *vdev = ev_ctx->vdev;
+
+       if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+               spdk_thread_send_msg(spdk_get_thread(),
+                                    foreach_session_finish_cb, arg1);
                return;
        }
 
-       rc = ctx->cb_fn(vdev, vsession, ctx->user_ctx);
-       if (rc < 0) {
-               goto out_unlock;
+       assert(vdev->pending_async_op_num > 0);
+       vdev->pending_async_op_num--;
+       if (ev_ctx->cpl_fn != NULL) {
+               ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx);
        }
 
-out_unlock_continue:
-       vsession = spdk_vhost_session_next(vdev, ctx->vsession_id);
-       spdk_vhost_external_event_foreach_continue(vdev, vsession, ctx->cb_fn, ctx->user_ctx);
-out_unlock:
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
-       free(ctx);
+       pthread_mutex_unlock(&g_vhost_mutex);
+       free(ev_ctx);
 }
 
-int
-spdk_vhost_session_send_event(int32_t lcore, struct spdk_vhost_session *vsession,
-                             spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
-                             const char *errmsg)
+static void
+foreach_session(void *arg1)
 {
-       struct spdk_vhost_session_fn_ctx ev_ctx = {0};
-       struct spdk_event *ev;
-       struct timespec timeout;
+       struct vhost_session_fn_ctx *ev_ctx = arg1;
+       struct spdk_vhost_session *vsession;
+       struct spdk_vhost_dev *vdev = ev_ctx->vdev;
        int rc;
 
-       rc = sem_init(&ev_ctx.sem, 0, 0);
-       if (rc != 0) {
-               SPDK_ERRLOG("Failed to initialize semaphore for vhost timed event\n");
-               return -errno;
+       if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+               spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1);
+               return;
        }
 
-       ev_ctx.vdev = vsession->vdev;
-       ev_ctx.vsession_id = vsession->id;
-       ev_ctx.cb_fn = cb_fn;
-
-       vsession->lcore = lcore;
-       vsession->event_ctx = &ev_ctx;
-       ev = spdk_event_allocate(lcore, spdk_vhost_event_cb, &ev_ctx, NULL);
-       assert(ev);
-       spdk_event_call(ev);
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
-
-       clock_gettime(CLOCK_REALTIME, &timeout);
-       timeout.tv_sec += timeout_sec;
-
-       rc = sem_timedwait(&ev_ctx.sem, &timeout);
-       if (rc != 0) {
-               SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
-               sem_wait(&ev_ctx.sem);
+       TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
+               if (vsession->initialized) {
+                       rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx);
+                       if (rc < 0) {
+                               goto out;
+                       }
+               }
        }
 
-       sem_destroy(&ev_ctx.sem);
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
-       vsession->event_ctx = NULL;
-       return ev_ctx.response;
+out:
+       pthread_mutex_unlock(&g_vhost_mutex);
+
+       spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1);
 }
 
-static int
-spdk_vhost_event_async_send_foreach_continue(struct spdk_vhost_session *vsession,
-               spdk_vhost_session_fn cb_fn, void *arg)
+void
+vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
+                         spdk_vhost_session_fn fn,
+                         spdk_vhost_dev_fn cpl_fn,
+                         void *arg)
 {
-       struct spdk_vhost_dev *vdev = vsession->vdev;
-       struct spdk_vhost_session_fn_ctx *ev_ctx;
-       struct spdk_event *ev;
+       struct vhost_session_fn_ctx *ev_ctx;
 
        ev_ctx = calloc(1, sizeof(*ev_ctx));
        if (ev_ctx == NULL) {
                SPDK_ERRLOG("Failed to alloc vhost event.\n");
                assert(false);
-               return -ENOMEM;
+               return;
        }
 
        ev_ctx->vdev = vdev;
-       ev_ctx->vsession_id = vsession->id;
-       ev_ctx->cb_fn = cb_fn;
+       ev_ctx->cb_fn = fn;
+       ev_ctx->cpl_fn = cpl_fn;
        ev_ctx->user_ctx = arg;
 
-       ev = spdk_event_allocate(vsession->lcore,
-                                spdk_vhost_event_async_foreach_fn, ev_ctx, NULL);
-       assert(ev);
-       spdk_event_call(ev);
+       assert(vdev->pending_async_op_num < UINT32_MAX);
+       vdev->pending_async_op_num++;
 
-       return 0;
+       spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx);
 }
 
-static void
+static int
 _stop_session(struct spdk_vhost_session *vsession)
 {
        struct spdk_vhost_dev *vdev = vsession->vdev;
@@ -1066,56 +1112,77 @@ _stop_session(struct spdk_vhost_session *vsession)
        rc = vdev->backend->stop_session(vsession);
        if (rc != 0) {
                SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid);
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
-               return;
+               pthread_mutex_unlock(&g_vhost_mutex);
+               return rc;
        }
 
        for (i = 0; i < vsession->max_queues; i++) {
                q = &vsession->virtqueue[i];
+
+               /* vring.desc and vring.desc_packed are in a union struct
+                * so q->vring.desc can replace q->vring.desc_packed.
+                */
                if (q->vring.desc == NULL) {
                        continue;
                }
+
+               /* Packed virtqueues support up to 2^15 entries each
+                * so left one bit can be used as wrap counter.
+                */
+               if (q->packed.packed_ring) {
+                       q->last_avail_idx = q->last_avail_idx |
+                                           ((uint16_t)q->packed.avail_phase << 15);
+                       q->last_used_idx = q->last_used_idx |
+                                          ((uint16_t)q->packed.used_phase << 15);
+               }
+
                rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
        }
 
-       spdk_vhost_session_mem_unregister(vsession);
+       vhost_session_mem_unregister(vsession->mem);
        free(vsession->mem);
+
+       return 0;
 }
 
-static void
-stop_device(int vid)
+int
+vhost_stop_device_cb(int vid)
 {
        struct spdk_vhost_session *vsession;
+       int rc;
 
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
-       vsession = spdk_vhost_session_find_by_vid(vid);
+       pthread_mutex_lock(&g_vhost_mutex);
+       vsession = vhost_session_find_by_vid(vid);
        if (vsession == NULL) {
                SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
-               return;
+               pthread_mutex_unlock(&g_vhost_mutex);
+               return -EINVAL;
        }
 
        if (!vsession->started) {
                /* already stopped, nothing to do */
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
-               return;
+               pthread_mutex_unlock(&g_vhost_mutex);
+               return -EALREADY;
        }
 
-       _stop_session(vsession);
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       rc = _stop_session(vsession);
+       pthread_mutex_unlock(&g_vhost_mutex);
+
+       return rc;
 }
 
-static int
-start_device(int vid)
+int
+vhost_start_device_cb(int vid)
 {
        struct spdk_vhost_dev *vdev;
        struct spdk_vhost_session *vsession;
        int rc = -1;
        uint16_t i;
+       bool packed_ring;
 
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
+       pthread_mutex_lock(&g_vhost_mutex);
 
-       vsession = spdk_vhost_session_find_by_vid(vid);
+       vsession = vhost_session_find_by_vid(vid);
        if (vsession == NULL) {
                SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
                goto out;
@@ -1128,6 +1195,13 @@ start_device(int vid)
                goto out;
        }
 
+       if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
+               SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
+               goto out;
+       }
+
+       packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0);
+
        vsession->max_queues = 0;
        memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue));
        for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
@@ -1138,7 +1212,11 @@ start_device(int vid)
                        continue;
                }
                q->vring_idx = i;
+               rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight);
 
+               /* vring.desc and vring.desc_packed are in a union struct
+                * so q->vring.desc can replace q->vring.desc_packed.
+                */
                if (q->vring.desc == NULL || q->vring.size == 0) {
                        continue;
                }
@@ -1148,21 +1226,27 @@ start_device(int vid)
                        continue;
                }
 
-               /* Disable notifications. */
-               if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) {
-                       SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i);
-                       goto out;
+               if (packed_ring) {
+                       /* Packed virtqueues support up to 2^15 entries each
+                        * so left one bit can be used as wrap counter.
+                        */
+                       q->packed.avail_phase = q->last_avail_idx >> 15;
+                       q->last_avail_idx = q->last_avail_idx & 0x7FFF;
+                       q->packed.used_phase = q->last_used_idx >> 15;
+                       q->last_used_idx = q->last_used_idx & 0x7FFF;
+
+                       /* Disable I/O submission notifications, we'll be polling. */
+                       q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE;
+               } else {
+                       /* Disable I/O submission notifications, we'll be polling. */
+                       q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
                }
 
+               q->packed.packed_ring = packed_ring;
                vsession->max_queues = i + 1;
        }
 
-       if (rte_vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
-               SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
-               goto out;
-       }
-
-       if (rte_vhost_get_mem_table(vid, &vsession->mem) != 0) {
+       if (vhost_get_mem_table(vid, &vsession->mem) != 0) {
                SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
                goto out;
        }
@@ -1179,36 +1263,39 @@ start_device(int vid)
        for (i = 0; i < vsession->max_queues; i++) {
                struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
 
+               /* vring.desc and vring.desc_packed are in a union struct
+                * so q->vring.desc can replace q->vring.desc_packed.
+                */
                if (q->vring.desc != NULL && q->vring.size > 0) {
                        rte_vhost_vring_call(vsession->vid, q->vring_idx);
                }
        }
 
-       spdk_vhost_session_set_coalescing(vdev, vsession, NULL);
-       spdk_vhost_session_mem_register(vsession);
+       vhost_session_set_coalescing(vdev, vsession, NULL);
+       vhost_session_mem_register(vsession->mem);
        vsession->initialized = true;
        rc = vdev->backend->start_session(vsession);
        if (rc != 0) {
-               spdk_vhost_session_mem_unregister(vsession);
+               vhost_session_mem_unregister(vsession->mem);
                free(vsession->mem);
                goto out;
        }
 
 out:
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       pthread_mutex_unlock(&g_vhost_mutex);
        return rc;
 }
 
 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
-static int
-get_config(int vid, uint8_t *config, uint32_t len)
+int
+vhost_get_config_cb(int vid, uint8_t *config, uint32_t len)
 {
        struct spdk_vhost_session *vsession;
        struct spdk_vhost_dev *vdev;
        int rc = -1;
 
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
-       vsession = spdk_vhost_session_find_by_vid(vid);
+       pthread_mutex_lock(&g_vhost_mutex);
+       vsession = vhost_session_find_by_vid(vid);
        if (vsession == NULL) {
                SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
                goto out;
@@ -1220,19 +1307,19 @@ get_config(int vid, uint8_t *config, uint32_t len)
        }
 
 out:
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       pthread_mutex_unlock(&g_vhost_mutex);
        return rc;
 }
 
-static int
-set_config(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
+int
+vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
 {
        struct spdk_vhost_session *vsession;
        struct spdk_vhost_dev *vdev;
        int rc = -1;
 
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
-       vsession = spdk_vhost_session_find_by_vid(vid);
+       pthread_mutex_lock(&g_vhost_mutex);
+       vsession = vhost_session_find_by_vid(vid);
        if (vsession == NULL) {
                SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
                goto out;
@@ -1244,7 +1331,7 @@ set_config(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t fl
        }
 
 out:
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       pthread_mutex_unlock(&g_vhost_mutex);
        return rc;
 }
 #endif
@@ -1274,7 +1361,7 @@ spdk_vhost_set_socket_path(const char *basename)
 }
 
 void
-spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
 {
        assert(vdev->backend->dump_info_json != NULL);
        vdev->backend->dump_info_json(vdev, w);
@@ -1290,25 +1377,18 @@ spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
        return vdev->backend->remove_device(vdev);
 }
 
-static int
-new_connection(int vid)
+int
+vhost_new_connection_cb(int vid, const char *ifname)
 {
        struct spdk_vhost_dev *vdev;
        struct spdk_vhost_session *vsession;
-       char ifname[PATH_MAX];
 
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
-
-       if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) {
-               SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid);
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
-               return -1;
-       }
+       pthread_mutex_lock(&g_vhost_mutex);
 
        vdev = spdk_vhost_dev_find(ifname);
        if (vdev == NULL) {
                SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
+               pthread_mutex_unlock(&g_vhost_mutex);
                return -1;
        }
 
@@ -1325,15 +1405,21 @@ new_connection(int vid)
        if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) +
                           vdev->backend->session_ctx_size)) {
                SPDK_ERRLOG("vsession alloc failed\n");
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
+               pthread_mutex_unlock(&g_vhost_mutex);
                return -1;
        }
        memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size);
 
        vsession->vdev = vdev;
-       vsession->id = vdev->vsessions_num++;
        vsession->vid = vid;
-       vsession->lcore = -1;
+       vsession->id = vdev->vsessions_num++;
+       vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid);
+       if (vsession->name == NULL) {
+               SPDK_ERRLOG("vsession alloc failed\n");
+               pthread_mutex_unlock(&g_vhost_mutex);
+               free(vsession);
+               return -1;
+       }
        vsession->started = false;
        vsession->initialized = false;
        vsession->next_stats_check_time = 0;
@@ -1341,194 +1427,120 @@ new_connection(int vid)
                                         spdk_get_ticks_hz() / 1000UL;
        TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq);
 
-       spdk_vhost_session_install_rte_compat_hooks(vsession);
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       vhost_session_install_rte_compat_hooks(vsession);
+       pthread_mutex_unlock(&g_vhost_mutex);
        return 0;
 }
 
-static void
-destroy_connection(int vid)
+int
+vhost_destroy_connection_cb(int vid)
 {
        struct spdk_vhost_session *vsession;
+       int rc = 0;
 
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
-       vsession = spdk_vhost_session_find_by_vid(vid);
+       pthread_mutex_lock(&g_vhost_mutex);
+       vsession = vhost_session_find_by_vid(vid);
        if (vsession == NULL) {
                SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
-               pthread_mutex_unlock(&g_spdk_vhost_mutex);
-               return;
+               pthread_mutex_unlock(&g_vhost_mutex);
+               return -EINVAL;
        }
 
        if (vsession->started) {
-               _stop_session(vsession);
+               rc = _stop_session(vsession);
        }
 
        TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq);
+       free(vsession->name);
        free(vsession);
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
-}
+       pthread_mutex_unlock(&g_vhost_mutex);
 
-static void
-spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
-               struct spdk_vhost_session *vsession,
-               spdk_vhost_session_fn fn, void *arg)
-{
-       int rc;
-
-       if (vsession == NULL) {
-               goto out_finish_foreach;
-       }
-
-       while (!vsession->started) {
-               if (vsession->initialized) {
-                       rc = fn(vdev, vsession, arg);
-                       if (rc < 0) {
-                               return;
-                       }
-               }
-
-               vsession = spdk_vhost_session_next(vdev, vsession->id);
-               if (vsession == NULL) {
-                       goto out_finish_foreach;
-               }
-       }
-
-       spdk_vhost_event_async_send_foreach_continue(vsession, fn, arg);
-       return;
-
-out_finish_foreach:
-       /* there are no more sessions to iterate through, so call the
-        * fn one last time with vsession == NULL
-        */
-       assert(vdev->pending_async_op_num > 0);
-       vdev->pending_async_op_num--;
-       fn(vdev, NULL, arg);
-}
-
-void
-spdk_vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
-                              spdk_vhost_session_fn fn, void *arg)
-{
-       struct spdk_vhost_session *vsession = TAILQ_FIRST(&vdev->vsessions);
-
-       assert(vdev->pending_async_op_num < UINT32_MAX);
-       vdev->pending_async_op_num++;
-       spdk_vhost_external_event_foreach_continue(vdev, vsession, fn, arg);
+       return rc;
 }
 
 void
 spdk_vhost_lock(void)
 {
-       pthread_mutex_lock(&g_spdk_vhost_mutex);
+       pthread_mutex_lock(&g_vhost_mutex);
 }
 
 int
 spdk_vhost_trylock(void)
 {
-       return -pthread_mutex_trylock(&g_spdk_vhost_mutex);
+       return -pthread_mutex_trylock(&g_vhost_mutex);
 }
 
 void
 spdk_vhost_unlock(void)
 {
-       pthread_mutex_unlock(&g_spdk_vhost_mutex);
+       pthread_mutex_unlock(&g_vhost_mutex);
 }
 
-static void
-vhost_create_poll_group_done(void *ctx)
+void
+spdk_vhost_init(spdk_vhost_init_cb init_cb)
 {
-       spdk_vhost_init_cb init_cb = ctx;
+       size_t len;
        int ret;
 
-       if (TAILQ_EMPTY(&g_poll_groups)) {
-               /* No threads? Iteration failed? */
-               init_cb(-ECHILD);
-               return;
+       g_vhost_init_thread = spdk_get_thread();
+       assert(g_vhost_init_thread != NULL);
+
+       if (dev_dirname[0] == '\0') {
+               if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
+                       SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
+                       ret = -1;
+                       goto out;
+               }
+
+               len = strlen(dev_dirname);
+               if (dev_dirname[len - 1] != '/') {
+                       dev_dirname[len] = '/';
+                       dev_dirname[len + 1] = '\0';
+               }
        }
 
-       ret = spdk_vhost_scsi_controller_construct();
+       ret = sem_init(&g_dpdk_sem, 0, 0);
+       if (ret != 0) {
+               SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n");
+               ret = -1;
+               goto out;
+       }
+
+       ret = vhost_scsi_controller_construct();
        if (ret != 0) {
                SPDK_ERRLOG("Cannot construct vhost controllers\n");
                goto out;
        }
 
-       ret = spdk_vhost_blk_controller_construct();
+       ret = vhost_blk_controller_construct();
        if (ret != 0) {
                SPDK_ERRLOG("Cannot construct vhost block controllers\n");
                goto out;
        }
 
 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
-       ret = spdk_vhost_nvme_controller_construct();
+       ret = vhost_nvme_controller_construct();
        if (ret != 0) {
                SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n");
                goto out;
        }
 #endif
 
-out:
-       init_cb(ret);
-}
-
-static void
-vhost_create_poll_group(void *ctx)
-{
-       struct vhost_poll_group *pg;
+       spdk_cpuset_zero(&g_vhost_core_mask);
 
-       pg = calloc(1, sizeof(*pg));
-       if (!pg) {
-               SPDK_ERRLOG("Not enough memory to allocate poll groups\n");
-               spdk_app_stop(-ENOMEM);
-               return;
-       }
-
-       pg->thread = spdk_get_thread();
-       TAILQ_INSERT_TAIL(&g_poll_groups, pg, tailq);
-}
-
-void
-spdk_vhost_init(spdk_vhost_init_cb init_cb)
-{
-       uint32_t last_core;
-       size_t len;
-       int ret;
-
-       if (dev_dirname[0] == '\0') {
-               if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
-                       SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
-                       ret = -1;
-                       goto err_out;
-               }
-
-               len = strlen(dev_dirname);
-               if (dev_dirname[len - 1] != '/') {
-                       dev_dirname[len] = '/';
-                       dev_dirname[len + 1] = '\0';
-               }
-       }
-
-       last_core = spdk_env_get_last_core();
-       g_num_ctrlrs = calloc(last_core + 1, sizeof(uint32_t));
-       if (!g_num_ctrlrs) {
-               SPDK_ERRLOG("Could not allocate array size=%u for g_num_ctrlrs\n",
-                           last_core + 1);
-               ret = -1;
-               goto err_out;
-       }
-
-       spdk_for_each_thread(vhost_create_poll_group,
-                            init_cb,
-                            vhost_create_poll_group_done);
+       /* iterate threads instead of using SPDK_ENV_FOREACH_CORE to ensure that threads are really
+        * created.
+        */
+       spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done);
        return;
-err_out:
+out:
        init_cb(ret);
 }
 
 static void
-_spdk_vhost_fini(void *arg1)
+vhost_fini(void *arg1)
 {
        struct spdk_vhost_dev *vdev, *tmp;
-       struct vhost_poll_group *pg, *tpg;
 
        spdk_vhost_lock();
        vdev = spdk_vhost_dev_next(NULL);
@@ -1540,12 +1552,11 @@ _spdk_vhost_fini(void *arg1)
        }
        spdk_vhost_unlock();
 
+       spdk_cpuset_zero(&g_vhost_core_mask);
+
        /* All devices are removed now. */
-       free(g_num_ctrlrs);
-       TAILQ_FOREACH_SAFE(pg, &g_poll_groups, tailq, tpg) {
-               TAILQ_REMOVE(&g_poll_groups, pg, tailq);
-               free(pg);
-       }
+       sem_destroy(&g_dpdk_sem);
+
        g_fini_cpl_cb();
 }
 
@@ -1554,13 +1565,13 @@ session_shutdown(void *arg)
 {
        struct spdk_vhost_dev *vdev = NULL;
 
-       TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
-               rte_vhost_driver_unregister(vdev->path);
+       TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
+               vhost_driver_unregister(vdev->path);
                vdev->registered = false;
        }
 
        SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n");
-       spdk_thread_send_msg(g_fini_thread, _spdk_vhost_fini, NULL);
+       spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL);
        return NULL;
 }
 
@@ -1570,7 +1581,7 @@ spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
        pthread_t tid;
        int rc;
 
-       g_fini_thread = spdk_get_thread();
+       assert(spdk_get_thread() == g_vhost_init_thread);
        g_fini_cpl_cb = fini_cb;
 
        /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
@@ -1602,7 +1613,7 @@ spdk_vhost_config_json(struct spdk_json_write_ctx *w)
                spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
                if (delay_base_us) {
                        spdk_json_write_object_begin(w);
-                       spdk_json_write_named_string(w, "method", "set_vhost_controller_coalescing");
+                       spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing");
 
                        spdk_json_write_named_object_begin(w, "params");
                        spdk_json_write_named_string(w, "ctrlr", vdev->name);