import 15.2.0 Octopus source

[ceph.git] / ceph / src / spdk / lib / vhost / vhost_blk.c
diff --git a/ceph/src/spdk/lib/vhost/vhost_blk.c b/ceph/src/spdk/lib/vhost/vhost_blk.c

index 6a9a1896f57dccbb3b80f726e5945b3317287a37..3154098501c2b177c42c6171f1a887a9691d3c9f 100644 (file)
--- a/ceph/src/spdk/lib/vhost/vhost_blk.c
+++ b/ceph/src/spdk/lib/vhost/vhost_blk.c
@@ -35,6 +35,7 @@
  
  #include "spdk/env.h"
  #include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
  #include "spdk/conf.h"
  #include "spdk/thread.h"
  #include "spdk/likely.h"
@@ -46,7 +47,7 @@
  
  struct spdk_vhost_blk_task {
         struct spdk_bdev_io *bdev_io;
-       struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_vhost_blk_session *bvsession;
         struct spdk_vhost_virtqueue *vq;
  
         volatile uint8_t *status;
@@ -69,24 +70,31 @@ struct spdk_vhost_blk_dev {
         struct spdk_vhost_dev vdev;
         struct spdk_bdev *bdev;
         struct spdk_bdev_desc *bdev_desc;
-       struct spdk_io_channel *bdev_io_channel;
-       struct spdk_poller *requestq_poller;
-       struct spdk_vhost_dev_destroy_ctx destroy_ctx;
         bool readonly;
  };
  
+struct spdk_vhost_blk_session {
+       /* The parent session must be the very first field in this struct */
+       struct spdk_vhost_session vsession;
+       struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_poller *requestq_poller;
+       struct spdk_io_channel *io_channel;
+       struct spdk_poller *stop_poller;
+};
+
  /* forward declaration */
  static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
  
  static int
-process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
+process_blk_request(struct spdk_vhost_blk_task *task,
+                   struct spdk_vhost_blk_session *bvsession,
                     struct spdk_vhost_virtqueue *vq);
  
  static void
  blk_task_finish(struct spdk_vhost_blk_task *task)
  {
-       assert(task->bvdev->vdev.task_cnt > 0);
-       task->bvdev->vdev.task_cnt--;
+       assert(task->bvsession->vsession.task_cnt > 0);
+       task->bvsession->vsession.task_cnt--;
         task->used = false;
  }
  
@@ -97,7 +105,7 @@ invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
                 *task->status = status;
         }
  
-       spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
+       spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
                                         task->used_len);
         blk_task_finish(task);
         SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
@@ -111,20 +119,24 @@ invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
   *   FIXME: Make this function return to rd_cnt and wr_cnt
   */
  static int
-blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
-              struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
+blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
+              uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
  {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
+       struct spdk_vhost_dev *vdev = vsession->vdev;
         struct vring_desc *desc, *desc_table;
         uint16_t out_cnt = 0, cnt = 0;
         uint32_t desc_table_size, len = 0;
+       uint32_t desc_handled_cnt;
         int rc;
  
-       rc = spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc, &desc_table, &desc_table_size);
+       rc = spdk_vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
         if (rc != 0) {
                 SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
                 return -1;
         }
  
+       desc_handled_cnt = 0;
         while (1) {
                 /*
                  * Maximum cnt reached?
@@ -136,7 +148,7 @@ blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uin
                         return -1;
                 }
  
-               if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &cnt, desc))) {
+               if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
                         SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
                                       req_idx, cnt);
                         return -1;
@@ -154,6 +166,14 @@ blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uin
                 } else if (desc == NULL) {
                         break;
                 }
+
+               desc_handled_cnt++;
+               if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
+                       /* Break a cycle and report an error, if any. */
+                       SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
+                                   vdev->name, desc_table_size, desc_handled_cnt);
+                       return -1;
+               }
         }
  
         /*
@@ -174,7 +194,7 @@ static void
  blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
  {
         *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
-       spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
+       spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
                                         task->used_len);
         SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
                       task->req_idx, success ? "OK" : "FAIL");
@@ -196,7 +216,7 @@ blk_request_resubmit(void *arg)
         struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
         int rc = 0;
  
-       rc = process_blk_request(task, task->bvdev, task->vq);
+       rc = process_blk_request(task, task->bvsession, task->vq);
         if (rc == 0) {
                 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
         } else {
@@ -208,14 +228,14 @@ static inline void
  blk_request_queue_io(struct spdk_vhost_blk_task *task)
  {
         int rc;
-       struct spdk_vhost_blk_dev *bvdev = task->bvdev;
-       struct spdk_bdev *bdev = bvdev->bdev;
+       struct spdk_vhost_blk_session *bvsession = task->bvsession;
+       struct spdk_bdev *bdev = bvsession->bvdev->bdev;
  
         task->bdev_io_wait.bdev = bdev;
         task->bdev_io_wait.cb_fn = blk_request_resubmit;
         task->bdev_io_wait.cb_arg = task;
  
-       rc = spdk_bdev_queue_io_wait(bdev, bvdev->bdev_io_channel, &task->bdev_io_wait);
+       rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
         if (rc != 0) {
                 SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
                 invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
@@ -223,16 +243,20 @@ blk_request_queue_io(struct spdk_vhost_blk_task *task)
  }
  
  static int
-process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
+process_blk_request(struct spdk_vhost_blk_task *task,
+                   struct spdk_vhost_blk_session *bvsession,
                     struct spdk_vhost_virtqueue *vq)
  {
+       struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
         const struct virtio_blk_outhdr *req;
+       struct virtio_blk_discard_write_zeroes *desc;
         struct iovec *iov;
         uint32_t type;
         uint32_t payload_len;
+       uint64_t flush_bytes;
         int rc;
  
-       if (blk_iovs_setup(&bvdev->vdev, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
+       if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
                 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
                 /* Only READ and WRITE are supported for now. */
                 invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
@@ -272,7 +296,7 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
         switch (type) {
         case VIRTIO_BLK_T_IN:
         case VIRTIO_BLK_T_OUT:
-               if (spdk_unlikely((payload_len & (512 - 1)) != 0)) {
+               if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
                         SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
                                     type ? "WRITE" : "READ", task->req_idx);
                         invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
@@ -281,12 +305,12 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
  
                 if (type == VIRTIO_BLK_T_IN) {
                         task->used_len = payload_len + sizeof(*task->status);
-                       rc = spdk_bdev_readv(bvdev->bdev_desc, bvdev->bdev_io_channel,
+                       rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
                                              &task->iovs[1], task->iovcnt, req->sector * 512,
                                              payload_len, blk_request_complete_cb, task);
                 } else if (!bvdev->readonly) {
                         task->used_len = sizeof(*task->status);
-                       rc = spdk_bdev_writev(bvdev->bdev_desc, bvdev->bdev_io_channel,
+                       rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
                                               &task->iovs[1], task->iovcnt, req->sector * 512,
                                               payload_len, blk_request_complete_cb, task);
                 } else {
@@ -304,6 +328,75 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
                         }
                 }
                 break;
+       case VIRTIO_BLK_T_DISCARD:
+               desc = task->iovs[1].iov_base;
+               if (payload_len != sizeof(*desc)) {
+                       SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
+                       invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                       return -1;
+               }
+
+               rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
+                                    desc->sector * 512, desc->num_sectors * 512,
+                                    blk_request_complete_cb, task);
+               if (rc) {
+                       if (rc == -ENOMEM) {
+                               SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+                               blk_request_queue_io(task);
+                       } else {
+                               invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                               return -1;
+                       }
+               }
+               break;
+       case VIRTIO_BLK_T_WRITE_ZEROES:
+               desc = task->iovs[1].iov_base;
+               if (payload_len != sizeof(*desc)) {
+                       SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
+                       invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                       return -1;
+               }
+
+               /* Zero and unmap the range: SPDK doesn't support it. */
+               if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+                       SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
+                       invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+                       return -1;
+               }
+
+               rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
+                                           desc->sector * 512, desc->num_sectors * 512,
+                                           blk_request_complete_cb, task);
+               if (rc) {
+                       if (rc == -ENOMEM) {
+                               SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+                               blk_request_queue_io(task);
+                       } else {
+                               invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                               return -1;
+                       }
+               }
+               break;
+       case VIRTIO_BLK_T_FLUSH:
+               flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
+               if (req->sector != 0) {
+                       SPDK_NOTICELOG("sector must be zero for flush command\n");
+                       invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                       return -1;
+               }
+               rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
+                                    0, flush_bytes,
+                                    blk_request_complete_cb, task);
+               if (rc) {
+                       if (rc == -ENOMEM) {
+                               SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+                               blk_request_queue_io(task);
+                       } else {
+                               invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                               return -1;
+                       }
+               }
+               break;
         case VIRTIO_BLK_T_GET_ID:
                 if (!task->iovcnt || !payload_len) {
                         invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
@@ -324,9 +417,11 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
  }
  
  static void
-process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
+process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
  {
+       struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
         struct spdk_vhost_blk_task *task;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
         int rc;
         uint16_t reqs[32];
         uint16_t reqs_cnt, i;
@@ -343,7 +438,7 @@ process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
                 if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
                         SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
                                     bvdev->vdev.name, reqs[i], vq->vring.size);
-                       spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
+                       spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
                         continue;
                 }
  
@@ -351,18 +446,18 @@ process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
                 if (spdk_unlikely(task->used)) {
                         SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
                                     bvdev->vdev.name, reqs[i]);
-                       spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
+                       spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
                         continue;
                 }
  
-               bvdev->vdev.task_cnt++;
+               vsession->task_cnt++;
  
                 task->used = true;
                 task->iovcnt = SPDK_COUNTOF(task->iovs);
                 task->status = NULL;
                 task->used_len = 0;
  
-               rc = process_blk_request(task, bvdev, vq);
+               rc = process_blk_request(task, bvsession, vq);
                 if (rc == 0) {
                         SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
                                       reqs[i]);
@@ -375,21 +470,24 @@ process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
  static int
  vdev_worker(void *arg)
  {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       struct spdk_vhost_blk_session *bvsession = arg;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
+
         uint16_t q_idx;
  
-       for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
-               process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
+       for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+               process_vq(bvsession, &vsession->virtqueue[q_idx]);
         }
  
-       spdk_vhost_dev_used_signal(&bvdev->vdev);
+       spdk_vhost_session_used_signal(vsession);
  
         return -1;
  }
  
  static void
-no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
+no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
  {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
         struct iovec iovs[SPDK_VHOST_IOVS_MAX];
         uint32_t length;
         uint16_t iovcnt, req_idx;
@@ -399,34 +497,50 @@ no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue
         }
  
         iovcnt = SPDK_COUNTOF(iovs);
-       if (blk_iovs_setup(&bvdev->vdev, vq, req_idx, iovs, &iovcnt, &length) == 0) {
+       if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
                 *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
                 SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
         }
  
-       spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, req_idx, 0);
+       spdk_vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
  }
  
  static int
  no_bdev_vdev_worker(void *arg)
  {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       struct spdk_vhost_blk_session *bvsession = arg;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
         uint16_t q_idx;
  
-       for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
-               no_bdev_process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
+       for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+               no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
         }
  
-       spdk_vhost_dev_used_signal(&bvdev->vdev);
+       spdk_vhost_session_used_signal(vsession);
  
-       if (bvdev->vdev.task_cnt == 0 && bvdev->bdev_io_channel) {
-               spdk_put_io_channel(bvdev->bdev_io_channel);
-               bvdev->bdev_io_channel = NULL;
+       if (vsession->task_cnt == 0 && bvsession->io_channel) {
+               spdk_put_io_channel(bvsession->io_channel);
+               bvsession->io_channel = NULL;
         }
  
         return -1;
  }
  
+static struct spdk_vhost_blk_session *
+to_blk_session(struct spdk_vhost_session *vsession)
+{
+       if (vsession == NULL) {
+               return NULL;
+       }
+
+       if (vsession->vdev->backend != &vhost_blk_device_backend) {
+               SPDK_ERRLOG("%s: not a vhost-blk device\n", vsession->vdev->name);
+               return NULL;
+       }
+
+       return (struct spdk_vhost_blk_session *)vsession;
+}
+
  static struct spdk_vhost_blk_dev *
  to_blk_dev(struct spdk_vhost_dev *vdev)
  {
@@ -452,20 +566,34 @@ spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
  }
  
  static int
-_bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg)
+_spdk_vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, struct spdk_vhost_session *vsession,
+                                  void *ctx)
  {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       struct spdk_vhost_blk_session *bvsession;
  
-       SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
-                    bvdev->vdev.name);
-       if (bvdev->requestq_poller) {
-               spdk_poller_unregister(&bvdev->requestq_poller);
-               bvdev->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvdev, 0);
+       if (vdev == NULL) {
+               /* Nothing to do */
+               return 0;
+       }
+
+       if (vsession == NULL) {
+               /* All sessions have been notified, time to close the bdev */
+               struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+
+               assert(bvdev != NULL);
+
+               spdk_bdev_close(bvdev->bdev_desc);
+               bvdev->bdev_desc = NULL;
+               bvdev->bdev = NULL;
+               return 0;
+       }
+
+       bvsession = (struct spdk_vhost_blk_session *)vsession;
+       if (bvsession->requestq_poller) {
+               spdk_poller_unregister(&bvsession->requestq_poller);
+               bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
         }
  
-       spdk_bdev_close(bvdev->bdev_desc);
-       bvdev->bdev_desc = NULL;
-       bvdev->bdev = NULL;
         return 0;
  }
  
@@ -474,37 +602,45 @@ bdev_remove_cb(void *remove_ctx)
  {
         struct spdk_vhost_blk_dev *bvdev = remove_ctx;
  
-       spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev);
+       SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
+                    bvdev->vdev.name);
+
+       spdk_vhost_lock();
+       spdk_vhost_dev_foreach_session(&bvdev->vdev, _spdk_vhost_session_bdev_remove_cb, NULL);
+       spdk_vhost_unlock();
  }
  
  static void
-free_task_pool(struct spdk_vhost_blk_dev *bvdev)
+free_task_pool(struct spdk_vhost_blk_session *bvsession)
  {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
         struct spdk_vhost_virtqueue *vq;
         uint16_t i;
  
-       for (i = 0; i < bvdev->vdev.max_queues; i++) {
-               vq = &bvdev->vdev.virtqueue[i];
+       for (i = 0; i < vsession->max_queues; i++) {
+               vq = &vsession->virtqueue[i];
                 if (vq->tasks == NULL) {
                         continue;
                 }
  
-               spdk_dma_free(vq->tasks);
+               spdk_free(vq->tasks);
                 vq->tasks = NULL;
         }
  }
  
  static int
-alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
+alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
  {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
+       struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
         struct spdk_vhost_virtqueue *vq;
         struct spdk_vhost_blk_task *task;
         uint32_t task_cnt;
         uint16_t i;
         uint32_t j;
  
-       for (i = 0; i < bvdev->vdev.max_queues; i++) {
-               vq = &bvdev->vdev.virtqueue[i];
+       for (i = 0; i < vsession->max_queues; i++) {
+               vq = &vsession->virtqueue[i];
                 if (vq->vring.desc == NULL) {
                         continue;
                 }
@@ -514,21 +650,22 @@ alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
                         /* sanity check */
                         SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
                                     bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
-                       free_task_pool(bvdev);
+                       free_task_pool(bvsession);
                         return -1;
                 }
-               vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
-                                            SPDK_CACHE_LINE_SIZE, NULL);
+               vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
+                                        SPDK_CACHE_LINE_SIZE, NULL,
+                                        SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
                 if (vq->tasks == NULL) {
                         SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
                                     bvdev->vdev.name, task_cnt, i);
-                       free_task_pool(bvdev);
+                       free_task_pool(bvsession);
                         return -1;
                 }
  
                 for (j = 0; j < task_cnt; j++) {
                         task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
-                       task->bvdev = bvdev;
+                       task->bvsession = bvsession;
                         task->req_idx = j;
                         task->vq = vq;
                 }
@@ -537,109 +674,140 @@ alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
         return 0;
  }
  
-/*
- * A new device is added to a data core. First the device is added to the main linked list
- * and then allocated to a specific data core.
- *
- */
  static int
-spdk_vhost_blk_start(struct spdk_vhost_dev *vdev, void *event_ctx)
+spdk_vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
+                       struct spdk_vhost_session *vsession, void *unused)
  {
         struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_vhost_blk_session *bvsession;
         int i, rc = 0;
  
-       bvdev = to_blk_dev(vdev);
-       if (bvdev == NULL) {
+       bvsession = to_blk_session(vsession);
+       if (bvsession == NULL) {
                 SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
                 rc = -1;
                 goto out;
         }
  
+       bvdev = to_blk_dev(vdev);
+       assert(bvdev != NULL);
+       bvsession->bvdev = bvdev;
+
         /* validate all I/O queues are in a contiguous index range */
-       for (i = 0; i < vdev->max_queues; i++) {
-               if (vdev->virtqueue[i].vring.desc == NULL) {
+       for (i = 0; i < vsession->max_queues; i++) {
+               if (vsession->virtqueue[i].vring.desc == NULL) {
                         SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
                         rc = -1;
                         goto out;
                 }
         }
  
-       rc = alloc_task_pool(bvdev);
+       rc = alloc_task_pool(bvsession);
         if (rc != 0) {
                 SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
                 goto out;
         }
  
         if (bvdev->bdev) {
-               bvdev->bdev_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
-               if (!bvdev->bdev_io_channel) {
-                       free_task_pool(bvdev);
+               bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
+               if (!bvsession->io_channel) {
+                       free_task_pool(bvsession);
                         SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
                         rc = -1;
                         goto out;
                 }
         }
  
-       bvdev->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
-                                bvdev, 0);
+       bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
+                                    bvsession, 0);
         SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
-                    vdev->name, vdev->lcore);
+                    vdev->name, spdk_env_get_current_core());
  out:
-       spdk_vhost_dev_backend_event_done(event_ctx, rc);
+       spdk_vhost_session_start_done(vsession, rc);
         return rc;
  }
  
  static int
-destroy_device_poller_cb(void *arg)
+spdk_vhost_blk_start(struct spdk_vhost_session *vsession)
  {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       int32_t lcore;
+       int rc;
+
+       lcore = spdk_vhost_allocate_reactor(vsession->vdev->cpumask);
+       rc = spdk_vhost_session_send_event(lcore, vsession, spdk_vhost_blk_start_cb,
+                                          3, "start session");
+
+       if (rc != 0) {
+               spdk_vhost_free_reactor(lcore);
+       }
+
+       return rc;
+}
+
+static int
+destroy_session_poller_cb(void *arg)
+{
+       struct spdk_vhost_blk_session *bvsession = arg;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
         int i;
  
-       if (bvdev->vdev.task_cnt > 0) {
+       if (vsession->task_cnt > 0) {
+               return -1;
+       }
+
+       if (spdk_vhost_trylock() != 0) {
                 return -1;
         }
  
-       for (i = 0; i < bvdev->vdev.max_queues; i++) {
-               bvdev->vdev.virtqueue[i].next_event_time = 0;
-               spdk_vhost_vq_used_signal(&bvdev->vdev, &bvdev->vdev.virtqueue[i]);
+       for (i = 0; i < vsession->max_queues; i++) {
+               vsession->virtqueue[i].next_event_time = 0;
+               spdk_vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
         }
  
-       SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", bvdev->vdev.name);
+       SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", vsession->vdev->name);
  
-       if (bvdev->bdev_io_channel) {
-               spdk_put_io_channel(bvdev->bdev_io_channel);
-               bvdev->bdev_io_channel = NULL;
+       if (bvsession->io_channel) {
+               spdk_put_io_channel(bvsession->io_channel);
+               bvsession->io_channel = NULL;
         }
  
-       free_task_pool(bvdev);
-       spdk_poller_unregister(&bvdev->destroy_ctx.poller);
-       spdk_vhost_dev_backend_event_done(bvdev->destroy_ctx.event_ctx, 0);
+       free_task_pool(bvsession);
+       spdk_poller_unregister(&bvsession->stop_poller);
+       spdk_vhost_session_stop_done(vsession, 0);
  
+       spdk_vhost_unlock();
         return -1;
  }
  
  static int
-spdk_vhost_blk_stop(struct spdk_vhost_dev *vdev, void *event_ctx)
+spdk_vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
+                      struct spdk_vhost_session *vsession, void *unused)
  {
-       struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_vhost_blk_session *bvsession;
  
-       bvdev = to_blk_dev(vdev);
-       if (bvdev == NULL) {
+       bvsession = to_blk_session(vsession);
+       if (bvsession == NULL) {
                 SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
                 goto err;
         }
  
-       bvdev->destroy_ctx.event_ctx = event_ctx;
-       spdk_poller_unregister(&bvdev->requestq_poller);
-       bvdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb,
-                                   bvdev, 1000);
+       spdk_poller_unregister(&bvsession->requestq_poller);
+       bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
+                                bvsession, 1000);
         return 0;
  
  err:
-       spdk_vhost_dev_backend_event_done(event_ctx, -1);
+       spdk_vhost_session_stop_done(vsession, -1);
         return -1;
  }
  
+static int
+spdk_vhost_blk_stop(struct spdk_vhost_session *vsession)
+{
+       return spdk_vhost_session_send_event(vsession->lcore, vsession,
+                                            spdk_vhost_blk_stop_cb, 3, "stop session");
+}
+
  static void
  spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
  {
@@ -652,11 +820,9 @@ spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_writ
         }
  
         assert(bvdev != NULL);
-       spdk_json_write_name(w, "block");
-       spdk_json_write_object_begin(w);
+       spdk_json_write_named_object_begin(w, "block");
  
-       spdk_json_write_name(w, "readonly");
-       spdk_json_write_bool(w, bvdev->readonly);
+       spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
  
         spdk_json_write_name(w, "bdev");
         if (bdev) {
@@ -701,7 +867,7 @@ static int
  spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
                           uint32_t len)
  {
-       struct virtio_blk_config *blkcfg = (struct virtio_blk_config *)config;
+       struct virtio_blk_config blkcfg;
         struct spdk_vhost_blk_dev *bvdev;
         struct spdk_bdev *bdev;
         uint32_t blk_size;
@@ -713,10 +879,6 @@ spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
                 return -1;
         }
  
-       if (len < sizeof(*blkcfg)) {
-               return -1;
-       }
-
         bdev = bvdev->bdev;
         if (bdev == NULL) {
                 /* We can't just return -1 here as this GET_CONFIG message might
@@ -735,19 +897,37 @@ spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
         } else {
                 blk_size = spdk_bdev_get_block_size(bdev);
                 blkcnt = spdk_bdev_get_num_blocks(bdev);
+               if (spdk_bdev_get_buf_align(bdev) > 1) {
+                       blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
+                       blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
+               } else {
+                       blkcfg.size_max = 131072;
+                       /*  -2 for REQ and RESP and -1 for region boundary splitting */
+                       blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
+               }
         }
  
-       memset(blkcfg, 0, sizeof(*blkcfg));
-       blkcfg->blk_size = blk_size;
+       memset(&blkcfg, 0, sizeof(blkcfg));
+       blkcfg.blk_size = blk_size;
         /* minimum I/O size in blocks */
-       blkcfg->min_io_size = 1;
+       blkcfg.min_io_size = 1;
         /* expressed in 512 Bytes sectors */
-       blkcfg->capacity = (blkcnt * blk_size) / 512;
-       blkcfg->size_max = 131072;
-       /*  -2 for REQ and RESP and -1 for region boundary splitting */
-       blkcfg->seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
+       blkcfg.capacity = (blkcnt * blk_size) / 512;
         /* QEMU can overwrite this value when started */
-       blkcfg->num_queues = SPDK_VHOST_MAX_VQUEUES;
+       blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
+
+       if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+               /* 16MiB, expressed in 512 Bytes sectors */
+               blkcfg.max_discard_sectors = 32768;
+               blkcfg.max_discard_seg = 1;
+               blkcfg.discard_sector_alignment = blk_size / 512;
+       }
+       if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+               blkcfg.max_write_zeroes_sectors = 32768;
+               blkcfg.max_write_zeroes_seg = 1;
+       }
+
+       memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
  
         return 0;
  }
@@ -759,12 +939,15 @@ static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
         (1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
         (1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
         (1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
-       (1ULL << VIRTIO_BLK_F_MQ),
+       (1ULL << VIRTIO_BLK_F_MQ)       | (1ULL << VIRTIO_BLK_F_DISCARD) |
+       (1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
         .disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
         (1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
-       (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI),
-       .start_device =  spdk_vhost_blk_start,
-       .stop_device = spdk_vhost_blk_stop,
+       (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
+       (1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
+       .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
+       .start_session =  spdk_vhost_blk_start,
+       .stop_session = spdk_vhost_blk_stop,
         .vhost_get_config = spdk_vhost_blk_get_config,
         .dump_info_json = spdk_vhost_blk_dump_info_json,
         .write_config_json = spdk_vhost_blk_write_config_json,
@@ -819,6 +1002,7 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
  {
         struct spdk_vhost_blk_dev *bvdev = NULL;
         struct spdk_bdev *bdev;
+       uint64_t features = 0;
         int ret = 0;
  
         spdk_vhost_lock();
@@ -830,7 +1014,7 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
                 goto out;
         }
  
-       bvdev = spdk_dma_zmalloc(sizeof(*bvdev), SPDK_CACHE_LINE_SIZE, NULL);
+       bvdev = calloc(1, sizeof(*bvdev));
         if (bvdev == NULL) {
                 ret = -ENOMEM;
                 goto out;
@@ -851,14 +1035,27 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
                 goto out;
         }
  
-       if (readonly && rte_vhost_driver_enable_features(bvdev->vdev.path, (1ULL << VIRTIO_BLK_F_RO))) {
-               SPDK_ERRLOG("Controller %s: failed to set as a readonly\n", name);
-               spdk_bdev_close(bvdev->bdev_desc);
+       if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+               features |= (1ULL << VIRTIO_BLK_F_DISCARD);
+       }
+       if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+               features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
+       }
+       if (readonly) {
+               features |= (1ULL << VIRTIO_BLK_F_RO);
+       }
+       if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
+               features |= (1ULL << VIRTIO_BLK_F_FLUSH);
+       }
+
+       if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
+               SPDK_ERRLOG("Controller %s: failed to enable features 0x%"PRIx64"\n", name, features);
  
                 if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
                         SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
                 }
  
+               spdk_bdev_close(bvdev->bdev_desc);
                 ret = -1;
                 goto out;
         }
@@ -866,7 +1063,7 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
         SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
  out:
         if (ret != 0 && bvdev) {
-               spdk_dma_free(bvdev);
+               free(bvdev);
         }
         spdk_vhost_unlock();
         return ret;
@@ -893,7 +1090,7 @@ spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
         }
         bvdev->bdev = NULL;
  
-       spdk_dma_free(bvdev);
+       free(bvdev);
         return 0;
  }