git.proxmox.com Git - ceph.git/blobdiff - ceph/src/spdk/lib/vhost/vhost_blk.c
import 15.2.0 Octopus source
[ceph.git] / ceph / src / spdk / lib / vhost / vhost_blk.c
index 6a9a1896f57dccbb3b80f726e5945b3317287a37..3154098501c2b177c42c6171f1a887a9691d3c9f 100644 (file)
@@ -35,6 +35,7 @@
 
 #include "spdk/env.h"
 #include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
 #include "spdk/conf.h"
 #include "spdk/thread.h"
 #include "spdk/likely.h"
@@ -46,7 +47,7 @@
 
 struct spdk_vhost_blk_task {
        struct spdk_bdev_io *bdev_io;
-       struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_vhost_blk_session *bvsession;
        struct spdk_vhost_virtqueue *vq;
 
        volatile uint8_t *status;
@@ -69,24 +70,31 @@ struct spdk_vhost_blk_dev {
        struct spdk_vhost_dev vdev;
        struct spdk_bdev *bdev;
        struct spdk_bdev_desc *bdev_desc;
-       struct spdk_io_channel *bdev_io_channel;
-       struct spdk_poller *requestq_poller;
-       struct spdk_vhost_dev_destroy_ctx destroy_ctx;
        bool readonly;
 };
 
+struct spdk_vhost_blk_session {
+       /* The parent session must be the very first field in this struct */
+       struct spdk_vhost_session vsession;
+       struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_poller *requestq_poller;
+       struct spdk_io_channel *io_channel;
+       struct spdk_poller *stop_poller;
+};
+
 /* forward declaration */
 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
 
 static int
-process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
+process_blk_request(struct spdk_vhost_blk_task *task,
+                   struct spdk_vhost_blk_session *bvsession,
                    struct spdk_vhost_virtqueue *vq);
 
 static void
 blk_task_finish(struct spdk_vhost_blk_task *task)
 {
-       assert(task->bvdev->vdev.task_cnt > 0);
-       task->bvdev->vdev.task_cnt--;
+       assert(task->bvsession->vsession.task_cnt > 0);
+       task->bvsession->vsession.task_cnt--;
        task->used = false;
 }
 
@@ -97,7 +105,7 @@ invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
                *task->status = status;
        }
 
-       spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
+       spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
                                        task->used_len);
        blk_task_finish(task);
        SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
@@ -111,20 +119,24 @@ invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
  *   FIXME: Make this function return to rd_cnt and wr_cnt
  */
 static int
-blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
-              struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
+blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
+              uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
 {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
+       struct spdk_vhost_dev *vdev = vsession->vdev;
        struct vring_desc *desc, *desc_table;
        uint16_t out_cnt = 0, cnt = 0;
        uint32_t desc_table_size, len = 0;
+       uint32_t desc_handled_cnt;
        int rc;
 
-       rc = spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc, &desc_table, &desc_table_size);
+       rc = spdk_vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
        if (rc != 0) {
                SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
                return -1;
        }
 
+       desc_handled_cnt = 0;
        while (1) {
                /*
                 * Maximum cnt reached?
@@ -136,7 +148,7 @@ blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uin
                        return -1;
                }
 
-               if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &cnt, desc))) {
+               if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
                        SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
                                      req_idx, cnt);
                        return -1;
@@ -154,6 +166,14 @@ blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uin
                } else if (desc == NULL) {
                        break;
                }
+
+               desc_handled_cnt++;
+               if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
+                       /* Break a cycle and report an error, if any. */
+                       SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
+                                   vdev->name, desc_table_size, desc_handled_cnt);
+                       return -1;
+               }
        }
 
        /*
@@ -174,7 +194,7 @@ static void
 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
 {
        *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
-       spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
+       spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
                                        task->used_len);
        SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
                      task->req_idx, success ? "OK" : "FAIL");
@@ -196,7 +216,7 @@ blk_request_resubmit(void *arg)
        struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
        int rc = 0;
 
-       rc = process_blk_request(task, task->bvdev, task->vq);
+       rc = process_blk_request(task, task->bvsession, task->vq);
        if (rc == 0) {
                SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
        } else {
@@ -208,14 +228,14 @@ static inline void
 blk_request_queue_io(struct spdk_vhost_blk_task *task)
 {
        int rc;
-       struct spdk_vhost_blk_dev *bvdev = task->bvdev;
-       struct spdk_bdev *bdev = bvdev->bdev;
+       struct spdk_vhost_blk_session *bvsession = task->bvsession;
+       struct spdk_bdev *bdev = bvsession->bvdev->bdev;
 
        task->bdev_io_wait.bdev = bdev;
        task->bdev_io_wait.cb_fn = blk_request_resubmit;
        task->bdev_io_wait.cb_arg = task;
 
-       rc = spdk_bdev_queue_io_wait(bdev, bvdev->bdev_io_channel, &task->bdev_io_wait);
+       rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
        if (rc != 0) {
                SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
                invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
@@ -223,16 +243,20 @@ blk_request_queue_io(struct spdk_vhost_blk_task *task)
 }
 
 static int
-process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
+process_blk_request(struct spdk_vhost_blk_task *task,
+                   struct spdk_vhost_blk_session *bvsession,
                    struct spdk_vhost_virtqueue *vq)
 {
+       struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
        const struct virtio_blk_outhdr *req;
+       struct virtio_blk_discard_write_zeroes *desc;
        struct iovec *iov;
        uint32_t type;
        uint32_t payload_len;
+       uint64_t flush_bytes;
        int rc;
 
-       if (blk_iovs_setup(&bvdev->vdev, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
+       if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
                SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
                /* Only READ and WRITE are supported for now. */
                invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
@@ -272,7 +296,7 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
        switch (type) {
        case VIRTIO_BLK_T_IN:
        case VIRTIO_BLK_T_OUT:
-               if (spdk_unlikely((payload_len & (512 - 1)) != 0)) {
+               if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
                        SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
                                    type ? "WRITE" : "READ", task->req_idx);
                        invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
@@ -281,12 +305,12 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
 
                if (type == VIRTIO_BLK_T_IN) {
                        task->used_len = payload_len + sizeof(*task->status);
-                       rc = spdk_bdev_readv(bvdev->bdev_desc, bvdev->bdev_io_channel,
+                       rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
                                             &task->iovs[1], task->iovcnt, req->sector * 512,
                                             payload_len, blk_request_complete_cb, task);
                } else if (!bvdev->readonly) {
                        task->used_len = sizeof(*task->status);
-                       rc = spdk_bdev_writev(bvdev->bdev_desc, bvdev->bdev_io_channel,
+                       rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
                                              &task->iovs[1], task->iovcnt, req->sector * 512,
                                              payload_len, blk_request_complete_cb, task);
                } else {
@@ -304,6 +328,75 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
                        }
                }
                break;
+       case VIRTIO_BLK_T_DISCARD:
+               desc = task->iovs[1].iov_base;
+               if (payload_len != sizeof(*desc)) {
+                       SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
+                       invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                       return -1;
+               }
+
+               rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
+                                    desc->sector * 512, desc->num_sectors * 512,
+                                    blk_request_complete_cb, task);
+               if (rc) {
+                       if (rc == -ENOMEM) {
+                               SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+                               blk_request_queue_io(task);
+                       } else {
+                               invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                               return -1;
+                       }
+               }
+               break;
+       case VIRTIO_BLK_T_WRITE_ZEROES:
+               desc = task->iovs[1].iov_base;
+               if (payload_len != sizeof(*desc)) {
+                       SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
+                       invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                       return -1;
+               }
+
+               /* Zeroing and unmapping the range in one request is not supported by SPDK. */
+               if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+                       SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
+                       invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+                       return -1;
+               }
+
+               rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
+                                           desc->sector * 512, desc->num_sectors * 512,
+                                           blk_request_complete_cb, task);
+               if (rc) {
+                       if (rc == -ENOMEM) {
+                               SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+                               blk_request_queue_io(task);
+                       } else {
+                               invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                               return -1;
+                       }
+               }
+               break;
+       case VIRTIO_BLK_T_FLUSH:
+               flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
+               if (req->sector != 0) {
+                       SPDK_NOTICELOG("sector must be zero for flush command\n");
+                       invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                       return -1;
+               }
+               rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
+                                    0, flush_bytes,
+                                    blk_request_complete_cb, task);
+               if (rc) {
+                       if (rc == -ENOMEM) {
+                               SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+                               blk_request_queue_io(task);
+                       } else {
+                               invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+                               return -1;
+                       }
+               }
+               break;
        case VIRTIO_BLK_T_GET_ID:
                if (!task->iovcnt || !payload_len) {
                        invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
@@ -324,9 +417,11 @@ process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev
 }
 
 static void
-process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
+process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
 {
+       struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
        struct spdk_vhost_blk_task *task;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
        int rc;
        uint16_t reqs[32];
        uint16_t reqs_cnt, i;
@@ -343,7 +438,7 @@ process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
                if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
                        SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
                                    bvdev->vdev.name, reqs[i], vq->vring.size);
-                       spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
+                       spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
                        continue;
                }
 
@@ -351,18 +446,18 @@ process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
                if (spdk_unlikely(task->used)) {
                        SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
                                    bvdev->vdev.name, reqs[i]);
-                       spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
+                       spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
                        continue;
                }
 
-               bvdev->vdev.task_cnt++;
+               vsession->task_cnt++;
 
                task->used = true;
                task->iovcnt = SPDK_COUNTOF(task->iovs);
                task->status = NULL;
                task->used_len = 0;
 
-               rc = process_blk_request(task, bvdev, vq);
+               rc = process_blk_request(task, bvsession, vq);
                if (rc == 0) {
                        SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
                                      reqs[i]);
@@ -375,21 +470,24 @@ process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
 static int
 vdev_worker(void *arg)
 {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       struct spdk_vhost_blk_session *bvsession = arg;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
+
        uint16_t q_idx;
 
-       for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
-               process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
+       for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+               process_vq(bvsession, &vsession->virtqueue[q_idx]);
        }
 
-       spdk_vhost_dev_used_signal(&bvdev->vdev);
+       spdk_vhost_session_used_signal(vsession);
 
        return -1;
 }
 
 static void
-no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
+no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
 {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
        struct iovec iovs[SPDK_VHOST_IOVS_MAX];
        uint32_t length;
        uint16_t iovcnt, req_idx;
@@ -399,34 +497,50 @@ no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue
        }
 
        iovcnt = SPDK_COUNTOF(iovs);
-       if (blk_iovs_setup(&bvdev->vdev, vq, req_idx, iovs, &iovcnt, &length) == 0) {
+       if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
                *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
                SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
        }
 
-       spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, req_idx, 0);
+       spdk_vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
 }
 
 static int
 no_bdev_vdev_worker(void *arg)
 {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       struct spdk_vhost_blk_session *bvsession = arg;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
        uint16_t q_idx;
 
-       for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
-               no_bdev_process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
+       for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+               no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
        }
 
-       spdk_vhost_dev_used_signal(&bvdev->vdev);
+       spdk_vhost_session_used_signal(vsession);
 
-       if (bvdev->vdev.task_cnt == 0 && bvdev->bdev_io_channel) {
-               spdk_put_io_channel(bvdev->bdev_io_channel);
-               bvdev->bdev_io_channel = NULL;
+       if (vsession->task_cnt == 0 && bvsession->io_channel) {
+               spdk_put_io_channel(bvsession->io_channel);
+               bvsession->io_channel = NULL;
        }
 
        return -1;
 }
 
+static struct spdk_vhost_blk_session *
+to_blk_session(struct spdk_vhost_session *vsession)
+{
+       if (vsession == NULL) {
+               return NULL;
+       }
+
+       if (vsession->vdev->backend != &vhost_blk_device_backend) {
+               SPDK_ERRLOG("%s: not a vhost-blk device\n", vsession->vdev->name);
+               return NULL;
+       }
+
+       return (struct spdk_vhost_blk_session *)vsession;
+}
+
 static struct spdk_vhost_blk_dev *
 to_blk_dev(struct spdk_vhost_dev *vdev)
 {
@@ -452,20 +566,34 @@ spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
 }
 
 static int
-_bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg)
+_spdk_vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, struct spdk_vhost_session *vsession,
+                                  void *ctx)
 {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       struct spdk_vhost_blk_session *bvsession;
 
-       SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
-                    bvdev->vdev.name);
-       if (bvdev->requestq_poller) {
-               spdk_poller_unregister(&bvdev->requestq_poller);
-               bvdev->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvdev, 0);
+       if (vdev == NULL) {
+               /* Nothing to do */
+               return 0;
+       }
+
+       if (vsession == NULL) {
+               /* All sessions have been notified, time to close the bdev */
+               struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+
+               assert(bvdev != NULL);
+
+               spdk_bdev_close(bvdev->bdev_desc);
+               bvdev->bdev_desc = NULL;
+               bvdev->bdev = NULL;
+               return 0;
+       }
+
+       bvsession = (struct spdk_vhost_blk_session *)vsession;
+       if (bvsession->requestq_poller) {
+               spdk_poller_unregister(&bvsession->requestq_poller);
+               bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
        }
 
-       spdk_bdev_close(bvdev->bdev_desc);
-       bvdev->bdev_desc = NULL;
-       bvdev->bdev = NULL;
        return 0;
 }
 
@@ -474,37 +602,45 @@ bdev_remove_cb(void *remove_ctx)
 {
        struct spdk_vhost_blk_dev *bvdev = remove_ctx;
 
-       spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev);
+       SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
+                    bvdev->vdev.name);
+
+       spdk_vhost_lock();
+       spdk_vhost_dev_foreach_session(&bvdev->vdev, _spdk_vhost_session_bdev_remove_cb, NULL);
+       spdk_vhost_unlock();
 }
 
 static void
-free_task_pool(struct spdk_vhost_blk_dev *bvdev)
+free_task_pool(struct spdk_vhost_blk_session *bvsession)
 {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
        struct spdk_vhost_virtqueue *vq;
        uint16_t i;
 
-       for (i = 0; i < bvdev->vdev.max_queues; i++) {
-               vq = &bvdev->vdev.virtqueue[i];
+       for (i = 0; i < vsession->max_queues; i++) {
+               vq = &vsession->virtqueue[i];
                if (vq->tasks == NULL) {
                        continue;
                }
 
-               spdk_dma_free(vq->tasks);
+               spdk_free(vq->tasks);
                vq->tasks = NULL;
        }
 }
 
 static int
-alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
+alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
 {
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
+       struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
        struct spdk_vhost_virtqueue *vq;
        struct spdk_vhost_blk_task *task;
        uint32_t task_cnt;
        uint16_t i;
        uint32_t j;
 
-       for (i = 0; i < bvdev->vdev.max_queues; i++) {
-               vq = &bvdev->vdev.virtqueue[i];
+       for (i = 0; i < vsession->max_queues; i++) {
+               vq = &vsession->virtqueue[i];
                if (vq->vring.desc == NULL) {
                        continue;
                }
@@ -514,21 +650,22 @@ alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
                        /* sanity check */
                        SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
                                    bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
-                       free_task_pool(bvdev);
+                       free_task_pool(bvsession);
                        return -1;
                }
-               vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
-                                            SPDK_CACHE_LINE_SIZE, NULL);
+               vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
+                                        SPDK_CACHE_LINE_SIZE, NULL,
+                                        SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
                if (vq->tasks == NULL) {
                        SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
                                    bvdev->vdev.name, task_cnt, i);
-                       free_task_pool(bvdev);
+                       free_task_pool(bvsession);
                        return -1;
                }
 
                for (j = 0; j < task_cnt; j++) {
                        task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
-                       task->bvdev = bvdev;
+                       task->bvsession = bvsession;
                        task->req_idx = j;
                        task->vq = vq;
                }
@@ -537,109 +674,140 @@ alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
        return 0;
 }
 
-/*
- * A new device is added to a data core. First the device is added to the main linked list
- * and then allocated to a specific data core.
- *
- */
 static int
-spdk_vhost_blk_start(struct spdk_vhost_dev *vdev, void *event_ctx)
+spdk_vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
+                       struct spdk_vhost_session *vsession, void *unused)
 {
        struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_vhost_blk_session *bvsession;
        int i, rc = 0;
 
-       bvdev = to_blk_dev(vdev);
-       if (bvdev == NULL) {
+       bvsession = to_blk_session(vsession);
+       if (bvsession == NULL) {
                SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
                rc = -1;
                goto out;
        }
 
+       bvdev = to_blk_dev(vdev);
+       assert(bvdev != NULL);
+       bvsession->bvdev = bvdev;
+
        /* validate all I/O queues are in a contiguous index range */
-       for (i = 0; i < vdev->max_queues; i++) {
-               if (vdev->virtqueue[i].vring.desc == NULL) {
+       for (i = 0; i < vsession->max_queues; i++) {
+               if (vsession->virtqueue[i].vring.desc == NULL) {
                        SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
                        rc = -1;
                        goto out;
                }
        }
 
-       rc = alloc_task_pool(bvdev);
+       rc = alloc_task_pool(bvsession);
        if (rc != 0) {
                SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
                goto out;
        }
 
        if (bvdev->bdev) {
-               bvdev->bdev_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
-               if (!bvdev->bdev_io_channel) {
-                       free_task_pool(bvdev);
+               bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
+               if (!bvsession->io_channel) {
+                       free_task_pool(bvsession);
                        SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
                        rc = -1;
                        goto out;
                }
        }
 
-       bvdev->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
-                                bvdev, 0);
+       bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
+                                    bvsession, 0);
        SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
-                    vdev->name, vdev->lcore);
+                    vdev->name, spdk_env_get_current_core());
 out:
-       spdk_vhost_dev_backend_event_done(event_ctx, rc);
+       spdk_vhost_session_start_done(vsession, rc);
        return rc;
 }
 
 static int
-destroy_device_poller_cb(void *arg)
+spdk_vhost_blk_start(struct spdk_vhost_session *vsession)
 {
-       struct spdk_vhost_blk_dev *bvdev = arg;
+       int32_t lcore;
+       int rc;
+
+       lcore = spdk_vhost_allocate_reactor(vsession->vdev->cpumask);
+       rc = spdk_vhost_session_send_event(lcore, vsession, spdk_vhost_blk_start_cb,
+                                          3, "start session");
+
+       if (rc != 0) {
+               spdk_vhost_free_reactor(lcore);
+       }
+
+       return rc;
+}
+
+static int
+destroy_session_poller_cb(void *arg)
+{
+       struct spdk_vhost_blk_session *bvsession = arg;
+       struct spdk_vhost_session *vsession = &bvsession->vsession;
        int i;
 
-       if (bvdev->vdev.task_cnt > 0) {
+       if (vsession->task_cnt > 0) {
+               return -1;
+       }
+
+       if (spdk_vhost_trylock() != 0) {
                return -1;
        }
 
-       for (i = 0; i < bvdev->vdev.max_queues; i++) {
-               bvdev->vdev.virtqueue[i].next_event_time = 0;
-               spdk_vhost_vq_used_signal(&bvdev->vdev, &bvdev->vdev.virtqueue[i]);
+       for (i = 0; i < vsession->max_queues; i++) {
+               vsession->virtqueue[i].next_event_time = 0;
+               spdk_vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
        }
 
-       SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", bvdev->vdev.name);
+       SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", vsession->vdev->name);
 
-       if (bvdev->bdev_io_channel) {
-               spdk_put_io_channel(bvdev->bdev_io_channel);
-               bvdev->bdev_io_channel = NULL;
+       if (bvsession->io_channel) {
+               spdk_put_io_channel(bvsession->io_channel);
+               bvsession->io_channel = NULL;
        }
 
-       free_task_pool(bvdev);
-       spdk_poller_unregister(&bvdev->destroy_ctx.poller);
-       spdk_vhost_dev_backend_event_done(bvdev->destroy_ctx.event_ctx, 0);
+       free_task_pool(bvsession);
+       spdk_poller_unregister(&bvsession->stop_poller);
+       spdk_vhost_session_stop_done(vsession, 0);
 
+       spdk_vhost_unlock();
        return -1;
 }
 
 static int
-spdk_vhost_blk_stop(struct spdk_vhost_dev *vdev, void *event_ctx)
+spdk_vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
+                      struct spdk_vhost_session *vsession, void *unused)
 {
-       struct spdk_vhost_blk_dev *bvdev;
+       struct spdk_vhost_blk_session *bvsession;
 
-       bvdev = to_blk_dev(vdev);
-       if (bvdev == NULL) {
+       bvsession = to_blk_session(vsession);
+       if (bvsession == NULL) {
                SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
                goto err;
        }
 
-       bvdev->destroy_ctx.event_ctx = event_ctx;
-       spdk_poller_unregister(&bvdev->requestq_poller);
-       bvdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb,
-                                   bvdev, 1000);
+       spdk_poller_unregister(&bvsession->requestq_poller);
+       bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
+                                bvsession, 1000);
        return 0;
 
 err:
-       spdk_vhost_dev_backend_event_done(event_ctx, -1);
+       spdk_vhost_session_stop_done(vsession, -1);
        return -1;
 }
 
+static int
+spdk_vhost_blk_stop(struct spdk_vhost_session *vsession)
+{
+       return spdk_vhost_session_send_event(vsession->lcore, vsession,
+                                            spdk_vhost_blk_stop_cb, 3, "stop session");
+}
+
 static void
 spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
 {
@@ -652,11 +820,9 @@ spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_writ
        }
 
        assert(bvdev != NULL);
-       spdk_json_write_name(w, "block");
-       spdk_json_write_object_begin(w);
+       spdk_json_write_named_object_begin(w, "block");
 
-       spdk_json_write_name(w, "readonly");
-       spdk_json_write_bool(w, bvdev->readonly);
+       spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
 
        spdk_json_write_name(w, "bdev");
        if (bdev) {
@@ -701,7 +867,7 @@ static int
 spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
                          uint32_t len)
 {
-       struct virtio_blk_config *blkcfg = (struct virtio_blk_config *)config;
+       struct virtio_blk_config blkcfg;
        struct spdk_vhost_blk_dev *bvdev;
        struct spdk_bdev *bdev;
        uint32_t blk_size;
@@ -713,10 +879,6 @@ spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
                return -1;
        }
 
-       if (len < sizeof(*blkcfg)) {
-               return -1;
-       }
-
        bdev = bvdev->bdev;
        if (bdev == NULL) {
                /* We can't just return -1 here as this GET_CONFIG message might
@@ -735,19 +897,37 @@ spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
        } else {
                blk_size = spdk_bdev_get_block_size(bdev);
                blkcnt = spdk_bdev_get_num_blocks(bdev);
        }
 
-       memset(blkcfg, 0, sizeof(*blkcfg));
-       blkcfg->blk_size = blk_size;
+       memset(&blkcfg, 0, sizeof(blkcfg));
+       /* Set after the memset. seg_max: -2 for REQ and RESP, -1 for region boundary splitting */
+       if (bdev && spdk_bdev_get_buf_align(bdev) > 1) {
+               blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
+               blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
+       } else if (bdev) {
+               blkcfg.size_max = 131072;
+               blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
+       }
+       blkcfg.blk_size = blk_size;
        /* minimum I/O size in blocks */
-       blkcfg->min_io_size = 1;
+       blkcfg.min_io_size = 1;
        /* expressed in 512 Bytes sectors */
-       blkcfg->capacity = (blkcnt * blk_size) / 512;
-       blkcfg->size_max = 131072;
-       /*  -2 for REQ and RESP and -1 for region boundary splitting */
-       blkcfg->seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
+       blkcfg.capacity = (blkcnt * blk_size) / 512;
        /* QEMU can overwrite this value when started */
-       blkcfg->num_queues = SPDK_VHOST_MAX_VQUEUES;
+       blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
+
+       if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+               /* 16MiB, expressed in 512 Bytes */
+               blkcfg.max_discard_sectors = 32768;
+               blkcfg.max_discard_seg = 1;
+               blkcfg.discard_sector_alignment = blk_size / 512;
+       }
+       if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+               blkcfg.max_write_zeroes_sectors = 32768;
+               blkcfg.max_write_zeroes_seg = 1;
+       }
+
+       memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
 
        return 0;
 }
@@ -759,12 +939,15 @@ static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
        (1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
        (1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
        (1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
-       (1ULL << VIRTIO_BLK_F_MQ),
+       (1ULL << VIRTIO_BLK_F_MQ)       | (1ULL << VIRTIO_BLK_F_DISCARD) |
+       (1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
        .disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
        (1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
-       (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI),
-       .start_device =  spdk_vhost_blk_start,
-       .stop_device = spdk_vhost_blk_stop,
+       (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
+       (1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
+       .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
+       .start_session =  spdk_vhost_blk_start,
+       .stop_session = spdk_vhost_blk_stop,
        .vhost_get_config = spdk_vhost_blk_get_config,
        .dump_info_json = spdk_vhost_blk_dump_info_json,
        .write_config_json = spdk_vhost_blk_write_config_json,
@@ -819,6 +1002,7 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
 {
        struct spdk_vhost_blk_dev *bvdev = NULL;
        struct spdk_bdev *bdev;
+       uint64_t features = 0;
        int ret = 0;
 
        spdk_vhost_lock();
@@ -830,7 +1014,7 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
                goto out;
        }
 
-       bvdev = spdk_dma_zmalloc(sizeof(*bvdev), SPDK_CACHE_LINE_SIZE, NULL);
+       bvdev = calloc(1, sizeof(*bvdev));
        if (bvdev == NULL) {
                ret = -ENOMEM;
                goto out;
@@ -851,14 +1035,27 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
                goto out;
        }
 
-       if (readonly && rte_vhost_driver_enable_features(bvdev->vdev.path, (1ULL << VIRTIO_BLK_F_RO))) {
-               SPDK_ERRLOG("Controller %s: failed to set as a readonly\n", name);
-               spdk_bdev_close(bvdev->bdev_desc);
+       if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+               features |= (1ULL << VIRTIO_BLK_F_DISCARD);
+       }
+       if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+               features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
+       }
+       if (readonly) {
+               features |= (1ULL << VIRTIO_BLK_F_RO);
+       }
+       if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
+               features |= (1ULL << VIRTIO_BLK_F_FLUSH);
+       }
+
+       if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
+               SPDK_ERRLOG("Controller %s: failed to enable features 0x%"PRIx64"\n", name, features);
 
                if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
                        SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
                }
 
+               spdk_bdev_close(bvdev->bdev_desc);
                ret = -1;
                goto out;
        }
@@ -866,7 +1063,7 @@ spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_
        SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
 out:
        if (ret != 0 && bvdev) {
-               spdk_dma_free(bvdev);
+               free(bvdev);
        }
        spdk_vhost_unlock();
        return ret;
@@ -893,7 +1090,7 @@ spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
        }
        bvdev->bdev = NULL;
 
-       spdk_dma_free(bvdev);
+       free(bvdev);
        return 0;
 }