#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
+#include "qemu/defer-call.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/cutils.h"
#include "qemu/option.h"
+#include "qemu/memalign.h"
#include "qemu/vfio-helpers.h"
+#include "block/block-io.h"
#include "block/block_int.h"
+#include "sysemu/block-backend.h"
#include "sysemu/replay.h"
#include "trace.h"
int blkshift;
uint64_t max_transfer;
- bool plugged;
bool supports_write_zeroes;
bool supports_discard;
static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
unsigned nentries, size_t entry_bytes, Error **errp)
{
+ ERRP_GUARD();
size_t bytes;
int r;
- bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
+ bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size());
q->head = q->tail = 0;
- q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
+ q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes);
if (!q->queue) {
error_setg(errp, "Cannot allocate queue");
return false;
return r == 0;
}
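+/* Release the ring buffer allocated by nvme_init_queue() */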
+static void nvme_free_queue(NVMeQueue *q)
+{
+ qemu_vfree(q->queue);
+}
+
static void nvme_free_queue_pair(NVMeQueuePair *q)
{
- trace_nvme_free_queue_pair(q->index, q);
+ trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
if (q->completion_bh) {
qemu_bh_delete(q->completion_bh);
}
+ nvme_free_queue(&q->sq);
+ nvme_free_queue(&q->cq);
qemu_vfree(q->prp_list_pages);
- qemu_vfree(q->sq.queue);
- qemu_vfree(q->cq.queue);
qemu_mutex_destroy(&q->lock);
g_free(q);
}
NVMeQueuePair *q = opaque;
qemu_mutex_lock(&q->lock);
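+    /* Wake waiters only while free requests remain */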
- while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
- /* Retry all pending requests */
+ while (q->free_req_head != -1 &&
+ qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
+ /* Retry waiting requests */
}
qemu_mutex_unlock(&q->lock);
}
unsigned idx, size_t size,
Error **errp)
{
+ ERRP_GUARD();
int i, r;
NVMeQueuePair *q;
uint64_t prp_list_iova;
trace_nvme_create_queue_pair(idx, q, size, aio_context,
event_notifier_get_fd(s->irq_notifier));
bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
-                          qemu_real_host_page_size);
-    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
+                          qemu_real_host_page_size());
+    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes);
if (!q->prp_list_pages) {
error_setg(errp, "Cannot allocate PRP page list");
goto fail;
{
BDRVNVMeState *s = q->s;
- if (s->plugged || !q->need_kick) {
+ if (!q->need_kick) {
return;
}
trace_nvme_kick(s, q->index);
q->need_kick = 0;
}
-/* Find a free request element if any, otherwise:
- * a) if in coroutine context, try to wait for one to become available;
- * b) if not in coroutine, return NULL;
- */
-static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
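+/* With q->lock; the caller has verified that the free request list is non-empty */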
+static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q)
{
NVMeRequest *req;
- qemu_mutex_lock(&q->lock);
-
- while (q->free_req_head == -1) {
- if (qemu_in_coroutine()) {
- trace_nvme_free_req_queue_wait(q->s, q->index);
- qemu_co_queue_wait(&q->free_req_queue, &q->lock);
- } else {
- qemu_mutex_unlock(&q->lock);
- return NULL;
- }
- }
-
req = &q->reqs[q->free_req_head];
q->free_req_head = req->free_req_next;
req->free_req_next = -1;
-
- qemu_mutex_unlock(&q->lock);
return req;
}
+/* Return a free request element if any, otherwise return NULL. */
+static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q)
+{
+ QEMU_LOCK_GUARD(&q->lock);
+ if (q->free_req_head == -1) {
+ return NULL;
+ }
+ return nvme_get_free_req_nofail_locked(q);
+}
+
+/*
+ * Wait for a free request to become available if necessary, then
+ * return it.
+ */
+static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
+{
+ QEMU_LOCK_GUARD(&q->lock);
+
+ while (q->free_req_head == -1) {
+ trace_nvme_free_req_queue_wait(q->s, q->index);
+ qemu_co_queue_wait(&q->free_req_queue, &q->lock);
+ }
+
+ return nvme_get_free_req_nofail_locked(q);
+}
+
/* With q->lock */
static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
{
NvmeCqe *c;
trace_nvme_process_completion(s, q->index, q->inflight);
- if (s->plugged) {
- trace_nvme_process_completion_queue_plugged(s, q->index);
- return false;
- }
/*
* Support re-entrancy when a request cb() function invokes aio_poll().
q->cq_phase = !q->cq_phase;
}
cid = le16_to_cpu(c->cid);
- if (cid == 0 || cid > NVME_QUEUE_SIZE) {
- warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
- "queue size: %u", cid, NVME_QUEUE_SIZE);
+ if (cid == 0 || cid > NVME_NUM_REQS) {
+ warn_report("NVMe: Unexpected CID in completion queue: %" PRIu32
+ ", should be within: 1..%u inclusively", cid,
+ NVME_NUM_REQS);
continue;
}
trace_nvme_complete_command(s, q->index, cid);
}
}
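+/* Called via defer_call(): ring the SQ doorbell and reap any completions */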
+static void nvme_deferred_fn(void *opaque)
+{
+ NVMeQueuePair *q = opaque;
+
+ QEMU_LOCK_GUARD(&q->lock);
+ nvme_kick(q);
+ nvme_process_completion(q);
+}
+
static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
NvmeCmd *cmd, BlockCompletionFunc cb,
void *opaque)
q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
q->need_kick++;
- nvme_kick(q);
- nvme_process_completion(q);
qemu_mutex_unlock(&q->lock);
+
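+    /* May be deferred until defer_call_end() to batch doorbell writes */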
+ defer_call(nvme_deferred_fn, q);
}
static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
AioContext *aio_context = bdrv_get_aio_context(bs);
NVMeRequest *req;
int ret = -EINPROGRESS;
- req = nvme_get_free_req(q);
+ req = nvme_get_free_req_nowait(q);
if (!req) {
return -EBUSY;
}
/* Returns true on success, false on failure. */
static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
+ ERRP_GUARD();
BDRVNVMeState *s = bs->opaque;
bool ret = false;
- union {
+ QEMU_AUTO_VFREE union {
NvmeIdCtrl ctrl;
NvmeIdNs ns;
- } *id;
+ } *id = NULL;
NvmeLBAF *lbaf;
uint16_t oncs;
int r;
.opcode = NVME_ADM_CMD_IDENTIFY,
.cdw10 = cpu_to_le32(0x1),
};
-    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
-    id = qemu_try_memalign(qemu_real_host_page_size, id_size);
+    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size());
+    id = qemu_try_memalign(qemu_real_host_page_size(), id_size);
if (!id) {
error_setg(errp, "Cannot allocate buffer for identify response");
goto out;
s->blkshift = lbaf->ds;
out:
qemu_vfio_dma_unmap(s->vfio, id);
- qemu_vfree(id);
return ret;
}
-static bool nvme_poll_queue(NVMeQueuePair *q)
+static void nvme_poll_queue(NVMeQueuePair *q)
{
- bool progress = false;
-
const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
* cannot race with itself.
*/
if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
- return false;
+ return;
}
qemu_mutex_lock(&q->lock);
while (nvme_process_completion(q)) {
/* Keep polling */
- progress = true;
}
qemu_mutex_unlock(&q->lock);
-
- return progress;
}
-static bool nvme_poll_queues(BDRVNVMeState *s)
+static void nvme_poll_queues(BDRVNVMeState *s)
{
- bool progress = false;
int i;
for (i = 0; i < s->queue_count; i++) {
- if (nvme_poll_queue(s->queues[i])) {
- progress = true;
- }
+ nvme_poll_queue(s->queues[i]);
}
- return progress;
}
static void nvme_handle_event(EventNotifier *n)
EventNotifier *e = opaque;
BDRVNVMeState *s = container_of(e, BDRVNVMeState,
irq_notifier[MSIX_SHARED_IRQ_IDX]);
+ int i;
- return nvme_poll_queues(s);
+ for (i = 0; i < s->queue_count; i++) {
+ NVMeQueuePair *q = s->queues[i];
+ const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+ NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+
+ /*
+ * q->lock isn't needed because nvme_process_completion() only runs in
+ * the event loop thread and cannot race with itself.
+ */
+ if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) {
+ return true;
+ }
+ }
+ return false;
+}
+
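+/* Poll-ready handler: process the completions detected by nvme_poll_cb() */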
+static void nvme_poll_ready(EventNotifier *e)
+{
+ BDRVNVMeState *s = container_of(e, BDRVNVMeState,
+ irq_notifier[MSIX_SHARED_IRQ_IDX]);
+
+ nvme_poll_queues(s);
}
static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
}
aio_set_event_notifier(bdrv_get_aio_context(bs),
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
- false, nvme_handle_event, nvme_poll_cb);
+ nvme_handle_event, nvme_poll_cb,
+ nvme_poll_ready);
if (!nvme_identify(bs, namespace, errp)) {
ret = -EIO;
g_free(s->queues);
aio_set_event_notifier(bdrv_get_aio_context(bs),
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
- false, NULL, NULL);
+ NULL, NULL, NULL);
event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
return ret;
}
-static int64_t nvme_getlength(BlockDriverState *bs)
+static int64_t coroutine_fn nvme_co_getlength(BlockDriverState *bs)
{
BDRVNVMeState *s = bs->opaque;
return s->nsze << s->blkshift;
bool retry = true;
uint64_t iova;
size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
- qemu_real_host_page_size);
+ qemu_real_host_page_size());
try_map:
r = qemu_vfio_dma_map(s->vfio,
qiov->iov[i].iov_base,
for (i = 0; i < qiov->niov; ++i) {
if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
- qemu_real_host_page_size) ||
- !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
+ qemu_real_host_page_size()) ||
+ !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) {
trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
qiov->iov[i].iov_len, s->page_size);
return false;
return true;
}
-static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
- QEMUIOVector *qiov, bool is_write, int flags)
+static coroutine_fn int nvme_co_prw(BlockDriverState *bs,
+ uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov, bool is_write,
+ int flags)
{
BDRVNVMeState *s = bs->opaque;
int r;
- uint8_t *buf = NULL;
+ QEMU_AUTO_VFREE uint8_t *buf = NULL;
QEMUIOVector local_qiov;
- size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
+ size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size());
assert(QEMU_IS_ALIGNED(offset, s->page_size));
assert(QEMU_IS_ALIGNED(bytes, s->page_size));
assert(bytes <= s->max_transfer);
}
s->stats.unaligned_accesses++;
trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
- buf = qemu_try_memalign(qemu_real_host_page_size, len);
+ buf = qemu_try_memalign(qemu_real_host_page_size(), len);
if (!buf) {
return -ENOMEM;
if (!r && !is_write) {
qemu_iovec_from_buf(qiov, 0, buf, bytes);
}
- qemu_vfree(buf);
return r;
}
static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
int64_t offset,
- int bytes,
+ int64_t bytes,
BdrvRequestFlags flags)
{
BDRVNVMeState *s = bs->opaque;
NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
NVMeRequest *req;
-
- uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
+ uint32_t cdw12;
if (!s->supports_write_zeroes) {
return -ENOTSUP;
}
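+    /* NLB (cdw12) is a 0-based block count, so zero bytes cannot be encoded */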
+ if (bytes == 0) {
+ return 0;
+ }
+
+ cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
+ /*
+     * We should not lose information: pwrite_zeroes_alignment and
+     * max_pwrite_zeroes guarantee it.
+ */
+ assert(((cdw12 + 1) << s->blkshift) == bytes);
+
NvmeCmd cmd = {
.opcode = NVME_CMD_WRITE_ZEROES,
.nsid = cpu_to_le32(s->nsid),
static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
int64_t offset,
- int bytes)
+ int64_t bytes)
{
BDRVNVMeState *s = bs->opaque;
NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
NVMeRequest *req;
- NvmeDsmRange *buf;
+ QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
QEMUIOVector local_qiov;
int ret;
assert(s->queue_count > 1);
+ /*
+ * Filling the @buf requires @offset and @bytes to satisfy restrictions
+ * defined in nvme_refresh_limits().
+ */
+ assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift));
+ assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift));
+ assert((bytes >> s->blkshift) <= UINT32_MAX);
+
buf = qemu_try_memalign(s->page_size, s->page_size);
if (!buf) {
return -ENOMEM;
trace_nvme_dsm_done(s, offset, bytes, ret);
out:
qemu_iovec_destroy(&local_qiov);
- qemu_vfree(buf);
return ret;
}
return -ENOTSUP;
}
- cur_length = nvme_getlength(bs);
+ cur_length = nvme_co_getlength(bs);
if (offset != cur_length && exact) {
error_setg(errp, "Cannot resize NVMe devices");
return -ENOTSUP;
bs->bl.opt_mem_alignment = s->page_size;
bs->bl.request_alignment = s->page_size;
bs->bl.max_transfer = s->max_transfer;
+
+ /*
+     * Look at nvme_co_pwrite_zeroes: after the shift and decrement we must
+     * get at most 0xFFFF.
+ */
+ bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16);
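+    /* e.g. with 512-byte blocks (blkshift == 9) this is 1ULL << 25 = 32 MiB */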
+ bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment,
+ 1UL << s->blkshift);
+
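+    /* The NLB field of an NVMe DSM range descriptor is 32 bits wide */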
+ bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift;
+ bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment,
+ 1UL << s->blkshift);
}
static void nvme_detach_aio_context(BlockDriverState *bs)
aio_set_event_notifier(bdrv_get_aio_context(bs),
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
- false, NULL, NULL);
+ NULL, NULL, NULL);
}
static void nvme_attach_aio_context(BlockDriverState *bs,
s->aio_context = new_context;
aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
- false, nvme_handle_event, nvme_poll_cb);
+ nvme_handle_event, nvme_poll_cb,
+ nvme_poll_ready);
for (unsigned i = 0; i < s->queue_count; i++) {
NVMeQueuePair *q = s->queues[i];
}
}
-static void nvme_aio_plug(BlockDriverState *bs)
-{
- BDRVNVMeState *s = bs->opaque;
- assert(!s->plugged);
- s->plugged = true;
-}
-
-static void nvme_aio_unplug(BlockDriverState *bs)
-{
- BDRVNVMeState *s = bs->opaque;
- assert(s->plugged);
- s->plugged = false;
- for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
- NVMeQueuePair *q = s->queues[i];
- qemu_mutex_lock(&q->lock);
- nvme_kick(q);
- nvme_process_completion(q);
- qemu_mutex_unlock(&q->lock);
- }
-}
-
-static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
+static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
+ Error **errp)
{
int ret;
- Error *local_err = NULL;
BDRVNVMeState *s = bs->opaque;
- ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, &local_err);
- if (ret) {
- /* FIXME: we may run out of IOVA addresses after repeated
- * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
- * doesn't reclaim addresses for fixed mappings. */
- error_reportf_err(local_err, "nvme_register_buf failed: ");
- }
+ /*
+ * FIXME: we may run out of IOVA addresses after repeated
+ * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
+ * doesn't reclaim addresses for fixed mappings.
+ */
+ ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp);
+ return ret == 0;
}
-static void nvme_unregister_buf(BlockDriverState *bs, void *host)
+static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
BDRVNVMeState *s = bs->opaque;
.bdrv_parse_filename = nvme_parse_filename,
.bdrv_file_open = nvme_file_open,
.bdrv_close = nvme_close,
- .bdrv_getlength = nvme_getlength,
+ .bdrv_co_getlength = nvme_co_getlength,
.bdrv_probe_blocksizes = nvme_probe_blocksizes,
.bdrv_co_truncate = nvme_co_truncate,
.bdrv_detach_aio_context = nvme_detach_aio_context,
.bdrv_attach_aio_context = nvme_attach_aio_context,
- .bdrv_io_plug = nvme_aio_plug,
- .bdrv_io_unplug = nvme_aio_unplug,
-
.bdrv_register_buf = nvme_register_buf,
.bdrv_unregister_buf = nvme_unregister_buf,
};