* Notes on coding style
* ---------------------
* While QEMU coding style prefers lowercase hexadecimals in constants, the
- * NVMe subsystem use thes format from the NVMe specifications in the comments
+ * NVMe subsystem uses the format from the NVMe specifications in the comments
* (i.e. 'h' suffix instead of '0x' prefix).
*
* Usage
* subsys=<subsys_id>
* -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
* zoned=<true|false[optional]>, \
- * subsys=<subsys_id>,detached=<true|false[optional]>
+ * subsys=<subsys_id>,shared=<true|false[optional]>, \
+ * detached=<true|false[optional]>, \
+ * zoned.zone_size=<N[optional]>, \
+ * zoned.zone_capacity=<N[optional]>, \
+ * zoned.descr_ext_size=<N[optional]>, \
+ * zoned.max_active=<N[optional]>, \
+ * zoned.max_open=<N[optional]>, \
+ * zoned.cross_read=<true|false[optional]>
*
* Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
* offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
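 *
 * As an illustrative (not normative) example, a zoned namespace attached to
 * a subsystem could be configured along the following lines; the ids, image
 * name and sizes are made up for this note:
 *
 *      -device nvme-subsys,id=subsys0
 *      -device nvme,id=nvme0,serial=deadbeef,subsys=subsys0
 *      -drive id=nvm-1,file=zns.img,format=raw,if=none
 *      -device nvme-ns,drive=nvm-1,bus=nvme0,nsid=1,zoned=true, \
 *              zoned.zone_size=64M,zoned.zone_capacity=48M, \
 *              zoned.max_open=16,zoned.max_active=32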
}
/*
- * When metadata is transfered as extended LBAs, the DPTR mapped into `sg`
+ * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
* holds both data and metadata. This function splits the data and metadata
* into two separate QSG/IOVs.
*/
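In isolation, the split is a strided de-interleave: each extended LBA carries lbasz data bytes followed by ms metadata bytes. A minimal standalone sketch on plain buffers (the real code operates on the QSG/IOV pair in req->sg; the names below are invented for this note):

    static void split_extended_lbas(const uint8_t *ext, size_t nlb,
                                    size_t lbasz, size_t ms,
                                    uint8_t *data, uint8_t *mdata)
    {
        for (size_t i = 0; i < nlb; i++) {
            const uint8_t *src = ext + i * (lbasz + ms);

            memcpy(data + i * lbasz, src, lbasz);    /* data portion */
            memcpy(mdata + i * ms, src + lbasz, ms); /* trailing metadata */
        }
    }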
len -= trans_len;
if (len) {
if (len > n->page_size) {
- uint64_t prp_list[n->max_prp_ents];
+ g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
uint32_t nents, prp_trans;
int i = 0;
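The g_autofree change swaps an unbounded, max_prp_ents-sized variable-length array on the stack for a heap allocation that GLib frees automatically when the pointer goes out of scope. The pattern in isolation (arbitrary count, illustrative only):

    void example(size_t n)
    {
        g_autofree uint64_t *entries = g_new(uint64_t, n);

        /* ... use entries[0 .. n-1] ... */
    }   /* entries is handed to g_free() automatically on scope exit */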
* descriptors and segment chain) than the command transfer size, so it is
* not bounded by MDTS.
*/
- const int SEG_CHUNK_SIZE = 256;
+#define SEG_CHUNK_SIZE 256
NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
uint64_t nsgld;
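Making SEG_CHUNK_SIZE a preprocessor constant keeps segment[] a true fixed-size array. The surrounding logic consumes a possibly very long SGL segment in batches of at most SEG_CHUNK_SIZE descriptors; roughly, and with a hypothetical read_sgl_descriptors() helper standing in for the real nvme_addr_read()-based copy:

    for (uint64_t done = 0; done < nsgld; done += SEG_CHUNK_SIZE) {
        uint64_t chunk = MIN(nsgld - done, SEG_CHUNK_SIZE);

        /* hypothetical helper: fetch `chunk` descriptors starting at `done` */
        read_sgl_descriptors(addr + done * sizeof(NvmeSglDescriptor),
                             segment, chunk);

        for (uint64_t i = 0; i < chunk; i++) {
            /* map segment[i] into req->sg, honouring Last Segment rules */
        }
    }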
}
static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
- BlockCompletionFunc *cb, NvmeRequest *req)
+ uint32_t align, BlockCompletionFunc *cb,
+ NvmeRequest *req)
{
assert(req->sg.flags & NVME_SG_ALLOC);
if (req->sg.flags & NVME_SG_DMA) {
- req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
- cb, req);
+ req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
} else {
req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
}
}
static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
- BlockCompletionFunc *cb, NvmeRequest *req)
+ uint32_t align, BlockCompletionFunc *cb,
+ NvmeRequest *req)
{
assert(req->sg.flags & NVME_SG_ALLOC);
if (req->sg.flags & NVME_SG_DMA) {
- req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
- cb, req);
+ req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
} else {
req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
}
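The new align argument is forwarded to dma_blk_read()/dma_blk_write() as the DMA alignment instead of the hard-coded BDRV_SECTOR_SIZE. As the later hunks show, callers now choose it per transfer; illustratively:

    /* sector-aligned data I/O (nvme_read()/nvme_write() below) */
    nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);

    /* byte-granular transfer, e.g. the nvme_rw_complete_cb path */
    nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);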
static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
{
- uint32_t v = cpu_to_le32(cq->head);
-
trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
- pci_dma_write(PCI_DEVICE(cq->ctrl), cq->ei_addr, &v, sizeof(v));
+ stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
+ MEMTXATTRS_UNSPECIFIED);
}
static void nvme_update_cq_head(NvmeCQueue *cq)
{
- uint32_t v;
-
- pci_dma_read(PCI_DEVICE(cq->ctrl), cq->db_addr, &v, sizeof(v));
-
- cq->head = le32_to_cpu(v);
+ ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
+ MEMTXATTRS_UNSPECIFIED);
trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
}
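stl_le_pci_dma()/ldl_le_pci_dma() collapse the explicit cpu_to_le32()/le32_to_cpu() plus pci_dma_write()/pci_dma_read() pairs into single calls that also take MemTxAttrs. Roughly, the store side behaves like this hand-rolled sketch (suffixed _sketch to make clear it is not the real helper, which also returns a MemTxResult):

    static inline void stl_le_pci_dma_sketch(PCIDevice *dev, dma_addr_t addr,
                                             uint32_t val)
    {
        uint32_t le = cpu_to_le32(val);

        pci_dma_write(dev, addr, &le, sizeof(le));
    }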
req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
req->cqe.sq_id = cpu_to_le16(sq->sqid);
req->cqe.sq_head = cpu_to_le16(sq->head);
- addr = cq->dma_addr + cq->tail * n->cqe_size;
+ addr = cq->dma_addr + (cq->tail << NVME_CQES);
ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
sizeof(req->cqe));
if (ret) {
case NVME_CMD_WRITE:
case NVME_CMD_WRITE_ZEROES:
case NVME_CMD_ZONE_APPEND:
+ case NVME_CMD_COPY:
status = NVME_WRITE_FAULT;
break;
default:
rw->opcode == NVME_CMD_WRITE_ZEROES;
}
-static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
-{
- return qemu_get_aio_context();
-}
-
static void nvme_misc_cb(void *opaque, int ret)
{
NvmeRequest *req = opaque;
}
if (req->cmd.opcode == NVME_CMD_READ) {
- return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
+ return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
}
- return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
+ return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
}
}
for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
- req->status = NVME_CMP_FAILURE;
+ req->status = NVME_CMP_FAILURE | NVME_DNR;
goto out;
}
}
}
if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
- req->status = NVME_CMP_FAILURE;
+ req->status = NVME_CMP_FAILURE | NVME_DNR;
goto out;
}
}
if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
- req->status = NVME_CMP_FAILURE;
+ req->status = NVME_CMP_FAILURE | NVME_DNR;
goto out;
}
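OR-ing NVME_DNR into the compare-failure status sets the Do Not Retry bit. When the CQE is assembled as (req->status << 1) | cq->phase, that bit lands in bit 15 of the 16-bit status field, which is DNR in the spec's completion layout; assuming NVME_DNR is 0x4000 as in QEMU's nvme headers:

    uint8_t  phase  = 1;                            /* current CQ phase tag */
    uint16_t status = NVME_CMP_FAILURE | NVME_DNR;  /* DNR = 0x4000 (assumed) */
    uint16_t sf     = (status << 1) | phase;        /* DNR -> bit 15, phase -> bit 0 */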
status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
req);
if (status) {
+ g_free(iocb->range);
+ qemu_aio_unref(iocb);
+
return status;
}
}
}
+static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
+ NvmeCopyAIOCB *iocb, uint16_t nr)
+{
+ uint32_t copy_len = 0;
+
+ for (int idx = 0; idx < nr; idx++) {
+ uint32_t nlb;
+ nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
+ &nlb, NULL, NULL, NULL);
+ copy_len += nlb + 1;
+ }
+
+ if (copy_len > ns->id_ns.mcl) {
+ return NVME_CMD_SIZE_LIMIT | NVME_DNR;
+ }
+
+ return NVME_SUCCESS;
+}
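Each range's NLB field is 0's based, hence the nlb + 1. A worked example with made-up numbers:

    /*
     * Illustrative only: with id_ns.mcl = 128 and two source ranges whose
     * NLB fields are 63 each, copy_len = 64 + 64 = 128 <= MCL and the copy
     * is accepted; one more logical block would exceed MCL and fail with
     * NVME_CMD_SIZE_LIMIT.
     */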
+
static void nvme_copy_out_completed_cb(void *opaque, int ret)
{
NvmeCopyAIOCB *iocb = opaque;
}
}
+ status = nvme_check_copy_mcl(ns, iocb, nr);
+ if (status) {
+ goto invalid;
+ }
+
iocb->req = req;
iocb->ret = 0;
iocb->nr = nr;
static const AIOCBInfo nvme_flush_aiocb_info = {
.aiocb_size = sizeof(NvmeFlushAIOCB),
.cancel_async = nvme_flush_cancel,
- .get_aio_context = nvme_get_aio_context,
};
static void nvme_do_flush(NvmeFlushAIOCB *iocb);
block_acct_start(blk_get_stats(blk), &req->acct, data_size,
BLOCK_ACCT_READ);
- nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
+ nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
return NVME_NO_COMPLETE;
invalid:
block_acct_start(blk_get_stats(blk), &req->acct, data_size,
BLOCK_ACCT_WRITE);
- nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
+ nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
} else {
req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
uint32_t npid = (cdw10 >> 1) + 1;
unsigned int i = 0;
g_autofree uint16_t *pids = NULL;
- uint32_t maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
+ uint32_t maxnpid;
+
+ if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
return NVME_INVALID_FIELD | NVME_DNR;
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
- sq->bh = qemu_bh_new(nvme_process_sq, sq);
+ sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
+ &DEVICE(sq->ctrl)->mem_reentrancy_guard);
if (n->dbbuf_enabled) {
sq->db_addr = n->dbbuf_dbs + (sqid << 3);
}
log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
+
+ if (off >= log_size) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
trans_len = MIN(log_size - off, buf_len);
elog = g_malloc0(log_size);
elog->num_events = cpu_to_le32(ebuf->nelems);
}
}
n->cq[cqid] = cq;
- cq->bh = qemu_bh_new(nvme_post_cqes, cq);
+ cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
+ &DEVICE(cq->ctrl)->mem_reentrancy_guard);
}
static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
uint16_t qsize = le16_to_cpu(c->qsize);
uint16_t qflags = le16_to_cpu(c->cq_flags);
uint64_t prp1 = le64_to_cpu(c->prp1);
+ uint32_t cc = ldq_le_p(&n->bar.cc);
+ uint8_t iocqes = NVME_CC_IOCQES(cc);
+ uint8_t iosqes = NVME_CC_IOSQES(cc);
trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
NVME_CQ_FLAGS_IEN(qflags) != 0);
+ if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
+ trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
+ return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
+ }
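iosqes/iocqes are the host-programmed I/O queue entry sizes read back from CC. Per the NVMe specification, CC.IOSQES occupies bits 19:16 and CC.IOCQES bits 23:20, so the accessors are presumably along these lines (the authoritative definitions live in the nvme headers):

    #define NVME_CC_IOSQES(cc) (((cc) >> 16) & 0xf)
    #define NVME_CC_IOCQES(cc) (((cc) >> 20) & 0xf)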
+
if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
return NVME_INVALID_QID | NVME_DNR;
static const AIOCBInfo nvme_format_aiocb_info = {
.aiocb_size = sizeof(NvmeFormatAIOCB),
.cancel_async = nvme_format_cancel,
- .get_aio_context = nvme_get_aio_context,
};
static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
*/
sq->db_addr = dbs_addr + (i << 3);
sq->ei_addr = eis_addr + (i << 3);
- pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+ stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
if (n->params.ioeventfd && sq->sqid != 0) {
if (!nvme_init_sq_ioeventfd(sq)) {
/* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
- pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+ stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
if (n->params.ioeventfd && cq->cqid != 0) {
if (!nvme_init_cq_ioeventfd(cq)) {
case NVME_DIRECTIVE_IDENTIFY:
switch (doper) {
case NVME_DIRECTIVE_RETURN_PARAMS:
- if (ns->endgrp->fdp.enabled) {
+ if (ns->endgrp && ns->endgrp->fdp.enabled) {
id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
{
- uint32_t v = cpu_to_le32(sq->tail);
-
trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
- pci_dma_write(PCI_DEVICE(sq->ctrl), sq->ei_addr, &v, sizeof(v));
+ stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
+ MEMTXATTRS_UNSPECIFIED);
}
static void nvme_update_sq_tail(NvmeSQueue *sq)
{
- uint32_t v;
-
- pci_dma_read(PCI_DEVICE(sq->ctrl), sq->db_addr, &v, sizeof(v));
-
- sq->tail = le32_to_cpu(v);
+ ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
+ MEMTXATTRS_UNSPECIFIED);
trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
}
}
while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
- addr = sq->dma_addr + sq->head * n->sqe_size;
+ addr = sq->dma_addr + (sq->head << NVME_SQES);
if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
trace_pci_nvme_err_addr_read(addr);
trace_pci_nvme_err_cfs();
if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
- le16_to_cpu(sctrl->nvq),
- sctrl->scs ? "ONLINE" :
- "OFFLINE");
+ le16_to_cpu(sctrl->nvq));
return -1;
}
if (unlikely(n->cq[0])) {
NVME_CAP_MPSMAX(cap));
return -1;
}
- if (unlikely(NVME_CC_IOCQES(cc) <
- NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
- trace_pci_nvme_err_startfail_cqent_too_small(
- NVME_CC_IOCQES(cc),
- NVME_CTRL_CQES_MIN(cap));
- return -1;
- }
- if (unlikely(NVME_CC_IOCQES(cc) >
- NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
- trace_pci_nvme_err_startfail_cqent_too_large(
- NVME_CC_IOCQES(cc),
- NVME_CTRL_CQES_MAX(cap));
- return -1;
- }
- if (unlikely(NVME_CC_IOSQES(cc) <
- NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
- trace_pci_nvme_err_startfail_sqent_too_small(
- NVME_CC_IOSQES(cc),
- NVME_CTRL_SQES_MIN(cap));
- return -1;
- }
- if (unlikely(NVME_CC_IOSQES(cc) >
- NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
- trace_pci_nvme_err_startfail_sqent_too_large(
- NVME_CC_IOSQES(cc),
- NVME_CTRL_SQES_MAX(cap));
- return -1;
- }
if (unlikely(!NVME_AQA_ASQS(aqa))) {
trace_pci_nvme_err_startfail_asqent_sz_zero();
return -1;
n->page_bits = page_bits;
n->page_size = page_size;
n->max_prp_ents = n->page_size / sizeof(uint64_t);
- n->cqe_size = 1 << NVME_CC_IOCQES(cc);
- n->sqe_size = 1 << NVME_CC_IOSQES(cc);
nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
/*
 * NVM Express v1.3d, Section 4.1 states: "If host software writes
* an invalid value to the Submission Queue Tail Doorbell or
- * Completion Queue Head Doorbell regiter and an Asynchronous Event
+ * Completion Queue Head Doorbell register and an Asynchronous Event
* Request command is outstanding, then an asynchronous event is
* posted to the Admin Completion Queue with a status code of
* Invalid Doorbell Write Value."
start_sqs = nvme_cq_full(cq) ? 1 : 0;
cq->head = new_head;
if (!qid && n->dbbuf_enabled) {
- pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+ stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
}
if (start_sqs) {
NvmeSQueue *sq;
* including ones that run on Linux, are not updating Admin Queues,
* so we can't trust reading it for an appropriate sq tail.
*/
- pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+ stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
}
qemu_bh_schedule(sq->bh);
pcie_endpoint_cap_init(pci_dev, 0x80);
pcie_cap_flr_init(pci_dev);
if (n->params.sriov_max_vfs) {
- pcie_ari_init(pci_dev, 0x100, 1);
+ pcie_ari_init(pci_dev, 0x100);
}
/* add one to max_ioqpairs to account for the admin queue pair */
id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
- id->sqes = (0x6 << 4) | 0x6;
- id->cqes = (0x4 << 4) | 0x4;
+ id->sqes = (NVME_SQES << 4) | NVME_SQES;
+ id->cqes = (NVME_CQES << 4) | NVME_CQES;
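    /*
     * Assuming NVME_SQES == 6 and NVME_CQES == 4 (the values the literal
     * 0x6/0x4 encoded before), SQEs are 1 << 6 = 64 bytes and CQEs are
     * 1 << 4 = 16 bytes: id->sqes becomes 0x66 and id->cqes 0x44 (maximum
     * entry size in the upper nibble, required minimum in the lower), and
     * the doorbell math earlier in the patch, e.g. (cq->tail << NVME_CQES),
     * is simply tail * 16.
     */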
id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
NVME_ONCS_FEATURES | NVME_ONCS_DSM |