* Notes on coding style
* ---------------------
* While QEMU coding style prefers lowercase hexadecimals in constants, the
- * NVMe subsystem use thes format from the NVMe specifications in the comments
+ * NVMe subsystem uses the format from the NVMe specifications in the comments
* (i.e. 'h' suffix instead of '0x' prefix).
*
* Usage
* subsys=<subsys_id>
* -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
* zoned=<true|false[optional]>, \
- * subsys=<subsys_id>,detached=<true|false[optional]>
+ * subsys=<subsys_id>,shared=<true|false[optional]>, \
+ * detached=<true|false[optional]>, \
+ * zoned.zone_size=<N[optional]>, \
+ * zoned.zone_capacity=<N[optional]>, \
+ * zoned.descr_ext_size=<N[optional]>, \
+ * zoned.max_active=<N[optional]>, \
+ * zoned.max_open=<N[optional]>, \
+ * zoned.cross_read=<true|false[optional]>
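+ *
+ * An FDP-enabled setup might look as follows (a sketch; the fdp
+ * parameters on nvme-subsys and nvme-ns are assumptions based on the
+ * companion subsystem/namespace patches):
+ *
+ * -device nvme-subsys,id=nvme-subsys-0,nqn=subsys0,fdp=on,fdp.nruh=16
+ * -device nvme,serial=deadbeef,subsys=nvme-subsys-0
+ * -device nvme-ns,drive=<drive_id>,subsys=nvme-subsys-0,shared=true, \
+ *         fdp.ruhs=0-15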
*
* Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
* offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
[NVME_TIMESTAMP] = true,
[NVME_HOST_BEHAVIOR_SUPPORT] = true,
[NVME_COMMAND_SET_PROFILE] = true,
+ [NVME_FDP_MODE] = true,
+ [NVME_FDP_EVENTS] = true,
};
static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
[NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
[NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
[NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
+ [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
+ [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
};
static const uint32_t nvme_cse_acs[256] = {
[NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
[NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
[NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
};
static const uint32_t nvme_cse_iocs_none[256];
[NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
[NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
+ [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
+ [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};
static const uint32_t nvme_cse_iocs_zoned[256] = {
static void nvme_process_sq(void *opaque);
static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
+static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
static uint16_t nvme_sqid(NvmeRequest *req)
{
return le16_to_cpu(req->sq->sqid);
}
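+/*
+ * A placement identifier (PID) carries the reclaim group (RG) index in its
+ * upper RGIF bits and the placement handle (PH) in the remaining lower
+ * bits; with RGIF == 0 the PID is just the placement handle. For example,
+ * rgif = 4, rg = 2 and ph = 3 combine to pid = (2 << 12) | 3 = 0x2003.
+ */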
+static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
+ uint16_t ph)
+{
+ uint16_t rgif = ns->endgrp->fdp.rgif;
+
+ if (!rgif) {
+ return ph;
+ }
+
+ return (rg << (16 - rgif)) | ph;
+}
+
+static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
+{
+ return ph < ns->fdp.nphs;
+}
+
+static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
+{
+ return rg < endgrp->fdp.nrg;
+}
+
+static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
+{
+ uint16_t rgif = ns->endgrp->fdp.rgif;
+
+ if (!rgif) {
+ return pid;
+ }
+
+ return pid & ((1 << (15 - rgif)) - 1);
+}
+
+static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
+{
+ uint16_t rgif = ns->endgrp->fdp.rgif;
+
+ if (!rgif) {
+ return 0;
+ }
+
+ return pid >> (16 - rgif);
+}
+
+static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
+ uint16_t *ph, uint16_t *rg)
+{
+ *rg = nvme_pid2rg(ns, pid);
+ *ph = nvme_pid2ph(ns, pid);
+
+ return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
+}
+
static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
NvmeZoneState state)
{
return nvme_zns_check_resources(ns, act, opn, 0);
}
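+/*
+ * Grab the next slot in the circular FDP event buffer. If the buffer is
+ * full, the oldest event is overwritten ('start' advances along with
+ * 'next') instead of failing the allocation.
+ */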
+static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
+{
+ NvmeFdpEvent *ret = NULL;
+ bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
+
+ ret = &ebuf->events[ebuf->next++];
+ if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
+ ebuf->next = 0;
+ }
+ if (is_full) {
+ ebuf->start = ebuf->next;
+ } else {
+ ebuf->nelems++;
+ }
+
+ memset(ret, 0, sizeof(NvmeFdpEvent));
+ ret->timestamp = nvme_get_timestamp(n);
+
+ return ret;
+}
+
+static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
+{
+ return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
+}
+
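+/*
+ * Reference a fresh reclaim unit for the given placement identifier and
+ * reset its remaining writable capacity (ruamw). If the previous reclaim
+ * unit was only partially written, log an 'RU not fully written' event
+ * (when enabled in the event filter) and account the unwritten remainder
+ * as media bytes written, i.e. as eventual garbage collection overhead.
+ */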
+static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
+{
+ NvmeEnduranceGroup *endgrp = ns->endgrp;
+ NvmeRuHandle *ruh;
+ NvmeReclaimUnit *ru;
+ NvmeFdpEvent *e = NULL;
+ uint16_t ph, rg, ruhid;
+
+ if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
+ return false;
+ }
+
+ ruhid = ns->fdp.phs[ph];
+
+ ruh = &endgrp->fdp.ruhs[ruhid];
+ ru = &ruh->rus[rg];
+
+ if (ru->ruamw) {
+ if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
+ e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
+ e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
+ e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
+ e->pid = cpu_to_le16(pid);
+ e->nsid = cpu_to_le32(ns->params.nsid);
+ e->rgid = cpu_to_le16(rg);
+ e->ruhid = cpu_to_le16(ruhid);
+ }
+
+ /* log (eventual) GC overhead of prematurely swapping the RU */
+ nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
+ }
+
+ ru->ruamw = ruh->ruamw;
+
+ return true;
+}
+
static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
hwaddr hi, lo;
return 0;
}
- return pci_dma_read(&n->parent_obj, addr, buf, size);
+ return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
}
static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
return 0;
}
- return pci_dma_write(&n->parent_obj, addr, buf, size);
+ return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
}
static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
static void nvme_irq_check(NvmeCtrl *n)
{
+ PCIDevice *pci = PCI_DEVICE(n);
uint32_t intms = ldl_le_p(&n->bar.intms);
- if (msix_enabled(&(n->parent_obj))) {
+ if (msix_enabled(pci)) {
return;
}
if (~intms & n->irq_status) {
- pci_irq_assert(&n->parent_obj);
+ pci_irq_assert(pci);
} else {
- pci_irq_deassert(&n->parent_obj);
+ pci_irq_deassert(pci);
}
}
static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
+ PCIDevice *pci = PCI_DEVICE(n);
+
if (cq->irq_enabled) {
- if (msix_enabled(&(n->parent_obj))) {
+ if (msix_enabled(pci)) {
trace_pci_nvme_irq_msix(cq->vector);
- msix_notify(&(n->parent_obj), cq->vector);
+ msix_notify(pci, cq->vector);
} else {
trace_pci_nvme_irq_pin();
assert(cq->vector < 32);
static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
if (cq->irq_enabled) {
- if (msix_enabled(&(n->parent_obj))) {
+ if (msix_enabled(PCI_DEVICE(n))) {
return;
} else {
assert(cq->vector < 32);
static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
if (dma) {
- pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
+ pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
sg->flags = NVME_SG_DMA;
} else {
qemu_iovec_init(&sg->iov, 0);
}
/*
- * When metadata is transfered as extended LBAs, the DPTR mapped into `sg`
+ * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
* holds both data and metadata. This function splits the data and metadata
* into two separate QSG/IOVs.
*/
len -= trans_len;
if (len) {
if (len > n->page_size) {
- uint64_t prp_list[n->max_prp_ents];
+ g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
uint32_t nents, prp_trans;
int i = 0;
* descriptors and segment chain) than the command transfer size, so it is
* not bounded by MDTS.
*/
- const int SEG_CHUNK_SIZE = 256;
+#define SEG_CHUNK_SIZE 256
NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
uint64_t nsgld;
}
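+/*
+ * The alignment is passed through to the DMA helpers; metadata transfers
+ * are not necessarily sector aligned, so those callers pass an alignment
+ * of one byte.
+ */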
static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
- BlockCompletionFunc *cb, NvmeRequest *req)
+ uint32_t align, BlockCompletionFunc *cb,
+ NvmeRequest *req)
{
assert(req->sg.flags & NVME_SG_ALLOC);
if (req->sg.flags & NVME_SG_DMA) {
- req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
- cb, req);
+ req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
} else {
req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
}
}
static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
- BlockCompletionFunc *cb, NvmeRequest *req)
+ uint32_t align, BlockCompletionFunc *cb,
+ NvmeRequest *req)
{
assert(req->sg.flags & NVME_SG_ALLOC);
if (req->sg.flags & NVME_SG_DMA) {
- req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
- cb, req);
+ req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
} else {
req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
}
}
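+/*
+ * With shadow doorbells enabled, publish the controller's current
+ * completion queue head as the event index so the host knows when it has
+ * to ring the doorbell again.
+ */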
+static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
+{
+ trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
+
+ stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
+ MEMTXATTRS_UNSPECIFIED);
+}
+
static void nvme_update_cq_head(NvmeCQueue *cq)
{
- pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
- sizeof(cq->head));
- trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
+ ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
+ MEMTXATTRS_UNSPECIFIED);
+
+ trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
}
static void nvme_post_cqes(void *opaque)
hwaddr addr;
if (n->dbbuf_enabled) {
+ nvme_update_cq_eventidx(cq);
nvme_update_cq_head(cq);
}
req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
req->cqe.sq_id = cpu_to_le16(sq->sqid);
req->cqe.sq_head = cpu_to_le16(sq->head);
- addr = cq->dma_addr + cq->tail * n->cqe_size;
- ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
+ addr = cq->dma_addr + (cq->tail << NVME_CQES);
+ ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
sizeof(req->cqe));
if (ret) {
trace_pci_nvme_err_addr_write(addr);
case NVME_CMD_WRITE:
case NVME_CMD_WRITE_ZEROES:
case NVME_CMD_ZONE_APPEND:
+ case NVME_CMD_COPY:
status = NVME_WRITE_FAULT;
break;
default:
rw->opcode == NVME_CMD_WRITE_ZEROES;
}
-static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
-{
- return qemu_get_aio_context();
-}
-
static void nvme_misc_cb(void *opaque, int ret)
{
NvmeRequest *req = opaque;
}
if (req->cmd.opcode == NVME_CMD_READ) {
- return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
+ return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
}
- return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
+ return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
}
}
for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
- req->status = NVME_CMP_FAILURE;
+ req->status = NVME_CMP_FAILURE | NVME_DNR;
goto out;
}
}
}
if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
- req->status = NVME_CMP_FAILURE;
+ req->status = NVME_CMP_FAILURE | NVME_DNR;
goto out;
}
}
if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
- req->status = NVME_CMP_FAILURE;
+ req->status = NVME_CMP_FAILURE | NVME_DNR;
goto out;
}
status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
req);
if (status) {
+ g_free(iocb->range);
+ qemu_aio_unref(iocb);
+
return status;
}
}
}
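+/*
+ * Check the cumulative number of logical blocks across all source ranges
+ * against the Maximum Copy Length (MCL) advertised in the identify
+ * namespace data; the per-range NLB field is zero-based, hence the '+ 1'.
+ */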
+static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
+ NvmeCopyAIOCB *iocb, uint16_t nr)
+{
+ uint32_t copy_len = 0;
+
+ for (int idx = 0; idx < nr; idx++) {
+ uint32_t nlb;
+ nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
+ &nlb, NULL, NULL, NULL);
+ copy_len += nlb + 1;
+ }
+
+ if (copy_len > ns->id_ns.mcl) {
+ return NVME_CMD_SIZE_LIMIT | NVME_DNR;
+ }
+
+ return NVME_SUCCESS;
+}
+
static void nvme_copy_out_completed_cb(void *opaque, int ret)
{
NvmeCopyAIOCB *iocb = opaque;
}
}
+ status = nvme_check_copy_mcl(ns, iocb, nr);
+ if (status) {
+ goto invalid;
+ }
+
iocb->req = req;
iocb->ret = 0;
iocb->nr = nr;
static const AIOCBInfo nvme_flush_aiocb_info = {
.aiocb_size = sizeof(NvmeFlushAIOCB),
.cancel_async = nvme_flush_cancel,
- .get_aio_context = nvme_get_aio_context,
};
static void nvme_do_flush(NvmeFlushAIOCB *iocb);
block_acct_start(blk_get_stats(blk), &req->acct, data_size,
BLOCK_ACCT_READ);
- nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
+ nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
return NVME_NO_COMPLETE;
invalid:
return status | NVME_DNR;
}
+static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
+ uint32_t nlb)
+{
+ NvmeNamespace *ns = req->ns;
+ NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+ uint64_t data_size = nvme_l2b(ns, nlb);
+ uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
+ uint8_t dtype = (dw12 >> 20) & 0xf;
+ uint16_t pid = le16_to_cpu(rw->dspec);
+ uint16_t ph, rg, ruhid;
+ NvmeReclaimUnit *ru;
+
+ if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
+ !nvme_parse_pid(ns, pid, &ph, &rg)) {
+ ph = 0;
+ rg = 0;
+ }
+
+ ruhid = ns->fdp.phs[ph];
+ ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
+
+ nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
+ nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
+
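+    /*
+     * Consume the remaining writable capacity of the currently referenced
+     * reclaim unit; when the write spans the end of it, reference a fresh
+     * reclaim unit and carry the remainder over to it.
+     */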
+ while (nlb) {
+ if (nlb < ru->ruamw) {
+ ru->ruamw -= nlb;
+ break;
+ }
+
+ nlb -= ru->ruamw;
+ nvme_update_ruh(n, ns, pid);
+ }
+}
+
static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
bool wrz)
{
if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
zone->w_ptr += nlb;
}
+ } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
+ nvme_do_write_fdp(n, req, slba, nlb);
}
data_offset = nvme_l2b(ns, slba);
block_acct_start(blk_get_stats(blk), &req->acct, data_size,
BLOCK_ACCT_WRITE);
- nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
+ nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
} else {
req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
nr_zones++;
}
}
- header = (NvmeZoneReportHeader *)buf;
+ header = buf;
header->nr_zones = cpu_to_le64(nr_zones);
buf_p = buf + sizeof(NvmeZoneReportHeader);
for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
zone = &ns->zone_array[zone_idx];
if (nvme_zone_matches_filter(zrasf, zone)) {
- z = (NvmeZoneDescr *)buf_p;
+ z = buf_p;
buf_p += sizeof(NvmeZoneDescr);
z->zt = zone->d.zt;
return status;
}
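+/*
+ * Build the reclaim unit handle status log: a header followed by one
+ * descriptor per (placement handle, reclaim group) pair of the namespace,
+ * each reporting the placement identifier and the remaining writable
+ * capacity of the referenced reclaim unit.
+ */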
+static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
+ size_t len)
+{
+ NvmeNamespace *ns = req->ns;
+ NvmeEnduranceGroup *endgrp;
+ NvmeRuhStatus *hdr;
+ NvmeRuhStatusDescr *ruhsd;
+ unsigned int nruhsd;
+ uint16_t rg, ph, *ruhid;
+ size_t trans_len;
+ g_autofree uint8_t *buf = NULL;
+
+ if (!n->subsys) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
+ if (!n->subsys->endgrp.fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ endgrp = ns->endgrp;
+
+ nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
+ trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
+ buf = g_malloc(trans_len);
+
+ trans_len = MIN(trans_len, len);
+
+ hdr = (NvmeRuhStatus *)buf;
+ ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
+
+ hdr->nruhsd = cpu_to_le16(nruhsd);
+
+ ruhid = ns->fdp.phs;
+
+ for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
+ NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
+
+ for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
+ uint16_t pid = nvme_make_pid(ns, rg, ph);
+
+ ruhsd->pid = cpu_to_le16(pid);
+ ruhsd->ruhid = *ruhid;
+ ruhsd->earutr = 0;
+ ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
+ }
+ }
+
+ return nvme_c2h(n, buf, trans_len, req);
+}
+
+static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeCmd *cmd = &req->cmd;
+ uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+ uint32_t numd = le32_to_cpu(cmd->cdw11);
+ uint8_t mo = (cdw10 & 0xff);
+ size_t len = (numd + 1) << 2;
+
+ switch (mo) {
+ case NVME_IOMR_MO_NOP:
+ return 0;
+ case NVME_IOMR_MO_RUH_STATUS:
+ return nvme_io_mgmt_recv_ruhs(n, req, len);
+ default:
+ return NVME_INVALID_FIELD | NVME_DNR;
+    }
+}
+
+static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeCmd *cmd = &req->cmd;
+ NvmeNamespace *ns = req->ns;
+ uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+ uint16_t ret = NVME_SUCCESS;
+ uint32_t npid = (cdw10 >> 1) + 1;
+ unsigned int i = 0;
+ g_autofree uint16_t *pids = NULL;
+ uint32_t maxnpid;
+
+ if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
+
+ if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ pids = g_new(uint16_t, npid);
+
+ ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
+ if (ret) {
+ return ret;
+ }
+
+ for (; i < npid; i++) {
+ if (!nvme_update_ruh(n, ns, pids[i])) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+ }
+
+ return ret;
+}
+
+static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeCmd *cmd = &req->cmd;
+ uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+ uint8_t mo = (cdw10 & 0xff);
+
+ switch (mo) {
+ case NVME_IOMS_MO_NOP:
+ return 0;
+ case NVME_IOMS_MO_RUH_UPDATE:
+ return nvme_io_mgmt_send_ruh_update(n, req);
+ default:
+ return NVME_INVALID_FIELD | NVME_DNR;
+    }
+}
+
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
{
NvmeNamespace *ns;
return nvme_zone_mgmt_send(n, req);
case NVME_CMD_ZONE_MGMT_RECV:
return nvme_zone_mgmt_recv(n, req);
+ case NVME_CMD_IO_MGMT_RECV:
+ return nvme_io_mgmt_recv(n, req);
+ case NVME_CMD_IO_MGMT_SEND:
+ return nvme_io_mgmt_send(n, req);
default:
assert(false);
}
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
- sq->bh = qemu_bh_new(nvme_process_sq, sq);
+ sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
+ &DEVICE(sq->ctrl)->mem_reentrancy_guard);
if (n->dbbuf_enabled) {
sq->db_addr = n->dbbuf_dbs + (sqid << 3);
{
BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
- stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
- stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
+ stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
+ stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
}
uint32_t trans_len;
NvmeNamespace *ns;
time_t current_ms;
+ uint64_t u_read, u_written;
if (off >= sizeof(smart)) {
return NVME_INVALID_FIELD | NVME_DNR;
trans_len = MIN(sizeof(smart) - off, buf_len);
smart.critical_warning = n->smart_critical_warning;
- smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
- 1000));
- smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
- 1000));
+ u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
+ u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
+
+ smart.data_units_read[0] = cpu_to_le64(u_read);
+ smart.data_units_written[0] = cpu_to_le64(u_written);
smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
}
+static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
+ uint64_t off, NvmeRequest *req)
+{
+ uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
+ uint16_t endgrpid = (dw11 >> 16) & 0xffff;
+ struct nvme_stats stats = {};
+ NvmeEndGrpLog info = {};
+ int i;
+
+ if (!n->subsys || endgrpid != 0x1) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ if (off >= sizeof(info)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+ NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
+ if (!ns) {
+ continue;
+ }
+
+ nvme_set_blk_stats(ns, &stats);
+ }
+
+ info.data_units_read[0] =
+ cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
+ info.data_units_written[0] =
+ cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
+ info.media_units_written[0] =
+ cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
+
+ info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
+ info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
+
+ buf_len = MIN(sizeof(info) - off, buf_len);
+
+ return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
+}
+
static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
NvmeRequest *req)
{
return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
}
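+/*
+ * An FDP configuration descriptor consists of a fixed header, one reclaim
+ * unit handle descriptor per RUH and a vendor specific blob, padded to an
+ * 8 byte boundary.
+ */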
+static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
+{
+ size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
+ + vss;
+ return ROUND_UP(entry_siz, 8);
+}
+
+static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
+ uint64_t off, NvmeRequest *req)
+{
+ uint32_t log_size, trans_len;
+ g_autofree uint8_t *buf = NULL;
+ NvmeFdpDescrHdr *hdr;
+ NvmeRuhDescr *ruhd;
+ NvmeEnduranceGroup *endgrp;
+ NvmeFdpConfsHdr *log;
+ size_t nruh, fdp_descr_size;
+ int i;
+
+ if (endgrpid != 1 || !n->subsys) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ endgrp = &n->subsys->endgrp;
+
+ if (endgrp->fdp.enabled) {
+ nruh = endgrp->fdp.nruh;
+ } else {
+ nruh = 1;
+ }
+
+ fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
+ log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
+
+ if (off >= log_size) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ trans_len = MIN(log_size - off, buf_len);
+
+ buf = g_malloc0(log_size);
+ log = (NvmeFdpConfsHdr *)buf;
+ hdr = (NvmeFdpDescrHdr *)(log + 1);
+ ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
+
+ log->num_confs = cpu_to_le16(0);
+ log->size = cpu_to_le32(log_size);
+
+ hdr->descr_size = cpu_to_le16(fdp_descr_size);
+ if (endgrp->fdp.enabled) {
+ hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
+ hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
+ hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
+ hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
+ hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
+ hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
+ hdr->runs = cpu_to_le64(endgrp->fdp.runs);
+
+ for (i = 0; i < nruh; i++) {
+ ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
+ ruhd++;
+ }
+ } else {
+ /* 1 bit for RUH in PIF -> 2 RUHs max. */
+ hdr->nrg = cpu_to_le16(1);
+ hdr->nruh = cpu_to_le16(1);
+ hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
+ hdr->nnss = cpu_to_le32(1);
+ hdr->runs = cpu_to_le64(96 * MiB);
+
+ ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
+ }
+
+ return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
+ uint32_t dw10, uint32_t dw12,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req)
+{
+ NvmeRuHandle *ruh;
+ NvmeRuhuLog *hdr;
+ NvmeRuhuDescr *ruhud;
+ NvmeEnduranceGroup *endgrp;
+ g_autofree uint8_t *buf = NULL;
+ uint32_t log_size, trans_len;
+ uint16_t i;
+
+ if (endgrpid != 1 || !n->subsys) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ endgrp = &n->subsys->endgrp;
+
+ if (!endgrp->fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
+
+ if (off >= log_size) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ trans_len = MIN(log_size - off, buf_len);
+
+ buf = g_malloc0(log_size);
+ hdr = (NvmeRuhuLog *)buf;
+ ruhud = (NvmeRuhuDescr *)(hdr + 1);
+
+ ruh = endgrp->fdp.ruhs;
+ hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
+
+ for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
+ ruhud->ruha = ruh->ruha;
+ }
+
+ return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
+ uint64_t off, NvmeRequest *req)
+{
+ NvmeEnduranceGroup *endgrp;
+ NvmeFdpStatsLog log = {};
+ uint32_t trans_len;
+
+ if (off >= sizeof(NvmeFdpStatsLog)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ if (endgrpid != 1 || !n->subsys) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ if (!n->subsys->endgrp.fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ endgrp = &n->subsys->endgrp;
+
+ trans_len = MIN(sizeof(log) - off, buf_len);
+
+ /* spec value is 128 bit, we only use 64 bit */
+ log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
+ log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
+ log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
+
+ return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req)
+{
+ NvmeEnduranceGroup *endgrp;
+ NvmeCmd *cmd = &req->cmd;
+    bool host_events = (le32_to_cpu(cmd->cdw10) >> 8) & 0x1;
+ uint32_t log_size, trans_len;
+ NvmeFdpEventBuffer *ebuf;
+ g_autofree NvmeFdpEventsLog *elog = NULL;
+ NvmeFdpEvent *event;
+
+ if (endgrpid != 1 || !n->subsys) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ endgrp = &n->subsys->endgrp;
+
+ if (!endgrp->fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ if (host_events) {
+ ebuf = &endgrp->fdp.host_events;
+ } else {
+ ebuf = &endgrp->fdp.ctrl_events;
+ }
+
+ log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
+
+ if (off >= log_size) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ trans_len = MIN(log_size - off, buf_len);
+ elog = g_malloc0(log_size);
+ elog->num_events = cpu_to_le32(ebuf->nelems);
+ event = (NvmeFdpEvent *)(elog + 1);
+
+ if (ebuf->nelems && ebuf->start == ebuf->next) {
+ unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
+        /* wrap around: copy [start, NVME_FDP_MAX_EVENTS) then [0, next) */
+ memcpy(event, &ebuf->events[ebuf->start],
+ sizeof(NvmeFdpEvent) * nelems);
+ memcpy(event + nelems, ebuf->events,
+ sizeof(NvmeFdpEvent) * ebuf->next);
+ } else if (ebuf->start < ebuf->next) {
+ memcpy(event, &ebuf->events[ebuf->start],
+ sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
+ }
+
+ return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
+}
+
static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
{
NvmeCmd *cmd = &req->cmd;
uint8_t lsp = (dw10 >> 8) & 0xf;
uint8_t rae = (dw10 >> 15) & 0x1;
uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
- uint32_t numdl, numdu;
+ uint32_t numdl, numdu, lspi;
uint64_t off, lpol, lpou;
size_t len;
uint16_t status;
numdl = (dw10 >> 16);
numdu = (dw11 & 0xffff);
+ lspi = (dw11 >> 16);
lpol = dw12;
lpou = dw13;
return nvme_changed_nslist(n, rae, len, off, req);
case NVME_LOG_CMD_EFFECTS:
return nvme_cmd_effects(n, csi, len, off, req);
+ case NVME_LOG_ENDGRP:
+ return nvme_endgrp_info(n, rae, len, off, req);
+ case NVME_LOG_FDP_CONFS:
+ return nvme_fdp_confs(n, lspi, len, off, req);
+ case NVME_LOG_FDP_RUH_USAGE:
+ return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
+ case NVME_LOG_FDP_STATS:
+ return nvme_fdp_stats(n, lspi, len, off, req);
+ case NVME_LOG_FDP_EVENTS:
+ return nvme_fdp_events(n, lspi, len, off, req);
default:
trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
return NVME_INVALID_FIELD | NVME_DNR;
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
+ PCIDevice *pci = PCI_DEVICE(n);
uint16_t offset = (cq->cqid << 3) + (1 << 2);
n->cq[cq->cqid] = NULL;
event_notifier_set_handler(&cq->notifier, NULL);
event_notifier_cleanup(&cq->notifier);
}
- if (msix_enabled(&n->parent_obj)) {
- msix_vector_unuse(&n->parent_obj, cq->vector);
+ if (msix_enabled(pci)) {
+ msix_vector_unuse(pci, cq->vector);
}
if (cq->cqid) {
g_free(cq);
uint16_t cqid, uint16_t vector, uint16_t size,
uint16_t irq_enabled)
{
- if (msix_enabled(&n->parent_obj)) {
- msix_vector_use(&n->parent_obj, vector);
+ PCIDevice *pci = PCI_DEVICE(n);
+
+ if (msix_enabled(pci)) {
+ msix_vector_use(pci, vector);
}
cq->ctrl = n;
cq->cqid = cqid;
}
}
n->cq[cqid] = cq;
- cq->bh = qemu_bh_new(nvme_post_cqes, cq);
+ cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
+ &DEVICE(cq->ctrl)->mem_reentrancy_guard);
}
static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
uint16_t qsize = le16_to_cpu(c->qsize);
uint16_t qflags = le16_to_cpu(c->cq_flags);
uint64_t prp1 = le64_to_cpu(c->prp1);
+ uint32_t cc = ldq_le_p(&n->bar.cc);
+ uint8_t iocqes = NVME_CC_IOCQES(cc);
+ uint8_t iosqes = NVME_CC_IOSQES(cc);
trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
NVME_CQ_FLAGS_IEN(qflags) != 0);
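+    /*
+     * The device model only supports the required entry sizes, i.e.
+     * 64 byte submission and 16 byte completion queue entries, so refuse
+     * queue creation if CC.IOSQES/IOCQES was programmed differently.
+     */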
+ if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
+ trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
+ return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
+ }
+
if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
return NVME_INVALID_QID | NVME_DNR;
trace_pci_nvme_err_invalid_create_cq_addr(prp1);
return NVME_INVALID_PRP_OFFSET | NVME_DNR;
}
- if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
+ if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
trace_pci_nvme_err_invalid_create_cq_vector(vector);
return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
}
return nvme_c2h(n, (uint8_t *)×tamp, sizeof(timestamp), req);
}
+static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
+ uint32_t *result)
+{
+ *result = 0;
+
+ if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
+ *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
+
+ return NVME_SUCCESS;
+}
+
+static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
+ NvmeRequest *req, uint32_t *result)
+{
+ NvmeCmd *cmd = &req->cmd;
+ uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
+ uint16_t ph = cdw11 & 0xffff;
+ uint8_t noet = (cdw11 >> 16) & 0xff;
+ uint16_t ruhid, ret;
+ uint32_t nentries = 0;
+ uint8_t s_events_ndx = 0;
+ size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
+ g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
+ NvmeRuHandle *ruh;
+ NvmeFdpEventDescr *s_event;
+
+ if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ if (!nvme_ph_valid(ns, ph)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ ruhid = ns->fdp.phs[ph];
+ ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
+
+ assert(ruh);
+
+ if (unlikely(noet == 0)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
+ uint8_t shift = nvme_fdp_evf_shifts[event_type];
+ if (!shift && event_type) {
+            /*
+             * Only the first entry (event_type == 0) has a shift value of
+             * 0; for any other event type, a zero shift means the entry
+             * is simply unpopulated.
+             */
+ continue;
+ }
+
+ nentries++;
+
+ s_event = &s_events[s_events_ndx];
+ s_event->evt = event_type;
+ s_event->evta = (ruh->event_filter >> shift) & 0x1;
+
+ /* break if all `noet` entries are filled */
+ if ((++s_events_ndx) == noet) {
+ break;
+ }
+ }
+
+ ret = nvme_c2h(n, s_events, s_events_siz, req);
+ if (ret) {
+ return ret;
+ }
+
+ *result = nentries;
+ return NVME_SUCCESS;
+}
+
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
{
NvmeCmd *cmd = &req->cmd;
uint16_t iv;
NvmeNamespace *ns;
int i;
+ uint16_t endgrpid = 0, ret = NVME_SUCCESS;
static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
[NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
case NVME_HOST_BEHAVIOR_SUPPORT:
return nvme_c2h(n, (uint8_t *)&n->features.hbs,
sizeof(n->features.hbs), req);
+ case NVME_FDP_MODE:
+ endgrpid = dw11 & 0xff;
+
+ if (endgrpid != 0x1) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ ret = nvme_get_feature_fdp(n, endgrpid, &result);
+ if (ret) {
+ return ret;
+ }
+ goto out;
+ case NVME_FDP_EVENTS:
+ if (!nvme_nsid_valid(n, nsid)) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
+ ns = nvme_ns(n, nsid);
+ if (unlikely(!ns)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ ret = nvme_get_feature_fdp_events(n, ns, req, &result);
+ if (ret) {
+ return ret;
+ }
+ goto out;
default:
break;
}
if (iv == n->admin_cq.vector) {
result |= NVME_INTVC_NOCOALESCING;
}
+ break;
+ case NVME_FDP_MODE:
+ endgrpid = dw11 & 0xff;
+
+ if (endgrpid != 0x1) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ ret = nvme_get_feature_fdp(n, endgrpid, &result);
+ if (ret) {
+ return ret;
+ }
+ goto out;
-        break;
default:
result = nvme_feature_default[fid];
out:
req->cqe.result = cpu_to_le32(result);
- return NVME_SUCCESS;
+ return ret;
}
static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
return NVME_SUCCESS;
}
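+/*
+ * Translate the host supplied list of FDP event types into a bitmask and
+ * enable or disable those events in the event filter of the reclaim unit
+ * handle referenced by the given placement handle.
+ */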
+static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
+ NvmeRequest *req)
+{
+ NvmeCmd *cmd = &req->cmd;
+ uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
+ uint16_t ph = cdw11 & 0xffff;
+ uint8_t noet = (cdw11 >> 16) & 0xff;
+ uint16_t ret, ruhid;
+ uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
+ uint8_t event_mask = 0;
+ unsigned int i;
+ g_autofree uint8_t *events = g_malloc0(noet);
+ NvmeRuHandle *ruh = NULL;
+
+ assert(ns);
+
+ if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+ return NVME_FDP_DISABLED | NVME_DNR;
+ }
+
+ if (!nvme_ph_valid(ns, ph)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ ruhid = ns->fdp.phs[ph];
+ ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
+
+ ret = nvme_h2c(n, events, noet, req);
+ if (ret) {
+ return ret;
+ }
+
+ for (i = 0; i < noet; i++) {
+ event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
+ }
+
+ if (enable) {
+ ruh->event_filter |= event_mask;
+ } else {
+ ruh->event_filter = ruh->event_filter & ~event_mask;
+ }
+
+ return NVME_SUCCESS;
+}
+
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
{
NvmeNamespace *ns = NULL;
return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
}
break;
+ case NVME_FDP_MODE:
+        /* spec: abort with cmd seq err if the endgrp has one or more NSes */
+ return NVME_CMD_SEQ_ERROR | NVME_DNR;
+ case NVME_FDP_EVENTS:
+ return nvme_set_feature_fdp_events(n, ns, req);
default:
return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
}
static const AIOCBInfo nvme_format_aiocb_info = {
.aiocb_size = sizeof(NvmeFormatAIOCB),
.cancel_async = nvme_format_cancel,
- .get_aio_context = nvme_get_aio_context,
};
static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
{
+ PCIDevice *pci = PCI_DEVICE(n);
NvmeCtrl *sn = NULL;
NvmeSecCtrlEntry *sctrl;
int vf_index;
return NVME_INVALID_CTRL_ID | NVME_DNR;
}
- if (!pci_is_vf(&n->parent_obj)) {
+ if (!pci_is_vf(pci)) {
vf_index = le16_to_cpu(sctrl->vfn) - 1;
- sn = NVME(pcie_sriov_get_vf_at_index(&n->parent_obj, vf_index));
+ sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
}
if (online) {
static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
{
+ PCIDevice *pci = PCI_DEVICE(n);
uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
int i;
*/
sq->db_addr = dbs_addr + (i << 3);
sq->ei_addr = eis_addr + (i << 3);
- pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
- sizeof(sq->tail));
+ stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
if (n->params.ioeventfd && sq->sqid != 0) {
if (!nvme_init_sq_ioeventfd(sq)) {
/* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
- pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
- sizeof(cq->head));
+ stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
if (n->params.ioeventfd && cq->cqid != 0) {
if (!nvme_init_cq_ioeventfd(cq)) {
return NVME_SUCCESS;
}
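+/*
+ * No directive types support the send operation; for receive, only the
+ * identify directive's 'return parameters' operation is implemented. It
+ * reports the data placement directive as supported and enabled when FDP
+ * is enabled for the endurance group.
+ */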
+static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
+{
+ return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns;
+ uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+ uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
+ uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+ uint8_t doper, dtype;
+ uint32_t numd, trans_len;
+ NvmeDirectiveIdentify id = {
+ .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
+ .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
+ };
+
+ numd = dw10 + 1;
+ doper = dw11 & 0xff;
+ dtype = (dw11 >> 8) & 0xff;
+
+ trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
+
+ if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
+ doper != NVME_DIRECTIVE_RETURN_PARAMS) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ ns = nvme_ns(n, nsid);
+ if (!ns) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ switch (dtype) {
+ case NVME_DIRECTIVE_IDENTIFY:
+ switch (doper) {
+ case NVME_DIRECTIVE_RETURN_PARAMS:
+ if (ns->endgrp && ns->endgrp->fdp.enabled) {
+ id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+ id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+ id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+ }
+
+ return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
+
+ default:
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ default:
+ return NVME_INVALID_FIELD;
+ }
+}
+
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
{
trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
return nvme_dbbuf_config(n, req);
case NVME_ADM_CMD_FORMAT_NVM:
return nvme_format(n, req);
+ case NVME_ADM_CMD_DIRECTIVE_SEND:
+ return nvme_directive_send(n, req);
+ case NVME_ADM_CMD_DIRECTIVE_RECV:
+ return nvme_directive_receive(n, req);
default:
assert(false);
}
static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
{
- pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
- sizeof(sq->tail));
- trace_pci_nvme_eventidx_sq(sq->sqid, sq->tail);
+ trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
+
+ stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
+ MEMTXATTRS_UNSPECIFIED);
}
static void nvme_update_sq_tail(NvmeSQueue *sq)
{
- pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
- sizeof(sq->tail));
- trace_pci_nvme_shadow_doorbell_sq(sq->sqid, sq->tail);
+ ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
+ MEMTXATTRS_UNSPECIFIED);
+
+ trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
}
static void nvme_process_sq(void *opaque)
}
while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
- addr = sq->dma_addr + sq->head * n->sqe_size;
+ addr = sq->dma_addr + (sq->head << NVME_SQES);
if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
trace_pci_nvme_err_addr_read(addr);
trace_pci_nvme_err_cfs();
static void nvme_activate_virt_res(NvmeCtrl *n)
{
- PCIDevice *pci_dev = &n->parent_obj;
+ PCIDevice *pci_dev = PCI_DEVICE(n);
NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
NvmeSecCtrlEntry *sctrl;
static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
{
- PCIDevice *pci_dev = &n->parent_obj;
+ PCIDevice *pci_dev = PCI_DEVICE(n);
NvmeSecCtrlEntry *sctrl;
NvmeNamespace *ns;
int i;
uint32_t page_size = 1 << page_bits;
NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
- if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
+ if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
- le16_to_cpu(sctrl->nvq),
- sctrl->scs ? "ONLINE" :
- "OFFLINE");
+ le16_to_cpu(sctrl->nvq));
return -1;
}
if (unlikely(n->cq[0])) {
NVME_CAP_MPSMAX(cap));
return -1;
}
- if (unlikely(NVME_CC_IOCQES(cc) <
- NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
- trace_pci_nvme_err_startfail_cqent_too_small(
- NVME_CC_IOCQES(cc),
- NVME_CTRL_CQES_MIN(cap));
- return -1;
- }
- if (unlikely(NVME_CC_IOCQES(cc) >
- NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
- trace_pci_nvme_err_startfail_cqent_too_large(
- NVME_CC_IOCQES(cc),
- NVME_CTRL_CQES_MAX(cap));
- return -1;
- }
- if (unlikely(NVME_CC_IOSQES(cc) <
- NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
- trace_pci_nvme_err_startfail_sqent_too_small(
- NVME_CC_IOSQES(cc),
- NVME_CTRL_SQES_MIN(cap));
- return -1;
- }
- if (unlikely(NVME_CC_IOSQES(cc) >
- NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
- trace_pci_nvme_err_startfail_sqent_too_large(
- NVME_CC_IOSQES(cc),
- NVME_CTRL_SQES_MAX(cap));
- return -1;
- }
if (unlikely(!NVME_AQA_ASQS(aqa))) {
trace_pci_nvme_err_startfail_asqent_sz_zero();
return -1;
n->page_bits = page_bits;
n->page_size = page_size;
n->max_prp_ents = n->page_size / sizeof(uint64_t);
- n->cqe_size = 1 << NVME_CC_IOCQES(cc);
- n->sqe_size = 1 << NVME_CC_IOSQES(cc);
nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
unsigned size)
{
+ PCIDevice *pci = PCI_DEVICE(n);
uint64_t cap = ldq_le_p(&n->bar.cap);
uint32_t cc = ldl_le_p(&n->bar.cc);
uint32_t intms = ldl_le_p(&n->bar.intms);
switch (offset) {
case NVME_REG_INTMS:
- if (unlikely(msix_enabled(&(n->parent_obj)))) {
+ if (unlikely(msix_enabled(pci))) {
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
"undefined access to interrupt mask set"
" when MSI-X is enabled");
nvme_irq_check(n);
break;
case NVME_REG_INTMC:
- if (unlikely(msix_enabled(&(n->parent_obj)))) {
+ if (unlikely(msix_enabled(pci))) {
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
"undefined access to interrupt mask clr"
" when MSI-X is enabled");
return 0;
}
- if (pci_is_vf(&n->parent_obj) && !nvme_sctrl(n)->scs &&
+ if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
addr != NVME_REG_CSTS) {
trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
return 0;
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
+ PCIDevice *pci = PCI_DEVICE(n);
uint32_t qid;
if (unlikely(addr & ((1 << 2) - 1))) {
/*
 * NVM Express v1.3d, Section 4.1 states: "If host software writes
* an invalid value to the Submission Queue Tail Doorbell or
- * Completion Queue Head Doorbell regiter and an Asynchronous Event
+ * Completion Queue Head Doorbell register and an Asynchronous Event
* Request command is outstanding, then an asynchronous event is
* posted to the Admin Completion Queue with a status code of
* Invalid Doorbell Write Value."
start_sqs = nvme_cq_full(cq) ? 1 : 0;
cq->head = new_head;
if (!qid && n->dbbuf_enabled) {
- pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
- sizeof(cq->head));
+ stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
}
if (start_sqs) {
NvmeSQueue *sq;
* including ones that run on Linux, are not updating Admin Queues,
* so we can't trust reading it for an appropriate sq tail.
*/
- pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
- sizeof(sq->tail));
+ stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
}
qemu_bh_schedule(sq->bh);
trace_pci_nvme_mmio_write(addr, data, size);
- if (pci_is_vf(&n->parent_obj) && !nvme_sctrl(n)->scs &&
+ if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
addr != NVME_REG_CSTS) {
trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
return;
},
};
-static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
+static bool nvme_check_params(NvmeCtrl *n, Error **errp)
{
NvmeParams *params = &n->params;
if (n->namespace.blkconf.blk && n->subsys) {
error_setg(errp, "subsystem support is unavailable with legacy "
"namespace ('drive' property)");
- return;
+ return false;
}
if (params->max_ioqpairs < 1 ||
params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
error_setg(errp, "max_ioqpairs must be between 1 and %d",
NVME_MAX_IOQPAIRS);
- return;
+ return false;
}
if (params->msix_qsize < 1 ||
params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
error_setg(errp, "msix_qsize must be between 1 and %d",
PCI_MSIX_FLAGS_QSIZE + 1);
- return;
+ return false;
}
if (!params->serial) {
error_setg(errp, "serial property not set");
- return;
+ return false;
}
if (n->pmr.dev) {
if (host_memory_backend_is_mapped(n->pmr.dev)) {
error_setg(errp, "can't use already busy memdev: %s",
object_get_canonical_path_component(OBJECT(n->pmr.dev)));
- return;
+ return false;
}
if (!is_power_of_2(n->pmr.dev->size)) {
error_setg(errp, "pmr backend size needs to be power of 2 in size");
- return;
+ return false;
}
host_memory_backend_set_mapped(n->pmr.dev, true);
if (n->params.zasl > n->params.mdts) {
error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
"than or equal to mdts (Maximum Data Transfer Size)");
- return;
+ return false;
}
if (!n->params.vsl) {
error_setg(errp, "vsl must be non-zero");
- return;
+ return false;
}
if (params->sriov_max_vfs) {
if (!n->subsys) {
error_setg(errp, "subsystem is required for the use of SR-IOV");
- return;
+ return false;
}
if (params->sriov_max_vfs > NVME_MAX_VFS) {
error_setg(errp, "sriov_max_vfs must be between 0 and %d",
NVME_MAX_VFS);
- return;
+ return false;
}
if (params->cmb_size_mb) {
error_setg(errp, "CMB is not supported with SR-IOV");
- return;
+ return false;
}
if (n->pmr.dev) {
error_setg(errp, "PMR is not supported with SR-IOV");
- return;
+ return false;
}
if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
" must be set for the use of SR-IOV");
- return;
+ return false;
}
if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
error_setg(errp, "sriov_vq_flexible must be greater than or equal"
" to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
- return;
+ return false;
}
if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
" greater than or equal to 2");
- return;
+ return false;
}
if (params->sriov_vi_flexible < params->sriov_max_vfs) {
error_setg(errp, "sriov_vi_flexible must be greater than or equal"
" to %d (sriov_max_vfs)", params->sriov_max_vfs);
- return;
+ return false;
}
if (params->msix_qsize < params->sriov_vi_flexible + 1) {
error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
" greater than or equal to 1");
- return;
+ return false;
}
if (params->sriov_max_vi_per_vf &&
error_setg(errp, "sriov_max_vi_per_vf must meet:"
" (sriov_max_vi_per_vf - 1) %% %d == 0 and"
" sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
- return;
+ return false;
}
if (params->sriov_max_vq_per_vf &&
error_setg(errp, "sriov_max_vq_per_vf must meet:"
" (sriov_max_vq_per_vf - 1) %% %d == 0 and"
" sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
- return;
+ return false;
}
}
+
+ return true;
}
static void nvme_init_state(NvmeCtrl *n)
NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
NvmeSecCtrlList *list = &n->sec_ctrl_list;
NvmeSecCtrlEntry *sctrl;
+ PCIDevice *pci = PCI_DEVICE(n);
uint8_t max_vfs;
int i;
- if (pci_is_vf(&n->parent_obj)) {
+ if (pci_is_vf(pci)) {
sctrl = nvme_sctrl(n);
max_vfs = 0;
n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
cap->cntlid = cpu_to_le16(n->cntlid);
cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
- if (pci_is_vf(&n->parent_obj)) {
+ if (pci_is_vf(pci)) {
cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
} else {
cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
cap->vqfrt / MAX(max_vfs, 1);
}
- if (pci_is_vf(&n->parent_obj)) {
+ if (pci_is_vf(pci)) {
cap->viprt = cpu_to_le16(n->conf_msix_qsize);
} else {
cap->viprt = cpu_to_le16(n->params.msix_qsize -
return 0;
}
-static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
+static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
+ ERRP_GUARD();
uint8_t *pci_conf = pci_dev->config;
uint64_t bar_size;
unsigned msix_table_offset, msix_pba_offset;
int ret;
- Error *err = NULL;
-
pci_conf[PCI_INTERRUPT_PIN] = 1;
pci_config_set_prog_interface(pci_conf, 0x2);
pcie_endpoint_cap_init(pci_dev, 0x80);
pcie_cap_flr_init(pci_dev);
if (n->params.sriov_max_vfs) {
- pcie_ari_init(pci_dev, 0x100, 1);
+ pcie_ari_init(pci_dev, 0x100);
}
/* add one to max_ioqpairs to account for the admin queue pair */
}
ret = msix_init(pci_dev, n->params.msix_qsize,
&n->bar0, 0, msix_table_offset,
- &n->bar0, 0, msix_pba_offset, 0, &err);
- if (ret < 0) {
- if (ret == -ENOTSUP) {
- warn_report_err(err);
- } else {
- error_propagate(errp, err);
- return ret;
- }
+ &n->bar0, 0, msix_pba_offset, 0, errp);
+ if (ret == -ENOTSUP) {
+ /* report that msix is not supported, but do not error out */
+ warn_report_err(*errp);
+ *errp = NULL;
+ } else if (ret < 0) {
+ /* propagate error to caller */
+ return false;
}
nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
nvme_init_sriov(n, pci_dev, 0x120);
}
- return 0;
+ return true;
}
static void nvme_init_subnqn(NvmeCtrl *n)
uint8_t *pci_conf = pci_dev->config;
uint64_t cap = ldq_le_p(&n->bar.cap);
NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
+ uint32_t ctratt;
id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
id->cntlid = cpu_to_le16(n->cntlid);
id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
- id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
+ ctratt = NVME_CTRATT_ELBAS;
id->rab = 6;
id->mdts = n->params.mdts;
id->ver = cpu_to_le32(NVME_SPEC_VER);
id->oacs =
- cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
+ cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
+ NVME_OACS_DIRECTIVES);
id->cntrltype = 0x1;
/*
id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
- id->sqes = (0x6 << 4) | 0x6;
- id->cqes = (0x4 << 4) | 0x4;
+ id->sqes = (NVME_SQES << 4) | NVME_SQES;
+ id->cqes = (NVME_CQES << 4) | NVME_CQES;
id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
NVME_ONCS_FEATURES | NVME_ONCS_DSM |
if (n->subsys) {
id->cmic |= NVME_CMIC_MULTI_CTRL;
+ ctratt |= NVME_CTRATT_ENDGRPS;
+
+ id->endgidmax = cpu_to_le16(0x1);
+
+ if (n->subsys->endgrp.fdp.enabled) {
+ ctratt |= NVME_CTRATT_FDPS;
+ }
}
+ id->ctratt = cpu_to_le32(ctratt);
+
NVME_CAP_SET_MQES(cap, 0x7ff);
NVME_CAP_SET_CQR(cap, 1);
NVME_CAP_SET_TO(cap, 0xf);
stl_le_p(&n->bar.vs, NVME_SPEC_VER);
n->bar.intmc = n->bar.intms = 0;
- if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
+ if (pci_is_vf(pci_dev) && !sctrl->scs) {
stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
}
}
static void nvme_realize(PCIDevice *pci_dev, Error **errp)
{
NvmeCtrl *n = NVME(pci_dev);
+ DeviceState *dev = DEVICE(pci_dev);
NvmeNamespace *ns;
- Error *local_err = NULL;
NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
if (pci_is_vf(pci_dev)) {
n->subsys = pn->subsys;
}
- nvme_check_constraints(n, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
+ if (!nvme_check_params(n, errp)) {
return;
}
- qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
- &pci_dev->qdev, n->parent_obj.qdev.id);
+ qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
if (nvme_init_subsys(n, errp)) {
- error_propagate(errp, local_err);
return;
}
nvme_init_state(n);
- if (nvme_init_pci(n, pci_dev, errp)) {
+ if (!nvme_init_pci(n, pci_dev, errp)) {
return;
}
nvme_init_ctrl(n, pci_dev);