]> git.proxmox.com Git - mirror_qemu.git/blame - hw/block/nvme.c
hw/block/nvme: factor out namespace setup
[mirror_qemu.git] / hw / block / nvme.c
CommitLineData
f3c507ad
KB
1/*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11/**
a896f7f2 12 * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
f3c507ad
KB
13 *
14 * http://www.nvmexpress.org/resources/
15 */
16
17/**
18 * Usage: add options:
19 * -drive file=<file>,if=none,id=<drive_id>
a896f7f2 20 * -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
7c895269 21 * cmb_size_mb=<cmb_size_mb[optional]>, \
6cf94132 22 * [pmrdev=<mem_backend_file_id>,] \
dce22c86 23 * max_ioqpairs=<N[optional]>
a896f7f2
SB
24 *
25 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
b2b2b67a 26 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
6cf94132
AJ
27 *
28 * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
29 * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
30 * both provided.
31 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
32 * For example:
33 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
34 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
f3c507ad
KB
35 */
36
80c71a24 37#include "qemu/osdep.h"
e8400cf3 38#include "qemu/units.h"
dce22c86 39#include "qemu/error-report.h"
a9c94277 40#include "hw/block/block.h"
a9c94277
MA
41#include "hw/pci/msix.h"
42#include "hw/pci/pci.h"
a27bd6c7 43#include "hw/qdev-properties.h"
d6454270 44#include "migration/vmstate.h"
33739c71 45#include "sysemu/sysemu.h"
da34e65c 46#include "qapi/error.h"
33739c71 47#include "qapi/visitor.h"
6cf94132 48#include "sysemu/hostmem.h"
4be74634 49#include "sysemu/block-backend.h"
bc2a2364 50#include "exec/memory.h"
1ee24514 51#include "qemu/log.h"
0b8fa32f 52#include "qemu/module.h"
6b39bad0 53#include "qemu/cutils.h"
1ee24514 54#include "trace.h"
f3c507ad
KB
55#include "nvme.h"
56
f7e8c23f
KJ
57#define NVME_REG_SIZE 0x1000
58#define NVME_DB_SIZE 4
59
1ee24514
DG
/*
 * Report a guest-triggered error twice: once through the trace event named
 * by @trace, and once through qemu_log_mask() under LOG_GUEST_ERROR, where
 * the message is prefixed with the event name and the calling function.
 */
#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)
66
f3c507ad
KB
67static void nvme_process_sq(void *opaque);
68
b4529c5c
KJ
69static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
70{
71 hwaddr low = n->ctrl_mem.addr;
72 hwaddr hi = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
73
74 return addr >= low && addr < hi;
75}
76
a896f7f2
SB
77static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
78{
e1731e81 79 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
a896f7f2 80 memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
b4529c5c 81 return;
a896f7f2 82 }
b4529c5c
KJ
83
84 pci_dma_read(&n->parent_obj, addr, buf, size);
a896f7f2
SB
85}
86
f3c507ad
KB
87static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
88{
dce22c86 89 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
f3c507ad
KB
90}
91
92static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
93{
dce22c86 94 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
f3c507ad
KB
95}
96
97static void nvme_inc_cq_tail(NvmeCQueue *cq)
98{
99 cq->tail++;
100 if (cq->tail >= cq->size) {
101 cq->tail = 0;
102 cq->phase = !cq->phase;
103 }
104}
105
106static void nvme_inc_sq_head(NvmeSQueue *sq)
107{
108 sq->head = (sq->head + 1) % sq->size;
109}
110
111static uint8_t nvme_cq_full(NvmeCQueue *cq)
112{
113 return (cq->tail + 1) % cq->size == cq->head;
114}
115
116static uint8_t nvme_sq_empty(NvmeSQueue *sq)
117{
118 return sq->head == sq->tail;
119}
120
5e9aa92e
HN
121static void nvme_irq_check(NvmeCtrl *n)
122{
123 if (msix_enabled(&(n->parent_obj))) {
124 return;
125 }
126 if (~n->bar.intms & n->irq_status) {
127 pci_irq_assert(&n->parent_obj);
128 } else {
129 pci_irq_deassert(&n->parent_obj);
130 }
131}
132
133static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
f3c507ad
KB
134{
135 if (cq->irq_enabled) {
136 if (msix_enabled(&(n->parent_obj))) {
6f4ee2e9 137 trace_pci_nvme_irq_msix(cq->vector);
f3c507ad
KB
138 msix_notify(&(n->parent_obj), cq->vector);
139 } else {
6f4ee2e9 140 trace_pci_nvme_irq_pin();
ca247d35
KJ
141 assert(cq->vector < 32);
142 n->irq_status |= 1 << cq->vector;
5e9aa92e 143 nvme_irq_check(n);
f3c507ad 144 }
1ee24514 145 } else {
6f4ee2e9 146 trace_pci_nvme_irq_masked();
f3c507ad
KB
147 }
148}
149
5e9aa92e
HN
150static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
151{
152 if (cq->irq_enabled) {
153 if (msix_enabled(&(n->parent_obj))) {
154 return;
155 } else {
ca247d35
KJ
156 assert(cq->vector < 32);
157 n->irq_status &= ~(1 << cq->vector);
5e9aa92e
HN
158 nvme_irq_check(n);
159 }
160 }
161}
162
b2b2b67a
SB
163static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
164 uint64_t prp2, uint32_t len, NvmeCtrl *n)
f3c507ad
KB
165{
166 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
167 trans_len = MIN(len, trans_len);
168 int num_prps = (len >> n->page_bits) + 1;
169
1ee24514 170 if (unlikely(!prp1)) {
6f4ee2e9 171 trace_pci_nvme_err_invalid_prp();
f3c507ad 172 return NVME_INVALID_FIELD | NVME_DNR;
e1731e81 173 } else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
b2b2b67a
SB
174 prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
175 qsg->nsg = 0;
176 qemu_iovec_init(iov, num_prps);
177 qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
178 } else {
179 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
180 qemu_sglist_add(qsg, prp1, trans_len);
f3c507ad 181 }
f3c507ad
KB
182 len -= trans_len;
183 if (len) {
1ee24514 184 if (unlikely(!prp2)) {
6f4ee2e9 185 trace_pci_nvme_err_invalid_prp2_missing();
f3c507ad
KB
186 goto unmap;
187 }
188 if (len > n->page_size) {
189 uint64_t prp_list[n->max_prp_ents];
190 uint32_t nents, prp_trans;
191 int i = 0;
192
193 nents = (len + n->page_size - 1) >> n->page_bits;
194 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
b2b2b67a 195 nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
f3c507ad
KB
196 while (len != 0) {
197 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
198
199 if (i == n->max_prp_ents - 1 && len > n->page_size) {
1ee24514 200 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
6f4ee2e9 201 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
f3c507ad
KB
202 goto unmap;
203 }
204
205 i = 0;
206 nents = (len + n->page_size - 1) >> n->page_bits;
207 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
b2b2b67a 208 nvme_addr_read(n, prp_ent, (void *)prp_list,
f3c507ad
KB
209 prp_trans);
210 prp_ent = le64_to_cpu(prp_list[i]);
211 }
212
1ee24514 213 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
6f4ee2e9 214 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
f3c507ad
KB
215 goto unmap;
216 }
217
218 trans_len = MIN(len, n->page_size);
b2b2b67a
SB
219 if (qsg->nsg){
220 qemu_sglist_add(qsg, prp_ent, trans_len);
221 } else {
222 qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
223 }
f3c507ad
KB
224 len -= trans_len;
225 i++;
226 }
227 } else {
1ee24514 228 if (unlikely(prp2 & (n->page_size - 1))) {
6f4ee2e9 229 trace_pci_nvme_err_invalid_prp2_align(prp2);
f3c507ad
KB
230 goto unmap;
231 }
b2b2b67a
SB
232 if (qsg->nsg) {
233 qemu_sglist_add(qsg, prp2, len);
234 } else {
235 qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
236 }
f3c507ad
KB
237 }
238 }
239 return NVME_SUCCESS;
240
241 unmap:
242 qemu_sglist_destroy(qsg);
243 return NVME_INVALID_FIELD | NVME_DNR;
244}
245
3036a626
KH
246static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
247 uint64_t prp1, uint64_t prp2)
248{
249 QEMUSGList qsg;
250 QEMUIOVector iov;
251 uint16_t status = NVME_SUCCESS;
252
253 if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
254 return NVME_INVALID_FIELD | NVME_DNR;
255 }
256 if (qsg.nsg > 0) {
257 if (dma_buf_write(ptr, len, &qsg)) {
258 status = NVME_INVALID_FIELD | NVME_DNR;
259 }
260 qemu_sglist_destroy(&qsg);
261 } else {
262 if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
263 status = NVME_INVALID_FIELD | NVME_DNR;
264 }
265 qemu_iovec_destroy(&iov);
266 }
267 return status;
268}
269
f3c507ad
KB
270static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
271 uint64_t prp1, uint64_t prp2)
272{
273 QEMUSGList qsg;
b2b2b67a
SB
274 QEMUIOVector iov;
275 uint16_t status = NVME_SUCCESS;
f3c507ad 276
6f4ee2e9 277 trace_pci_nvme_dma_read(prp1, prp2);
1ee24514 278
b2b2b67a 279 if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
f3c507ad
KB
280 return NVME_INVALID_FIELD | NVME_DNR;
281 }
b2b2b67a 282 if (qsg.nsg > 0) {
1ee24514 283 if (unlikely(dma_buf_read(ptr, len, &qsg))) {
6f4ee2e9 284 trace_pci_nvme_err_invalid_dma();
b2b2b67a
SB
285 status = NVME_INVALID_FIELD | NVME_DNR;
286 }
f3c507ad 287 qemu_sglist_destroy(&qsg);
b2b2b67a 288 } else {
25349e82 289 if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
6f4ee2e9 290 trace_pci_nvme_err_invalid_dma();
b2b2b67a
SB
291 status = NVME_INVALID_FIELD | NVME_DNR;
292 }
293 qemu_iovec_destroy(&iov);
f3c507ad 294 }
b2b2b67a 295 return status;
f3c507ad
KB
296}
297
298static void nvme_post_cqes(void *opaque)
299{
300 NvmeCQueue *cq = opaque;
301 NvmeCtrl *n = cq->ctrl;
302 NvmeRequest *req, *next;
303
304 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
305 NvmeSQueue *sq;
306 hwaddr addr;
307
308 if (nvme_cq_full(cq)) {
309 break;
310 }
311
312 QTAILQ_REMOVE(&cq->req_list, req, entry);
313 sq = req->sq;
314 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
315 req->cqe.sq_id = cpu_to_le16(sq->sqid);
316 req->cqe.sq_head = cpu_to_le16(sq->head);
317 addr = cq->dma_addr + cq->tail * n->cqe_size;
318 nvme_inc_cq_tail(cq);
319 pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
320 sizeof(req->cqe));
321 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
322 }
6da02181
KB
323 if (cq->tail != cq->head) {
324 nvme_irq_assert(n, cq);
325 }
f3c507ad
KB
326}
327
328static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
329{
330 assert(cq->cqid == req->sq->cqid);
331 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
332 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
bc72ad67 333 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad
KB
334}
335
336static void nvme_rw_cb(void *opaque, int ret)
337{
338 NvmeRequest *req = opaque;
339 NvmeSQueue *sq = req->sq;
340 NvmeCtrl *n = sq->ctrl;
341 NvmeCQueue *cq = n->cq[sq->cqid];
342
f3c507ad 343 if (!ret) {
1753f3dc 344 block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
f3c507ad
KB
345 req->status = NVME_SUCCESS;
346 } else {
1753f3dc 347 block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
f3c507ad
KB
348 req->status = NVME_INTERNAL_DEV_ERROR;
349 }
8b9d74e0
CH
350 if (req->has_sg) {
351 qemu_sglist_destroy(&req->qsg);
352 }
f3c507ad
KB
353 nvme_enqueue_req_completion(cq, req);
354}
355
8b9d74e0
CH
356static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
357 NvmeRequest *req)
358{
359 req->has_sg = false;
360 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
361 BLOCK_ACCT_FLUSH);
362 req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
363
364 return NVME_NO_COMPLETE;
365}
366
c03e7ef1
CH
367static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
368 NvmeRequest *req)
369{
370 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
371 const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
372 const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
373 uint64_t slba = le64_to_cpu(rw->slba);
374 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
9d6459d2
KB
375 uint64_t offset = slba << data_shift;
376 uint32_t count = nlb << data_shift;
c03e7ef1 377
1ee24514 378 if (unlikely(slba + nlb > ns->id_ns.nsze)) {
6f4ee2e9 379 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
c03e7ef1
CH
380 return NVME_LBA_RANGE | NVME_DNR;
381 }
382
383 req->has_sg = false;
384 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
385 BLOCK_ACCT_WRITE);
9d6459d2 386 req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
c03e7ef1
CH
387 BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
388 return NVME_NO_COMPLETE;
389}
390
f3c507ad
KB
391static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
392 NvmeRequest *req)
393{
394 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
395 uint32_t nlb = le32_to_cpu(rw->nlb) + 1;
396 uint64_t slba = le64_to_cpu(rw->slba);
397 uint64_t prp1 = le64_to_cpu(rw->prp1);
398 uint64_t prp2 = le64_to_cpu(rw->prp2);
399
400 uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
401 uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
2115f2a1 402 uint64_t data_size = (uint64_t)nlb << data_shift;
cbe0ed62 403 uint64_t data_offset = slba << data_shift;
f3c507ad 404 int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
1753f3dc 405 enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
f3c507ad 406
6f4ee2e9 407 trace_pci_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
1ee24514
DG
408
409 if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
1753f3dc 410 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
6f4ee2e9 411 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
f3c507ad
KB
412 return NVME_LBA_RANGE | NVME_DNR;
413 }
1753f3dc 414
b2b2b67a 415 if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
1753f3dc 416 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
f3c507ad
KB
417 return NVME_INVALID_FIELD | NVME_DNR;
418 }
1753f3dc 419
1753f3dc 420 dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
b2b2b67a
SB
421 if (req->qsg.nsg > 0) {
422 req->has_sg = true;
423 req->aiocb = is_write ?
424 dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
425 nvme_rw_cb, req) :
426 dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
427 nvme_rw_cb, req);
428 } else {
429 req->has_sg = false;
430 req->aiocb = is_write ?
431 blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
432 req) :
433 blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
434 req);
435 }
f3c507ad
KB
436
437 return NVME_NO_COMPLETE;
438}
439
440static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
441{
442 NvmeNamespace *ns;
443 uint32_t nsid = le32_to_cpu(cmd->nsid);
444
1ee24514 445 if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
6f4ee2e9 446 trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces);
f3c507ad
KB
447 return NVME_INVALID_NSID | NVME_DNR;
448 }
449
450 ns = &n->namespaces[nsid - 1];
451 switch (cmd->opcode) {
452 case NVME_CMD_FLUSH:
8b9d74e0 453 return nvme_flush(n, ns, cmd, req);
c03e7ef1
CH
454 case NVME_CMD_WRITE_ZEROS:
455 return nvme_write_zeros(n, ns, cmd, req);
f3c507ad
KB
456 case NVME_CMD_WRITE:
457 case NVME_CMD_READ:
458 return nvme_rw(n, ns, cmd, req);
459 default:
6f4ee2e9 460 trace_pci_nvme_err_invalid_opc(cmd->opcode);
f3c507ad
KB
461 return NVME_INVALID_OPCODE | NVME_DNR;
462 }
463}
464
465static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
466{
467 n->sq[sq->sqid] = NULL;
bc72ad67
AB
468 timer_del(sq->timer);
469 timer_free(sq->timer);
f3c507ad
KB
470 g_free(sq->io_req);
471 if (sq->sqid) {
472 g_free(sq);
473 }
474}
475
476static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
477{
478 NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
479 NvmeRequest *req, *next;
480 NvmeSQueue *sq;
481 NvmeCQueue *cq;
482 uint16_t qid = le16_to_cpu(c->qid);
483
1ee24514 484 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
6f4ee2e9 485 trace_pci_nvme_err_invalid_del_sq(qid);
f3c507ad
KB
486 return NVME_INVALID_QID | NVME_DNR;
487 }
488
6f4ee2e9 489 trace_pci_nvme_del_sq(qid);
1ee24514 490
f3c507ad
KB
491 sq = n->sq[qid];
492 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
493 req = QTAILQ_FIRST(&sq->out_req_list);
494 assert(req->aiocb);
4be74634 495 blk_aio_cancel(req->aiocb);
f3c507ad
KB
496 }
497 if (!nvme_check_cqid(n, sq->cqid)) {
498 cq = n->cq[sq->cqid];
499 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
500
501 nvme_post_cqes(cq);
502 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
503 if (req->sq == sq) {
504 QTAILQ_REMOVE(&cq->req_list, req, entry);
505 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
506 }
507 }
508 }
509
510 nvme_free_sq(sq, n);
511 return NVME_SUCCESS;
512}
513
514static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
515 uint16_t sqid, uint16_t cqid, uint16_t size)
516{
517 int i;
518 NvmeCQueue *cq;
519
520 sq->ctrl = n;
521 sq->dma_addr = dma_addr;
522 sq->sqid = sqid;
523 sq->size = size;
524 sq->cqid = cqid;
525 sq->head = sq->tail = 0;
02c4f26b 526 sq->io_req = g_new(NvmeRequest, sq->size);
f3c507ad
KB
527
528 QTAILQ_INIT(&sq->req_list);
529 QTAILQ_INIT(&sq->out_req_list);
530 for (i = 0; i < sq->size; i++) {
531 sq->io_req[i].sq = sq;
532 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
533 }
bc72ad67 534 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
f3c507ad
KB
535
536 assert(n->cq[cqid]);
537 cq = n->cq[cqid];
538 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
539 n->sq[sqid] = sq;
540}
541
542static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
543{
544 NvmeSQueue *sq;
545 NvmeCreateSq *c = (NvmeCreateSq *)cmd;
546
547 uint16_t cqid = le16_to_cpu(c->cqid);
548 uint16_t sqid = le16_to_cpu(c->sqid);
549 uint16_t qsize = le16_to_cpu(c->qsize);
550 uint16_t qflags = le16_to_cpu(c->sq_flags);
551 uint64_t prp1 = le64_to_cpu(c->prp1);
552
6f4ee2e9 553 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
1ee24514
DG
554
555 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
6f4ee2e9 556 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
f3c507ad
KB
557 return NVME_INVALID_CQID | NVME_DNR;
558 }
1ee24514 559 if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
6f4ee2e9 560 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
f3c507ad
KB
561 return NVME_INVALID_QID | NVME_DNR;
562 }
1ee24514 563 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
6f4ee2e9 564 trace_pci_nvme_err_invalid_create_sq_size(qsize);
f3c507ad
KB
565 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
566 }
1ee24514 567 if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
6f4ee2e9 568 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
f3c507ad
KB
569 return NVME_INVALID_FIELD | NVME_DNR;
570 }
1ee24514 571 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
6f4ee2e9 572 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
f3c507ad
KB
573 return NVME_INVALID_FIELD | NVME_DNR;
574 }
575 sq = g_malloc0(sizeof(*sq));
576 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
577 return NVME_SUCCESS;
578}
579
580static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
581{
582 n->cq[cq->cqid] = NULL;
bc72ad67
AB
583 timer_del(cq->timer);
584 timer_free(cq->timer);
f3c507ad
KB
585 msix_vector_unuse(&n->parent_obj, cq->vector);
586 if (cq->cqid) {
587 g_free(cq);
588 }
589}
590
591static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
592{
593 NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
594 NvmeCQueue *cq;
595 uint16_t qid = le16_to_cpu(c->qid);
596
1ee24514 597 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
6f4ee2e9 598 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
f3c507ad
KB
599 return NVME_INVALID_CQID | NVME_DNR;
600 }
601
602 cq = n->cq[qid];
1ee24514 603 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
6f4ee2e9 604 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
f3c507ad
KB
605 return NVME_INVALID_QUEUE_DEL;
606 }
ad3a7e45 607 nvme_irq_deassert(n, cq);
6f4ee2e9 608 trace_pci_nvme_del_cq(qid);
f3c507ad
KB
609 nvme_free_cq(cq, n);
610 return NVME_SUCCESS;
611}
612
613static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
614 uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
615{
616 cq->ctrl = n;
617 cq->cqid = cqid;
618 cq->size = size;
619 cq->dma_addr = dma_addr;
620 cq->phase = 1;
621 cq->irq_enabled = irq_enabled;
622 cq->vector = vector;
623 cq->head = cq->tail = 0;
624 QTAILQ_INIT(&cq->req_list);
625 QTAILQ_INIT(&cq->sq_list);
626 msix_vector_use(&n->parent_obj, cq->vector);
627 n->cq[cqid] = cq;
bc72ad67 628 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
f3c507ad
KB
629}
630
631static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
632{
633 NvmeCQueue *cq;
634 NvmeCreateCq *c = (NvmeCreateCq *)cmd;
635 uint16_t cqid = le16_to_cpu(c->cqid);
636 uint16_t vector = le16_to_cpu(c->irq_vector);
637 uint16_t qsize = le16_to_cpu(c->qsize);
638 uint16_t qflags = le16_to_cpu(c->cq_flags);
639 uint64_t prp1 = le64_to_cpu(c->prp1);
640
6f4ee2e9
KJ
641 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
642 NVME_CQ_FLAGS_IEN(qflags) != 0);
1ee24514
DG
643
644 if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
6f4ee2e9 645 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
f3c507ad
KB
646 return NVME_INVALID_CQID | NVME_DNR;
647 }
1ee24514 648 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
6f4ee2e9 649 trace_pci_nvme_err_invalid_create_cq_size(qsize);
f3c507ad
KB
650 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
651 }
1ee24514 652 if (unlikely(!prp1)) {
6f4ee2e9 653 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
f3c507ad
KB
654 return NVME_INVALID_FIELD | NVME_DNR;
655 }
ca247d35
KJ
656 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
657 trace_pci_nvme_err_invalid_create_cq_vector(vector);
658 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
659 }
dce22c86 660 if (unlikely(vector > n->params.max_ioqpairs)) {
6f4ee2e9 661 trace_pci_nvme_err_invalid_create_cq_vector(vector);
f3c507ad
KB
662 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
663 }
1ee24514 664 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
6f4ee2e9 665 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
f3c507ad
KB
666 return NVME_INVALID_FIELD | NVME_DNR;
667 }
668
669 cq = g_malloc0(sizeof(*cq));
670 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
671 NVME_CQ_FLAGS_IEN(qflags));
672 return NVME_SUCCESS;
673}
674
03035a23
CH
675static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
676{
677 uint64_t prp1 = le64_to_cpu(c->prp1);
678 uint64_t prp2 = le64_to_cpu(c->prp2);
679
6f4ee2e9 680 trace_pci_nvme_identify_ctrl();
1ee24514 681
03035a23
CH
682 return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
683 prp1, prp2);
684}
685
686static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
f3c507ad
KB
687{
688 NvmeNamespace *ns;
f3c507ad
KB
689 uint32_t nsid = le32_to_cpu(c->nsid);
690 uint64_t prp1 = le64_to_cpu(c->prp1);
691 uint64_t prp2 = le64_to_cpu(c->prp2);
692
6f4ee2e9 693 trace_pci_nvme_identify_ns(nsid);
1ee24514
DG
694
695 if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
6f4ee2e9 696 trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces);
f3c507ad
KB
697 return NVME_INVALID_NSID | NVME_DNR;
698 }
699
700 ns = &n->namespaces[nsid - 1];
1ee24514 701
f3c507ad
KB
702 return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
703 prp1, prp2);
704}
705
03035a23
CH
706static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
707{
3e829fd4 708 static const int data_len = NVME_IDENTIFY_DATA_SIZE;
03035a23
CH
709 uint32_t min_nsid = le32_to_cpu(c->nsid);
710 uint64_t prp1 = le64_to_cpu(c->prp1);
711 uint64_t prp2 = le64_to_cpu(c->prp2);
712 uint32_t *list;
713 uint16_t ret;
714 int i, j = 0;
715
6f4ee2e9 716 trace_pci_nvme_identify_nslist(min_nsid);
1ee24514 717
03035a23
CH
718 list = g_malloc0(data_len);
719 for (i = 0; i < n->num_namespaces; i++) {
720 if (i < min_nsid) {
721 continue;
722 }
723 list[j++] = cpu_to_le32(i + 1);
724 if (j == data_len / sizeof(uint32_t)) {
725 break;
726 }
727 }
728 ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2);
729 g_free(list);
730 return ret;
731}
732
03035a23
CH
733static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
734{
735 NvmeIdentify *c = (NvmeIdentify *)cmd;
736
737 switch (le32_to_cpu(c->cns)) {
3e829fd4 738 case NVME_ID_CNS_NS:
03035a23 739 return nvme_identify_ns(n, c);
3e829fd4 740 case NVME_ID_CNS_CTRL:
03035a23 741 return nvme_identify_ctrl(n, c);
3e829fd4 742 case NVME_ID_CNS_NS_ACTIVE_LIST:
03035a23
CH
743 return nvme_identify_nslist(n, c);
744 default:
6f4ee2e9 745 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
03035a23
CH
746 return NVME_INVALID_FIELD | NVME_DNR;
747 }
748}
749
3036a626
KH
750static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
751{
6f4ee2e9 752 trace_pci_nvme_setfeat_timestamp(ts);
3036a626
KH
753
754 n->host_timestamp = le64_to_cpu(ts);
755 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
756}
757
758static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
759{
760 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
761 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
762
763 union nvme_timestamp {
764 struct {
765 uint64_t timestamp:48;
766 uint64_t sync:1;
767 uint64_t origin:3;
768 uint64_t rsvd1:12;
769 };
770 uint64_t all;
771 };
772
773 union nvme_timestamp ts;
774 ts.all = 0;
775
776 /*
777 * If the sum of the Timestamp value set by the host and the elapsed
778 * time exceeds 2^48, the value returned should be reduced modulo 2^48.
779 */
780 ts.timestamp = (n->host_timestamp + elapsed_time) & 0xffffffffffff;
781
782 /* If the host timestamp is non-zero, set the timestamp origin */
783 ts.origin = n->host_timestamp ? 0x01 : 0x00;
784
6f4ee2e9 785 trace_pci_nvme_getfeat_timestamp(ts.all);
3036a626
KH
786
787 return cpu_to_le64(ts.all);
788}
789
790static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
791{
792 uint64_t prp1 = le64_to_cpu(cmd->prp1);
793 uint64_t prp2 = le64_to_cpu(cmd->prp2);
794
795 uint64_t timestamp = nvme_get_timestamp(n);
796
797 return nvme_dma_read_prp(n, (uint8_t *)&timestamp,
798 sizeof(timestamp), prp1, prp2);
799}
800
f3c507ad
KB
801static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
802{
803 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
30349fd0 804 uint32_t result;
f3c507ad
KB
805
806 switch (dw10) {
aacd5650 807 case NVME_VOLATILE_WRITE_CACHE:
30349fd0 808 result = blk_enable_write_cache(n->conf.blk);
6f4ee2e9 809 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
30349fd0
CH
810 break;
811 case NVME_NUMBER_OF_QUEUES:
dce22c86
KJ
812 result = cpu_to_le32((n->params.max_ioqpairs - 1) |
813 ((n->params.max_ioqpairs - 1) << 16));
6f4ee2e9 814 trace_pci_nvme_getfeat_numq(result);
aacd5650 815 break;
3036a626
KH
816 case NVME_TIMESTAMP:
817 return nvme_get_feature_timestamp(n, cmd);
f3c507ad 818 default:
6f4ee2e9 819 trace_pci_nvme_err_invalid_getfeat(dw10);
f3c507ad
KB
820 return NVME_INVALID_FIELD | NVME_DNR;
821 }
30349fd0
CH
822
823 req->cqe.result = result;
f3c507ad
KB
824 return NVME_SUCCESS;
825}
826
3036a626
KH
827static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
828{
829 uint16_t ret;
830 uint64_t timestamp;
831 uint64_t prp1 = le64_to_cpu(cmd->prp1);
832 uint64_t prp2 = le64_to_cpu(cmd->prp2);
833
834 ret = nvme_dma_write_prp(n, (uint8_t *)&timestamp,
835 sizeof(timestamp), prp1, prp2);
836 if (ret != NVME_SUCCESS) {
837 return ret;
838 }
839
840 nvme_set_timestamp(n, timestamp);
841
842 return NVME_SUCCESS;
843}
844
f3c507ad
KB
845static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
846{
847 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
30349fd0 848 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
f3c507ad
KB
849
850 switch (dw10) {
30349fd0
CH
851 case NVME_VOLATILE_WRITE_CACHE:
852 blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
853 break;
f3c507ad 854 case NVME_NUMBER_OF_QUEUES:
6f4ee2e9
KJ
855 trace_pci_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
856 ((dw11 >> 16) & 0xFFFF) + 1,
dce22c86
KJ
857 n->params.max_ioqpairs,
858 n->params.max_ioqpairs);
859 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
860 ((n->params.max_ioqpairs - 1) << 16));
f3c507ad 861 break;
3036a626
KH
862 case NVME_TIMESTAMP:
863 return nvme_set_feature_timestamp(n, cmd);
f3c507ad 864 default:
6f4ee2e9 865 trace_pci_nvme_err_invalid_setfeat(dw10);
f3c507ad
KB
866 return NVME_INVALID_FIELD | NVME_DNR;
867 }
868 return NVME_SUCCESS;
869}
870
871static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
872{
873 switch (cmd->opcode) {
874 case NVME_ADM_CMD_DELETE_SQ:
875 return nvme_del_sq(n, cmd);
876 case NVME_ADM_CMD_CREATE_SQ:
877 return nvme_create_sq(n, cmd);
878 case NVME_ADM_CMD_DELETE_CQ:
879 return nvme_del_cq(n, cmd);
880 case NVME_ADM_CMD_CREATE_CQ:
881 return nvme_create_cq(n, cmd);
882 case NVME_ADM_CMD_IDENTIFY:
883 return nvme_identify(n, cmd);
884 case NVME_ADM_CMD_SET_FEATURES:
885 return nvme_set_feature(n, cmd, req);
886 case NVME_ADM_CMD_GET_FEATURES:
887 return nvme_get_feature(n, cmd, req);
888 default:
6f4ee2e9 889 trace_pci_nvme_err_invalid_admin_opc(cmd->opcode);
f3c507ad
KB
890 return NVME_INVALID_OPCODE | NVME_DNR;
891 }
892}
893
894static void nvme_process_sq(void *opaque)
895{
896 NvmeSQueue *sq = opaque;
897 NvmeCtrl *n = sq->ctrl;
898 NvmeCQueue *cq = n->cq[sq->cqid];
899
900 uint16_t status;
901 hwaddr addr;
902 NvmeCmd cmd;
903 NvmeRequest *req;
904
905 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
906 addr = sq->dma_addr + sq->head * n->sqe_size;
a896f7f2 907 nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
f3c507ad
KB
908 nvme_inc_sq_head(sq);
909
910 req = QTAILQ_FIRST(&sq->req_list);
911 QTAILQ_REMOVE(&sq->req_list, req, entry);
912 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
913 memset(&req->cqe, 0, sizeof(req->cqe));
914 req->cqe.cid = cmd.cid;
915
916 status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
917 nvme_admin_cmd(n, &cmd, req);
918 if (status != NVME_NO_COMPLETE) {
919 req->status = status;
920 nvme_enqueue_req_completion(cq, req);
921 }
922 }
923}
924
925static void nvme_clear_ctrl(NvmeCtrl *n)
926{
927 int i;
928
6bf74636
ID
929 blk_drain(n->conf.blk);
930
dce22c86 931 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
f3c507ad
KB
932 if (n->sq[i] != NULL) {
933 nvme_free_sq(n->sq[i], n);
934 }
935 }
dce22c86 936 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
f3c507ad
KB
937 if (n->cq[i] != NULL) {
938 nvme_free_cq(n->cq[i], n);
939 }
940 }
941
4be74634 942 blk_flush(n->conf.blk);
f3c507ad
KB
943 n->bar.cc = 0;
944}
945
946static int nvme_start_ctrl(NvmeCtrl *n)
947{
948 uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
949 uint32_t page_size = 1 << page_bits;
950
1ee24514 951 if (unlikely(n->cq[0])) {
6f4ee2e9 952 trace_pci_nvme_err_startfail_cq();
1ee24514
DG
953 return -1;
954 }
955 if (unlikely(n->sq[0])) {
6f4ee2e9 956 trace_pci_nvme_err_startfail_sq();
1ee24514
DG
957 return -1;
958 }
959 if (unlikely(!n->bar.asq)) {
6f4ee2e9 960 trace_pci_nvme_err_startfail_nbarasq();
1ee24514
DG
961 return -1;
962 }
963 if (unlikely(!n->bar.acq)) {
6f4ee2e9 964 trace_pci_nvme_err_startfail_nbaracq();
1ee24514
DG
965 return -1;
966 }
967 if (unlikely(n->bar.asq & (page_size - 1))) {
6f4ee2e9 968 trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq);
1ee24514
DG
969 return -1;
970 }
971 if (unlikely(n->bar.acq & (page_size - 1))) {
6f4ee2e9 972 trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq);
1ee24514
DG
973 return -1;
974 }
975 if (unlikely(NVME_CC_MPS(n->bar.cc) <
976 NVME_CAP_MPSMIN(n->bar.cap))) {
6f4ee2e9 977 trace_pci_nvme_err_startfail_page_too_small(
1ee24514
DG
978 NVME_CC_MPS(n->bar.cc),
979 NVME_CAP_MPSMIN(n->bar.cap));
980 return -1;
981 }
982 if (unlikely(NVME_CC_MPS(n->bar.cc) >
983 NVME_CAP_MPSMAX(n->bar.cap))) {
6f4ee2e9 984 trace_pci_nvme_err_startfail_page_too_large(
1ee24514
DG
985 NVME_CC_MPS(n->bar.cc),
986 NVME_CAP_MPSMAX(n->bar.cap));
987 return -1;
988 }
989 if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
990 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
6f4ee2e9 991 trace_pci_nvme_err_startfail_cqent_too_small(
1ee24514
DG
992 NVME_CC_IOCQES(n->bar.cc),
993 NVME_CTRL_CQES_MIN(n->bar.cap));
994 return -1;
995 }
996 if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
997 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
6f4ee2e9 998 trace_pci_nvme_err_startfail_cqent_too_large(
1ee24514
DG
999 NVME_CC_IOCQES(n->bar.cc),
1000 NVME_CTRL_CQES_MAX(n->bar.cap));
1001 return -1;
1002 }
1003 if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
1004 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
6f4ee2e9 1005 trace_pci_nvme_err_startfail_sqent_too_small(
1ee24514
DG
1006 NVME_CC_IOSQES(n->bar.cc),
1007 NVME_CTRL_SQES_MIN(n->bar.cap));
1008 return -1;
1009 }
1010 if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
1011 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
6f4ee2e9 1012 trace_pci_nvme_err_startfail_sqent_too_large(
1ee24514
DG
1013 NVME_CC_IOSQES(n->bar.cc),
1014 NVME_CTRL_SQES_MAX(n->bar.cap));
1015 return -1;
1016 }
1017 if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
6f4ee2e9 1018 trace_pci_nvme_err_startfail_asqent_sz_zero();
1ee24514
DG
1019 return -1;
1020 }
1021 if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
6f4ee2e9 1022 trace_pci_nvme_err_startfail_acqent_sz_zero();
f3c507ad
KB
1023 return -1;
1024 }
1025
1026 n->page_bits = page_bits;
1027 n->page_size = page_size;
1028 n->max_prp_ents = n->page_size / sizeof(uint64_t);
1029 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
1030 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
1031 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
1032 NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
1033 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
1034 NVME_AQA_ASQS(n->bar.aqa) + 1);
1035
3036a626
KH
1036 nvme_set_timestamp(n, 0ULL);
1037
f3c507ad
KB
1038 return 0;
1039}
1040
1041static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
1042 unsigned size)
1043{
1ee24514 1044 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
6f4ee2e9 1045 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
1ee24514
DG
1046 "MMIO write not 32-bit aligned,"
1047 " offset=0x%"PRIx64"", offset);
1048 /* should be ignored, fall through for now */
1049 }
1050
1051 if (unlikely(size < sizeof(uint32_t))) {
6f4ee2e9 1052 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
1ee24514
DG
1053 "MMIO write smaller than 32-bits,"
1054 " offset=0x%"PRIx64", size=%u",
1055 offset, size);
1056 /* should be ignored, fall through for now */
1057 }
1058
f3c507ad 1059 switch (offset) {
1ee24514
DG
1060 case 0xc: /* INTMS */
1061 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6f4ee2e9 1062 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
1ee24514
DG
1063 "undefined access to interrupt mask set"
1064 " when MSI-X is enabled");
1065 /* should be ignored, fall through for now */
1066 }
f3c507ad
KB
1067 n->bar.intms |= data & 0xffffffff;
1068 n->bar.intmc = n->bar.intms;
6f4ee2e9 1069 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc);
5e9aa92e 1070 nvme_irq_check(n);
f3c507ad 1071 break;
1ee24514
DG
1072 case 0x10: /* INTMC */
1073 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6f4ee2e9 1074 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
1ee24514
DG
1075 "undefined access to interrupt mask clr"
1076 " when MSI-X is enabled");
1077 /* should be ignored, fall through for now */
1078 }
f3c507ad
KB
1079 n->bar.intms &= ~(data & 0xffffffff);
1080 n->bar.intmc = n->bar.intms;
6f4ee2e9 1081 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc);
5e9aa92e 1082 nvme_irq_check(n);
f3c507ad 1083 break;
1ee24514 1084 case 0x14: /* CC */
6f4ee2e9 1085 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
4a4d614f
DS
1086 /* Windows first sends data, then sends enable bit */
1087 if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
1088 !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
1089 {
1090 n->bar.cc = data;
1091 }
1092
f3c507ad
KB
1093 if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
1094 n->bar.cc = data;
1ee24514 1095 if (unlikely(nvme_start_ctrl(n))) {
6f4ee2e9 1096 trace_pci_nvme_err_startfail();
f3c507ad
KB
1097 n->bar.csts = NVME_CSTS_FAILED;
1098 } else {
6f4ee2e9 1099 trace_pci_nvme_mmio_start_success();
f3c507ad
KB
1100 n->bar.csts = NVME_CSTS_READY;
1101 }
1102 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
6f4ee2e9 1103 trace_pci_nvme_mmio_stopped();
f3c507ad
KB
1104 nvme_clear_ctrl(n);
1105 n->bar.csts &= ~NVME_CSTS_READY;
1106 }
1107 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
6f4ee2e9 1108 trace_pci_nvme_mmio_shutdown_set();
1ee24514
DG
1109 nvme_clear_ctrl(n);
1110 n->bar.cc = data;
1111 n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
f3c507ad 1112 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
6f4ee2e9 1113 trace_pci_nvme_mmio_shutdown_cleared();
1ee24514
DG
1114 n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
1115 n->bar.cc = data;
1116 }
1117 break;
1118 case 0x1C: /* CSTS */
1119 if (data & (1 << 4)) {
6f4ee2e9 1120 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
1ee24514
DG
1121 "attempted to W1C CSTS.NSSRO"
1122 " but CAP.NSSRS is zero (not supported)");
1123 } else if (data != 0) {
6f4ee2e9 1124 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
1ee24514
DG
1125 "attempted to set a read only bit"
1126 " of controller status");
1127 }
1128 break;
1129 case 0x20: /* NSSR */
1130 if (data == 0x4E564D65) {
6f4ee2e9 1131 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
1ee24514
DG
1132 } else {
1133 /* The spec says that writes of other values have no effect */
1134 return;
f3c507ad
KB
1135 }
1136 break;
1ee24514 1137 case 0x24: /* AQA */
f3c507ad 1138 n->bar.aqa = data & 0xffffffff;
6f4ee2e9 1139 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
f3c507ad 1140 break;
1ee24514 1141 case 0x28: /* ASQ */
f3c507ad 1142 n->bar.asq = data;
6f4ee2e9 1143 trace_pci_nvme_mmio_asqaddr(data);
f3c507ad 1144 break;
1ee24514 1145 case 0x2c: /* ASQ hi */
f3c507ad 1146 n->bar.asq |= data << 32;
6f4ee2e9 1147 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
f3c507ad 1148 break;
1ee24514 1149 case 0x30: /* ACQ */
6f4ee2e9 1150 trace_pci_nvme_mmio_acqaddr(data);
f3c507ad
KB
1151 n->bar.acq = data;
1152 break;
1ee24514 1153 case 0x34: /* ACQ hi */
f3c507ad 1154 n->bar.acq |= data << 32;
6f4ee2e9 1155 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
f3c507ad 1156 break;
1ee24514 1157 case 0x38: /* CMBLOC */
6f4ee2e9 1158 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
1ee24514
DG
1159 "invalid write to reserved CMBLOC"
1160 " when CMBSZ is zero, ignored");
1161 return;
1162 case 0x3C: /* CMBSZ */
6f4ee2e9 1163 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
1ee24514
DG
1164 "invalid write to read only CMBSZ, ignored");
1165 return;
6cf94132 1166 case 0xE00: /* PMRCAP */
6f4ee2e9 1167 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
6cf94132
AJ
1168 "invalid write to PMRCAP register, ignored");
1169 return;
1170 case 0xE04: /* TODO PMRCTL */
1171 break;
1172 case 0xE08: /* PMRSTS */
6f4ee2e9 1173 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
6cf94132
AJ
1174 "invalid write to PMRSTS register, ignored");
1175 return;
1176 case 0xE0C: /* PMREBS */
6f4ee2e9 1177 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
6cf94132
AJ
1178 "invalid write to PMREBS register, ignored");
1179 return;
1180 case 0xE10: /* PMRSWTP */
6f4ee2e9 1181 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
6cf94132
AJ
1182 "invalid write to PMRSWTP register, ignored");
1183 return;
1184 case 0xE14: /* TODO PMRMSC */
1185 break;
f3c507ad 1186 default:
6f4ee2e9 1187 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
1ee24514
DG
1188 "invalid MMIO write,"
1189 " offset=0x%"PRIx64", data=%"PRIx64"",
1190 offset, data);
f3c507ad
KB
1191 break;
1192 }
1193}
1194
1195static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
1196{
1197 NvmeCtrl *n = (NvmeCtrl *)opaque;
1198 uint8_t *ptr = (uint8_t *)&n->bar;
1199 uint64_t val = 0;
1200
1ee24514 1201 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
6f4ee2e9 1202 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
1ee24514
DG
1203 "MMIO read not 32-bit aligned,"
1204 " offset=0x%"PRIx64"", addr);
1205 /* should RAZ, fall through for now */
1206 } else if (unlikely(size < sizeof(uint32_t))) {
6f4ee2e9 1207 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
1ee24514
DG
1208 "MMIO read smaller than 32-bits,"
1209 " offset=0x%"PRIx64"", addr);
1210 /* should RAZ, fall through for now */
1211 }
1212
f3c507ad 1213 if (addr < sizeof(n->bar)) {
6cf94132
AJ
1214 /*
1215 * When PMRWBM bit 1 is set then read from
1216 * from PMRSTS should ensure prior writes
1217 * made it to persistent media
1218 */
1219 if (addr == 0xE08 &&
1220 (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
bc2a2364 1221 memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
6cf94132 1222 }
f3c507ad 1223 memcpy(&val, ptr + addr, size);
1ee24514 1224 } else {
6f4ee2e9 1225 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
1ee24514
DG
1226 "MMIO read beyond last register,"
1227 " offset=0x%"PRIx64", returning 0", addr);
f3c507ad 1228 }
1ee24514 1229
f3c507ad
KB
1230 return val;
1231}
1232
1233static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
1234{
1235 uint32_t qid;
1236
1ee24514 1237 if (unlikely(addr & ((1 << 2) - 1))) {
6f4ee2e9 1238 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
1ee24514
DG
1239 "doorbell write not 32-bit aligned,"
1240 " offset=0x%"PRIx64", ignoring", addr);
f3c507ad
KB
1241 return;
1242 }
1243
1244 if (((addr - 0x1000) >> 2) & 1) {
1ee24514
DG
1245 /* Completion queue doorbell write */
1246
f3c507ad
KB
1247 uint16_t new_head = val & 0xffff;
1248 int start_sqs;
1249 NvmeCQueue *cq;
1250
1251 qid = (addr - (0x1000 + (1 << 2))) >> 3;
1ee24514 1252 if (unlikely(nvme_check_cqid(n, qid))) {
6f4ee2e9 1253 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
1ee24514
DG
1254 "completion queue doorbell write"
1255 " for nonexistent queue,"
1256 " sqid=%"PRIu32", ignoring", qid);
f3c507ad
KB
1257 return;
1258 }
1259
1260 cq = n->cq[qid];
1ee24514 1261 if (unlikely(new_head >= cq->size)) {
6f4ee2e9 1262 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
1ee24514
DG
1263 "completion queue doorbell write value"
1264 " beyond queue size, sqid=%"PRIu32","
1265 " new_head=%"PRIu16", ignoring",
1266 qid, new_head);
f3c507ad
KB
1267 return;
1268 }
1269
1270 start_sqs = nvme_cq_full(cq) ? 1 : 0;
1271 cq->head = new_head;
1272 if (start_sqs) {
1273 NvmeSQueue *sq;
1274 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
bc72ad67 1275 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad 1276 }
bc72ad67 1277 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad
KB
1278 }
1279
5e9aa92e
HN
1280 if (cq->tail == cq->head) {
1281 nvme_irq_deassert(n, cq);
f3c507ad
KB
1282 }
1283 } else {
1ee24514
DG
1284 /* Submission queue doorbell write */
1285
f3c507ad
KB
1286 uint16_t new_tail = val & 0xffff;
1287 NvmeSQueue *sq;
1288
1289 qid = (addr - 0x1000) >> 3;
1ee24514 1290 if (unlikely(nvme_check_sqid(n, qid))) {
6f4ee2e9 1291 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
1ee24514
DG
1292 "submission queue doorbell write"
1293 " for nonexistent queue,"
1294 " sqid=%"PRIu32", ignoring", qid);
f3c507ad
KB
1295 return;
1296 }
1297
1298 sq = n->sq[qid];
1ee24514 1299 if (unlikely(new_tail >= sq->size)) {
6f4ee2e9 1300 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
1ee24514
DG
1301 "submission queue doorbell write value"
1302 " beyond queue size, sqid=%"PRIu32","
1303 " new_tail=%"PRIu16", ignoring",
1304 qid, new_tail);
f3c507ad
KB
1305 return;
1306 }
1307
1308 sq->tail = new_tail;
bc72ad67 1309 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
f3c507ad
KB
1310 }
1311}
1312
1313static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
1314 unsigned size)
1315{
1316 NvmeCtrl *n = (NvmeCtrl *)opaque;
1317 if (addr < sizeof(n->bar)) {
1318 nvme_write_bar(n, addr, data, size);
1319 } else if (addr >= 0x1000) {
1320 nvme_process_db(n, addr, data);
1321 }
1322}
1323
1324static const MemoryRegionOps nvme_mmio_ops = {
1325 .read = nvme_mmio_read,
1326 .write = nvme_mmio_write,
1327 .endianness = DEVICE_LITTLE_ENDIAN,
1328 .impl = {
1329 .min_access_size = 2,
1330 .max_access_size = 8,
1331 },
1332};
1333
a896f7f2
SB
1334static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
1335 unsigned size)
1336{
1337 NvmeCtrl *n = (NvmeCtrl *)opaque;
71a86dde 1338 stn_le_p(&n->cmbuf[addr], size, data);
a896f7f2
SB
1339}
1340
1341static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
1342{
a896f7f2 1343 NvmeCtrl *n = (NvmeCtrl *)opaque;
71a86dde 1344 return ldn_le_p(&n->cmbuf[addr], size);
a896f7f2
SB
1345}
1346
1347static const MemoryRegionOps nvme_cmb_ops = {
1348 .read = nvme_cmb_read,
1349 .write = nvme_cmb_write,
1350 .endianness = DEVICE_LITTLE_ENDIAN,
1351 .impl = {
87ad860c 1352 .min_access_size = 1,
a896f7f2
SB
1353 .max_access_size = 8,
1354 },
1355};
1356
54000c66 1357static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
f3c507ad 1358{
54000c66 1359 NvmeParams *params = &n->params;
f3c507ad 1360
54000c66 1361 if (params->num_queues) {
dce22c86
KJ
1362 warn_report("num_queues is deprecated; please use max_ioqpairs "
1363 "instead");
1364
54000c66 1365 params->max_ioqpairs = params->num_queues - 1;
dce22c86
KJ
1366 }
1367
54000c66
KJ
1368 if (params->max_ioqpairs < 1 ||
1369 params->max_ioqpairs > PCI_MSIX_FLAGS_QSIZE) {
dce22c86
KJ
1370 error_setg(errp, "max_ioqpairs must be between 1 and %d",
1371 PCI_MSIX_FLAGS_QSIZE);
2410e133
LQ
1372 return;
1373 }
1374
4be74634 1375 if (!n->conf.blk) {
e01d6a41
MZ
1376 error_setg(errp, "drive property not set");
1377 return;
f3c507ad
KB
1378 }
1379
54000c66 1380 if (!params->serial) {
e01d6a41
MZ
1381 error_setg(errp, "serial property not set");
1382 return;
f3c507ad 1383 }
6cf94132 1384
1065abfb 1385 if (!n->params.cmb_size_mb && n->pmrdev) {
6cf94132
AJ
1386 if (host_memory_backend_is_mapped(n->pmrdev)) {
1387 char *path = object_get_canonical_path_component(OBJECT(n->pmrdev));
1388 error_setg(errp, "can't use already busy memdev: %s", path);
1389 g_free(path);
1390 return;
1391 }
1392
1393 if (!is_power_of_2(n->pmrdev->size)) {
1394 error_setg(errp, "pmr backend size needs to be power of 2 in size");
1395 return;
1396 }
1397
1398 host_memory_backend_set_mapped(n->pmrdev, true);
1399 }
54000c66
KJ
1400}
1401
a17f5018
KJ
1402static void nvme_init_state(NvmeCtrl *n)
1403{
1404 n->num_namespaces = 1;
1405 /* add one to max_ioqpairs to account for the admin queue pair */
1406 n->reg_size = pow2ceil(NVME_REG_SIZE +
1407 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
1408 n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
1409 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
1410 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
1411}
1412
90f45115
KJ
1413static void nvme_init_blk(NvmeCtrl *n, Error **errp)
1414{
1415 blkconf_blocksizes(&n->conf);
1416 blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
1417 false, errp);
1418}
1419
d634d742
KJ
1420static void nvme_init_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
1421{
1422 int64_t bs_size;
1423 NvmeIdNs *id_ns = &ns->id_ns;
1424
1425 bs_size = blk_getlength(n->conf.blk);
1426 if (bs_size < 0) {
1427 error_setg_errno(errp, -bs_size, "could not get backing file size");
1428 return;
1429 }
1430
1431 n->ns_size = bs_size;
1432
1433 id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
1434 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(n, ns));
1435
1436 /* no thin provisioning */
1437 id_ns->ncap = id_ns->nsze;
1438 id_ns->nuse = id_ns->ncap;
1439}
1440
54000c66
KJ
1441static void nvme_realize(PCIDevice *pci_dev, Error **errp)
1442{
1443 NvmeCtrl *n = NVME(pci_dev);
1444 NvmeIdCtrl *id = &n->id_ctrl;
1445 Error *local_err = NULL;
1446
1447 int i;
54000c66
KJ
1448 uint8_t *pci_conf;
1449
1450 nvme_check_constraints(n, &local_err);
1451 if (local_err) {
1452 error_propagate(errp, local_err);
1453 return;
1454 }
1455
a17f5018
KJ
1456 nvme_init_state(n);
1457
90f45115
KJ
1458 nvme_init_blk(n, &local_err);
1459 if (local_err) {
1460 error_propagate(errp, local_err);
e01d6a41 1461 return;
a17c17a2 1462 }
f3c507ad
KB
1463
1464 pci_conf = pci_dev->config;
1465 pci_conf[PCI_INTERRUPT_PIN] = 1;
1466 pci_config_set_prog_interface(pci_dev->config, 0x2);
1467 pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
a3d25ddd 1468 pcie_endpoint_cap_init(pci_dev, 0x80);
f3c507ad 1469
2d256e6f
PB
1470 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
1471 "nvme", n->reg_size);
a3d25ddd 1472 pci_register_bar(pci_dev, 0,
f3c507ad
KB
1473 PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
1474 &n->iomem);
dce22c86 1475 msix_init_exclusive_bar(pci_dev, n->params.max_ioqpairs + 1, 4, NULL);
f3c507ad
KB
1476
1477 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
1478 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
1479 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
1480 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
1065abfb 1481 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
f3c507ad
KB
1482 id->rab = 6;
1483 id->ieee[0] = 0x00;
1484 id->ieee[1] = 0x02;
1485 id->ieee[2] = 0xb3;
1486 id->oacs = cpu_to_le16(0);
1487 id->frmw = 7 << 1;
1488 id->lpa = 1 << 0;
1489 id->sqes = (0x6 << 4) | 0x6;
1490 id->cqes = (0x4 << 4) | 0x4;
1491 id->nn = cpu_to_le32(n->num_namespaces);
3036a626 1492 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
f3c507ad
KB
1493 id->psd[0].mp = cpu_to_le16(0x9c4);
1494 id->psd[0].enlat = cpu_to_le32(0x10);
1495 id->psd[0].exlat = cpu_to_le32(0x4);
30349fd0
CH
1496 if (blk_enable_write_cache(n->conf.blk)) {
1497 id->vwc = 1;
1498 }
f3c507ad
KB
1499
1500 n->bar.cap = 0;
1501 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
1502 NVME_CAP_SET_CQR(n->bar.cap, 1);
f3c507ad
KB
1503 NVME_CAP_SET_TO(n->bar.cap, 0xf);
1504 NVME_CAP_SET_CSS(n->bar.cap, 1);
be0677a9 1505 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
f3c507ad 1506
a896f7f2 1507 n->bar.vs = 0x00010200;
f3c507ad
KB
1508 n->bar.intmc = n->bar.intms = 0;
1509
1065abfb 1510 if (n->params.cmb_size_mb) {
a896f7f2
SB
1511
1512 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
1513 NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
1514
1515 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
1516 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
1517 NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
b2b2b67a
SB
1518 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
1519 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
a896f7f2 1520 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
1065abfb 1521 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
a896f7f2
SB
1522
1523 n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
1524 memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
1525 "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
a3d25ddd 1526 pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
a896f7f2
SB
1527 PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
1528 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
1529
6cf94132
AJ
1530 } else if (n->pmrdev) {
1531 /* Controller Capabilities register */
1532 NVME_CAP_SET_PMRS(n->bar.cap, 1);
1533
1534 /* PMR Capabities register */
1535 n->bar.pmrcap = 0;
1536 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
1537 NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
1538 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, 2);
1539 NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
1540 /* Turn on bit 1 support */
1541 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
1542 NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
1543 NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
1544
1545 /* PMR Control register */
1546 n->bar.pmrctl = 0;
1547 NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
1548
1549 /* PMR Status register */
1550 n->bar.pmrsts = 0;
1551 NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
1552 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
1553 NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
1554 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
1555
1556 /* PMR Elasticity Buffer Size register */
1557 n->bar.pmrebs = 0;
1558 NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
1559 NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
1560 NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
1561
1562 /* PMR Sustained Write Throughput register */
1563 n->bar.pmrswtp = 0;
1564 NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
1565 NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
1566
1567 /* PMR Memory Space Control register */
1568 n->bar.pmrmsc = 0;
1569 NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
1570 NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
1571
1572 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
1573 PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
1574 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
a896f7f2
SB
1575 }
1576
f3c507ad 1577 for (i = 0; i < n->num_namespaces; i++) {
d634d742
KJ
1578 nvme_init_namespace(n, &n->namespaces[i], &local_err);
1579 if (local_err) {
1580 error_propagate(errp, local_err);
1581 return;
1582 }
f3c507ad 1583 }
f3c507ad
KB
1584}
1585
1586static void nvme_exit(PCIDevice *pci_dev)
1587{
1588 NvmeCtrl *n = NVME(pci_dev);
1589
1590 nvme_clear_ctrl(n);
1591 g_free(n->namespaces);
1592 g_free(n->cq);
1593 g_free(n->sq);
a896f7f2 1594
1065abfb 1595 if (n->params.cmb_size_mb) {
a883d6a0
LQ
1596 g_free(n->cmbuf);
1597 }
6cf94132
AJ
1598
1599 if (n->pmrdev) {
1600 host_memory_backend_set_mapped(n->pmrdev, false);
1601 }
f3c507ad 1602 msix_uninit_exclusive_bar(pci_dev);
f3c507ad
KB
1603}
1604
1605static Property nvme_props[] = {
1606 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
6cf94132
AJ
1607 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
1608 HostMemoryBackend *),
1065abfb
KJ
1609 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
1610 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
dce22c86
KJ
1611 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
1612 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
f3c507ad
KB
1613 DEFINE_PROP_END_OF_LIST(),
1614};
1615
1616static const VMStateDescription nvme_vmstate = {
1617 .name = "nvme",
1618 .unmigratable = 1,
1619};
1620
1621static void nvme_class_init(ObjectClass *oc, void *data)
1622{
1623 DeviceClass *dc = DEVICE_CLASS(oc);
1624 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
1625
e01d6a41 1626 pc->realize = nvme_realize;
f3c507ad
KB
1627 pc->exit = nvme_exit;
1628 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
1629 pc->vendor_id = PCI_VENDOR_ID_INTEL;
1630 pc->device_id = 0x5845;
47989f14 1631 pc->revision = 2;
f3c507ad 1632
125ee0ed 1633 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
f3c507ad 1634 dc->desc = "Non-Volatile Memory Express";
4f67d30b 1635 device_class_set_props(dc, nvme_props);
f3c507ad
KB
1636 dc->vmsd = &nvme_vmstate;
1637}
1638
a907ec52 1639static void nvme_instance_init(Object *obj)
33739c71
GA
1640{
1641 NvmeCtrl *s = NVME(obj);
33739c71 1642
a907ec52
LE
1643 device_add_bootindex_property(obj, &s->conf.bootindex,
1644 "bootindex", "/namespace@1,0",
40c2281c 1645 DEVICE(obj));
33739c71
GA
1646}
1647
f3c507ad 1648static const TypeInfo nvme_info = {
08db59e1 1649 .name = TYPE_NVME,
f3c507ad
KB
1650 .parent = TYPE_PCI_DEVICE,
1651 .instance_size = sizeof(NvmeCtrl),
1652 .class_init = nvme_class_init,
33739c71 1653 .instance_init = nvme_instance_init,
71d78767
EH
1654 .interfaces = (InterfaceInfo[]) {
1655 { INTERFACE_PCIE_DEVICE },
1656 { }
1657 },
f3c507ad
KB
1658};
1659
1660static void nvme_register_types(void)
1661{
1662 type_register_static(&nvme_info);
1663}
1664
1665type_init(nvme_register_types)