bdd6a90a
FZ
1/*
2 * NVMe block driver based on vfio
3 *
4 * Copyright 2016 - 2018 Red Hat, Inc.
5 *
6 * Authors:
7 * Fam Zheng <famz@redhat.com>
8 * Paolo Bonzini <pbonzini@redhat.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
11 * See the COPYING file in the top-level directory.
12 */
13
14#include "qemu/osdep.h"
15#include <linux/vfio.h>
16#include "qapi/error.h"
17#include "qapi/qmp/qdict.h"
18#include "qapi/qmp/qstring.h"
19#include "qemu/error-report.h"
db725815 20#include "qemu/main-loop.h"
0b8fa32f 21#include "qemu/module.h"
bdd6a90a 22#include "qemu/cutils.h"
922a01a0 23#include "qemu/option.h"
bdd6a90a
FZ
24#include "qemu/vfio-helpers.h"
25#include "block/block_int.h"
e4ec5ad4 26#include "sysemu/replay.h"
bdd6a90a
FZ
27#include "trace.h"
28
a3d9a352 29#include "block/nvme.h"
bdd6a90a
FZ
30
31#define NVME_SQ_ENTRY_BYTES 64
32#define NVME_CQ_ENTRY_BYTES 16
33#define NVME_QUEUE_SIZE 128
f6845323 34#define NVME_DOORBELL_SIZE 4096
bdd6a90a 35
1086e95d
SH
36/*
37 * We have to leave one slot empty: head == tail + 1 marks the full queue
38 * case, which must stay distinguishable from empty (head == tail).
39 */
40#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
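/*
 * Editorial sketch, not part of the original driver (helper names below are
 * hypothetical): the head/tail ring can only tell "empty" (head == tail)
 * apart from "full" (head == tail + 1, modulo the queue size) because one
 * slot is never used, which is why NVME_NUM_REQS is NVME_QUEUE_SIZE - 1.
 */
static inline bool nvme_example_ring_empty(int head, int tail)
{
    return head == tail;
}

static inline bool nvme_example_ring_full(int head, int tail)
{
    /* with NVME_QUEUE_SIZE == 128, at most 127 entries can be outstanding */
    return head == (tail + 1) % NVME_QUEUE_SIZE;
}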
41
b75fd5f5
SH
42typedef struct BDRVNVMeState BDRVNVMeState;
43
3214b0f0
PMD
44/* Same index is used for queues and IRQs */
45#define INDEX_ADMIN 0
46#define INDEX_IO(n) (1 + n)
47
48/* This driver shares a single MSIX IRQ for the admin and I/O queues */
49enum {
50 MSIX_SHARED_IRQ_IDX = 0,
51 MSIX_IRQ_COUNT = 1
52};
53
bdd6a90a
FZ
54typedef struct {
55 int32_t head, tail;
56 uint8_t *queue;
57 uint64_t iova;
58 /* Hardware MMIO register */
59 volatile uint32_t *doorbell;
60} NVMeQueue;
61
62typedef struct {
63 BlockCompletionFunc *cb;
64 void *opaque;
65 int cid;
66 void *prp_list_page;
67 uint64_t prp_list_iova;
1086e95d 68 int free_req_next; /* q->reqs[] index of next free req */
bdd6a90a
FZ
69} NVMeRequest;
70
71typedef struct {
bdd6a90a
FZ
72 QemuMutex lock;
73
b75fd5f5
SH
74 /* Read from I/O code path, initialized under BQL */
75 BDRVNVMeState *s;
76 int index;
77
bdd6a90a 78 /* Fields protected by BQL */
bdd6a90a
FZ
79 uint8_t *prp_list_pages;
80
81 /* Fields protected by @lock */
a5db74f3 82 CoQueue free_req_queue;
bdd6a90a
FZ
83 NVMeQueue sq, cq;
84 int cq_phase;
1086e95d
SH
85 int free_req_head;
86 NVMeRequest reqs[NVME_NUM_REQS];
bdd6a90a
FZ
87 int need_kick;
88 int inflight;
7838c67f
SH
89
90 /* Thread-safe, no lock necessary */
91 QEMUBH *completion_bh;
bdd6a90a
FZ
92} NVMeQueuePair;
93
b75fd5f5 94struct BDRVNVMeState {
bdd6a90a
FZ
95 AioContext *aio_context;
96 QEMUVFIOState *vfio;
f6845323
PMD
97 /* Memory mapped registers */
98 volatile struct {
99 uint32_t sq_tail;
100 uint32_t cq_head;
101 } *doorbells;
bdd6a90a
FZ
102 /* The submission/completion queue pairs.
103 * [0]: admin queue.
104 * [1..]: io queues.
105 */
106 NVMeQueuePair **queues;
1b539bd6 107 unsigned queue_count;
bdd6a90a
FZ
108 size_t page_size;
109 /* How many uint32_t elements does each doorbell entry take. */
110 size_t doorbell_scale;
111 bool write_cache_supported;
b111b3fc 112 EventNotifier irq_notifier[MSIX_IRQ_COUNT];
118d1b6a 113
bdd6a90a
FZ
114 uint64_t nsze; /* Namespace size reported by identify command */
115 int nsid; /* The namespace id to read/write data. */
1120407b 116 int blkshift;
118d1b6a 117
bdd6a90a 118 uint64_t max_transfer;
2f0d8947 119 bool plugged;
bdd6a90a 120
e0dd95e3 121 bool supports_write_zeroes;
e87a09d6 122 bool supports_discard;
e0dd95e3 123
bdd6a90a
FZ
124 CoMutex dma_map_lock;
125 CoQueue dma_flush_queue;
126
127 /* Total size of mapped qiov, accessed under dma_map_lock */
128 int dma_map_count;
cc61b074
HR
129
130 /* PCI address (required for nvme_refresh_filename()) */
131 char *device;
f25e7ab2
PMD
132
133 struct {
134 uint64_t completion_errors;
135 uint64_t aligned_accesses;
136 uint64_t unaligned_accesses;
137 } stats;
b75fd5f5 138};
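/*
 * Editorial note, a sketch rather than driver code (helper names are
 * hypothetical): per the NVMe specification the submission queue y tail
 * doorbell lives at BAR0 offset 0x1000 + (2y) * (4 << CAP.DSTRD) and the
 * completion queue y head doorbell right after it.  nvme_init() below maps
 * BAR0 from offset sizeof(NvmeBar) (0x1000) and sets
 * doorbell_scale = (4 << CAP.DSTRD) / sizeof(uint32_t); in the common
 * CAP.DSTRD == 0 case that is 1, so &doorbells[idx].sq_tail and .cq_head
 * land at byte offsets 8 * idx and 8 * idx + 4, matching that layout.
 */
static inline size_t nvme_example_sq_tail_doorbell(unsigned y, unsigned dstrd)
{
    return 0x1000 + (2 * y) * (4 << dstrd);
}

static inline size_t nvme_example_cq_head_doorbell(unsigned y, unsigned dstrd)
{
    return 0x1000 + (2 * y + 1) * (4 << dstrd);
}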
bdd6a90a
FZ
139
140#define NVME_BLOCK_OPT_DEVICE "device"
141#define NVME_BLOCK_OPT_NAMESPACE "namespace"
142
7838c67f
SH
143static void nvme_process_completion_bh(void *opaque);
144
bdd6a90a
FZ
145static QemuOptsList runtime_opts = {
146 .name = "nvme",
147 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
148 .desc = {
149 {
150 .name = NVME_BLOCK_OPT_DEVICE,
151 .type = QEMU_OPT_STRING,
152 .help = "NVMe PCI device address",
153 },
154 {
155 .name = NVME_BLOCK_OPT_NAMESPACE,
156 .type = QEMU_OPT_NUMBER,
157 .help = "NVMe namespace",
158 },
159 { /* end of list */ }
160 },
161};
162
dfa9c6c6
PMD
163/* Returns true on success, false on failure. */
164static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
1b539bd6 165 unsigned nentries, size_t entry_bytes, Error **errp)
bdd6a90a 166{
bdd6a90a
FZ
167 size_t bytes;
168 int r;
169
170 bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
171 q->head = q->tail = 0;
38e1f818 172 q->queue = qemu_try_memalign(s->page_size, bytes);
bdd6a90a
FZ
173 if (!q->queue) {
174 error_setg(errp, "Cannot allocate queue");
dfa9c6c6 175 return false;
bdd6a90a 176 }
2ed84693 177 memset(q->queue, 0, bytes);
bdd6a90a
FZ
178 r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
179 if (r) {
180 error_setg(errp, "Cannot map queue");
dfa9c6c6 181 return false;
bdd6a90a 182 }
dfa9c6c6 183 return true;
bdd6a90a
FZ
184}
185
b75fd5f5 186static void nvme_free_queue_pair(NVMeQueuePair *q)
bdd6a90a 187{
6e1e9ff2 188 trace_nvme_free_queue_pair(q->index, q);
7838c67f
SH
189 if (q->completion_bh) {
190 qemu_bh_delete(q->completion_bh);
191 }
bdd6a90a
FZ
192 qemu_vfree(q->prp_list_pages);
193 qemu_vfree(q->sq.queue);
194 qemu_vfree(q->cq.queue);
195 qemu_mutex_destroy(&q->lock);
196 g_free(q);
197}
198
199static void nvme_free_req_queue_cb(void *opaque)
200{
201 NVMeQueuePair *q = opaque;
202
203 qemu_mutex_lock(&q->lock);
204 while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
205 /* Retry all pending requests */
206 }
207 qemu_mutex_unlock(&q->lock);
208}
209
0a28b02e
PMD
210static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
211 AioContext *aio_context,
1b539bd6 212 unsigned idx, size_t size,
bdd6a90a
FZ
213 Error **errp)
214{
215 int i, r;
0ea45f76 216 NVMeQueuePair *q;
bdd6a90a
FZ
217 uint64_t prp_list_iova;
218
0ea45f76
PMD
219 q = g_try_new0(NVMeQueuePair, 1);
220 if (!q) {
221 return NULL;
222 }
6e1e9ff2
PMD
223 trace_nvme_create_queue_pair(idx, q, size, aio_context,
224 event_notifier_get_fd(s->irq_notifier));
38e1f818 225 q->prp_list_pages = qemu_try_memalign(s->page_size,
0ea45f76
PMD
226 s->page_size * NVME_NUM_REQS);
227 if (!q->prp_list_pages) {
228 goto fail;
229 }
2ed84693 230 memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
bdd6a90a 231 qemu_mutex_init(&q->lock);
b75fd5f5 232 q->s = s;
bdd6a90a
FZ
233 q->index = idx;
234 qemu_co_queue_init(&q->free_req_queue);
0a28b02e 235 q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
bdd6a90a 236 r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
1086e95d 237 s->page_size * NVME_NUM_REQS,
bdd6a90a
FZ
238 false, &prp_list_iova);
239 if (r) {
240 goto fail;
241 }
1086e95d
SH
242 q->free_req_head = -1;
243 for (i = 0; i < NVME_NUM_REQS; i++) {
bdd6a90a
FZ
244 NVMeRequest *req = &q->reqs[i];
245 req->cid = i + 1;
1086e95d
SH
246 req->free_req_next = q->free_req_head;
247 q->free_req_head = i;
bdd6a90a
FZ
248 req->prp_list_page = q->prp_list_pages + i * s->page_size;
249 req->prp_list_iova = prp_list_iova + i * s->page_size;
250 }
1086e95d 251
dfa9c6c6 252 if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
bdd6a90a
FZ
253 goto fail;
254 }
f6845323 255 q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
bdd6a90a 256
dfa9c6c6 257 if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
bdd6a90a
FZ
258 goto fail;
259 }
f6845323 260 q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
bdd6a90a
FZ
261
262 return q;
263fail:
b75fd5f5 264 nvme_free_queue_pair(q);
bdd6a90a
FZ
265 return NULL;
266}
267
268/* With q->lock */
b75fd5f5 269static void nvme_kick(NVMeQueuePair *q)
bdd6a90a 270{
b75fd5f5
SH
271 BDRVNVMeState *s = q->s;
272
bdd6a90a
FZ
273 if (s->plugged || !q->need_kick) {
274 return;
275 }
276 trace_nvme_kick(s, q->index);
277 assert(!(q->sq.tail & 0xFF00));
278 /* Fence the write to submission queue entry before notifying the device. */
279 smp_wmb();
280 *q->sq.doorbell = cpu_to_le32(q->sq.tail);
281 q->inflight += q->need_kick;
282 q->need_kick = 0;
283}
284
285/* Find a free request element if any, otherwise:
286 * a) if in coroutine context, try to wait for one to become available;
287 * b) if not in coroutine, return NULL;
288 */
289static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
290{
1086e95d 291 NVMeRequest *req;
bdd6a90a
FZ
292
293 qemu_mutex_lock(&q->lock);
1086e95d
SH
294
295 while (q->free_req_head == -1) {
bdd6a90a 296 if (qemu_in_coroutine()) {
51e98b6d 297 trace_nvme_free_req_queue_wait(q->s, q->index);
bdd6a90a
FZ
298 qemu_co_queue_wait(&q->free_req_queue, &q->lock);
299 } else {
300 qemu_mutex_unlock(&q->lock);
301 return NULL;
302 }
303 }
1086e95d
SH
304
305 req = &q->reqs[q->free_req_head];
306 q->free_req_head = req->free_req_next;
307 req->free_req_next = -1;
308
bdd6a90a
FZ
309 qemu_mutex_unlock(&q->lock);
310 return req;
311}
312
1086e95d
SH
313/* With q->lock */
314static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
315{
316 req->free_req_next = q->free_req_head;
317 q->free_req_head = req - q->reqs;
318}
319
320/* With q->lock */
b75fd5f5 321static void nvme_wake_free_req_locked(NVMeQueuePair *q)
1086e95d
SH
322{
323 if (!qemu_co_queue_empty(&q->free_req_queue)) {
b75fd5f5 324 replay_bh_schedule_oneshot_event(q->s->aio_context,
1086e95d
SH
325 nvme_free_req_queue_cb, q);
326 }
327}
328
329/* Insert a request in the freelist and wake waiters */
b75fd5f5 330static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
1086e95d
SH
331{
332 qemu_mutex_lock(&q->lock);
333 nvme_put_free_req_locked(q, req);
b75fd5f5 334 nvme_wake_free_req_locked(q);
1086e95d
SH
335 qemu_mutex_unlock(&q->lock);
336}
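/*
 * Editorial sketch, not part of the driver (all names hypothetical): the
 * free list managed by nvme_get_free_req()/nvme_put_free_req_locked() is an
 * index-linked stack over the fixed reqs[] array, with -1 as the list
 * terminator.  The same idea in isolation:
 */
typedef struct {
    int next[NVME_NUM_REQS];   /* index of the next free slot, -1 at the end */
    int free_head;             /* -1 when every slot is in use */
} ExampleReqPool;

static inline int example_req_pool_get(ExampleReqPool *p)
{
    int i = p->free_head;
    if (i != -1) {
        p->free_head = p->next[i];   /* pop the head of the free list */
        p->next[i] = -1;
    }
    return i;                        /* -1 means no free request slot */
}

static inline void example_req_pool_put(ExampleReqPool *p, int i)
{
    p->next[i] = p->free_head;       /* push the slot back */
    p->free_head = i;
}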
337
bdd6a90a
FZ
338static inline int nvme_translate_error(const NvmeCqe *c)
339{
340 uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
341 if (status) {
342 trace_nvme_error(le32_to_cpu(c->result),
343 le16_to_cpu(c->sq_head),
344 le16_to_cpu(c->sq_id),
345 le16_to_cpu(c->cid),
346 le16_to_cpu(status));
347 }
348 switch (status) {
349 case 0:
350 return 0;
351 case 1:
352 return -ENOSYS;
353 case 2:
354 return -EINVAL;
355 default:
356 return -EIO;
357 }
358}
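/*
 * Editorial note, a sketch rather than driver code (helper names are
 * hypothetical): in the 16-bit word the CQE stores as "status", bit 0 is the
 * phase tag and bits 8:1 are the Status Code that nvme_translate_error()
 * extracts with (status >> 1) & 0xFF; bits 11:9 hold the Status Code Type
 * and bit 15 the Do Not Retry flag, which this driver does not examine.
 */
static inline int nvme_example_cqe_phase(uint16_t status)
{
    return status & 0x1;            /* toggled by the device on every pass */
}

static inline int nvme_example_cqe_status_code(uint16_t status)
{
    return (status >> 1) & 0xFF;    /* 0 means success */
}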
359
360/* With q->lock */
b75fd5f5 361static bool nvme_process_completion(NVMeQueuePair *q)
bdd6a90a 362{
b75fd5f5 363 BDRVNVMeState *s = q->s;
bdd6a90a
FZ
364 bool progress = false;
365 NVMeRequest *preq;
366 NVMeRequest req;
367 NvmeCqe *c;
368
369 trace_nvme_process_completion(s, q->index, q->inflight);
7838c67f
SH
370 if (s->plugged) {
371 trace_nvme_process_completion_queue_plugged(s, q->index);
bdd6a90a
FZ
372 return false;
373 }
7838c67f
SH
374
375 /*
376 * Support re-entrancy when a request cb() function invokes aio_poll().
377 * Pending completions must be visible to aio_poll() so that a cb()
378 * function can wait for the completion of another request.
379 *
380 * The aio_poll() loop will execute our BH and we'll resume completion
381 * processing there.
382 */
383 qemu_bh_schedule(q->completion_bh);
384
bdd6a90a
FZ
385 assert(q->inflight >= 0);
386 while (q->inflight) {
04b3fb39 387 int ret;
bdd6a90a 388 int16_t cid;
04b3fb39 389
bdd6a90a 390 c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
258867d1 391 if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
bdd6a90a
FZ
392 break;
393 }
04b3fb39 394 ret = nvme_translate_error(c);
f25e7ab2
PMD
395 if (ret) {
396 s->stats.completion_errors++;
397 }
bdd6a90a
FZ
398 q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
399 if (!q->cq.head) {
400 q->cq_phase = !q->cq_phase;
401 }
402 cid = le16_to_cpu(c->cid);
403 if (cid == 0 || cid > NVME_QUEUE_SIZE) {
58ad6ae0
PMD
404 warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
405 "queue size: %u", cid, NVME_QUEUE_SIZE);
bdd6a90a
FZ
406 continue;
407 }
bdd6a90a
FZ
408 trace_nvme_complete_command(s, q->index, cid);
409 preq = &q->reqs[cid - 1];
410 req = *preq;
411 assert(req.cid == cid);
412 assert(req.cb);
1086e95d 413 nvme_put_free_req_locked(q, preq);
bdd6a90a 414 preq->cb = preq->opaque = NULL;
7838c67f 415 q->inflight--;
bdd6a90a 416 qemu_mutex_unlock(&q->lock);
04b3fb39 417 req.cb(req.opaque, ret);
bdd6a90a 418 qemu_mutex_lock(&q->lock);
bdd6a90a
FZ
419 progress = true;
420 }
421 if (progress) {
422 /* Notify the device so it can post more completions. */
423 smp_mb_release();
424 *q->cq.doorbell = cpu_to_le32(q->cq.head);
b75fd5f5 425 nvme_wake_free_req_locked(q);
bdd6a90a 426 }
7838c67f
SH
427
428 qemu_bh_cancel(q->completion_bh);
429
bdd6a90a
FZ
430 return progress;
431}
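/*
 * Editorial sketch, not part of the driver (the helper name is
 * hypothetical): nvme_process_completion() relies on the phase bit rather
 * than on an "entry count" register.  The device writes every new CQE with
 * the phase value of the current pass, and q->cq_phase (initially 0, flipped
 * each time cq.head wraps) records which value marks stale entries, so the
 * loop above stops as soon as it sees an entry whose phase bit still equals
 * q->cq_phase.
 */
static inline bool nvme_example_cqe_is_new(uint16_t status, int cq_phase)
{
    /* a completion is ready once its phase bit differs from cq_phase */
    return (status & 0x1) != cq_phase;
}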
432
7838c67f
SH
433static void nvme_process_completion_bh(void *opaque)
434{
435 NVMeQueuePair *q = opaque;
436
437 /*
438 * We're being invoked because a nvme_process_completion() cb() function
439 * called aio_poll(). The callback may be waiting for further completions
440 * so notify the device that it has space to fill in more completions now.
441 */
442 smp_mb_release();
443 *q->cq.doorbell = cpu_to_le32(q->cq.head);
444 nvme_wake_free_req_locked(q);
445
446 nvme_process_completion(q);
447}
448
bdd6a90a
FZ
449static void nvme_trace_command(const NvmeCmd *cmd)
450{
451 int i;
452
e266f52c
PMD
453 if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
454 return;
455 }
bdd6a90a
FZ
456 for (i = 0; i < 8; ++i) {
457 uint8_t *cmdp = (uint8_t *)cmd + i * 8;
458 trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
459 cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
460 }
461}
462
b75fd5f5 463static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
bdd6a90a
FZ
464 NvmeCmd *cmd, BlockCompletionFunc cb,
465 void *opaque)
466{
467 assert(!req->cb);
468 req->cb = cb;
469 req->opaque = opaque;
470 cmd->cid = cpu_to_le32(req->cid);
471
b75fd5f5 472 trace_nvme_submit_command(q->s, q->index, req->cid);
bdd6a90a
FZ
473 nvme_trace_command(cmd);
474 qemu_mutex_lock(&q->lock);
475 memcpy((uint8_t *)q->sq.queue +
476 q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
477 q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
478 q->need_kick++;
b75fd5f5
SH
479 nvme_kick(q);
480 nvme_process_completion(q);
bdd6a90a
FZ
481 qemu_mutex_unlock(&q->lock);
482}
483
484static void nvme_cmd_sync_cb(void *opaque, int ret)
485{
486 int *pret = opaque;
487 *pret = ret;
4720cbee 488 aio_wait_kick();
bdd6a90a
FZ
489}
490
491static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
492 NvmeCmd *cmd)
493{
073a0697 494 AioContext *aio_context = bdrv_get_aio_context(bs);
bdd6a90a 495 NVMeRequest *req;
bdd6a90a
FZ
496 int ret = -EINPROGRESS;
497 req = nvme_get_free_req(q);
498 if (!req) {
499 return -EBUSY;
500 }
b75fd5f5 501 nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
bdd6a90a 502
073a0697 503 AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
bdd6a90a
FZ
504 return ret;
505}
506
7a5f00dd
PMD
507/* Returns true on success, false on failure. */
508static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
bdd6a90a
FZ
509{
510 BDRVNVMeState *s = bs->opaque;
7a5f00dd 511 bool ret = false;
7d3b214a
PMD
512 union {
513 NvmeIdCtrl ctrl;
514 NvmeIdNs ns;
515 } *id;
118d1b6a 516 NvmeLBAF *lbaf;
e0dd95e3 517 uint16_t oncs;
1120407b 518 int r;
bdd6a90a
FZ
519 uint64_t iova;
520 NvmeCmd cmd = {
521 .opcode = NVME_ADM_CMD_IDENTIFY,
522 .cdw10 = cpu_to_le32(0x1),
523 };
524
38e1f818 525 id = qemu_try_memalign(s->page_size, sizeof(*id));
4d980939 526 if (!id) {
bdd6a90a
FZ
527 error_setg(errp, "Cannot allocate buffer for identify response");
528 goto out;
529 }
7d3b214a 530 r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
bdd6a90a
FZ
531 if (r) {
532 error_setg(errp, "Cannot map buffer for DMA");
533 goto out;
534 }
bdd6a90a 535
2ed84693
PMD
536 memset(id, 0, sizeof(*id));
537 cmd.dptr.prp1 = cpu_to_le64(iova);
73159e52 538 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
bdd6a90a
FZ
539 error_setg(errp, "Failed to identify controller");
540 goto out;
541 }
542
7d3b214a 543 if (le32_to_cpu(id->ctrl.nn) < namespace) {
bdd6a90a
FZ
544 error_setg(errp, "Invalid namespace");
545 goto out;
546 }
7d3b214a
PMD
547 s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
548 s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
bdd6a90a
FZ
549 /* For now the page list buffer per command is one page, to hold at most
550 * s->page_size / sizeof(uint64_t) entries. */
551 s->max_transfer = MIN_NON_ZERO(s->max_transfer,
552 s->page_size / sizeof(uint64_t) * s->page_size);
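    /*
     * Editorial worked example, assuming a 4 KiB page size: MDTS == 5 would
     * allow (1 << 5) * 4 KiB = 128 KiB per command, while the single-page
     * PRP list above caps a command at 4096 / 8 * 4 KiB = 2 MiB, so
     * MIN_NON_ZERO() keeps the 128 KiB limit; with MDTS == 0 (no controller
     * limit) only the 2 MiB cap applies.
     */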
553
7d3b214a 554 oncs = le16_to_cpu(id->ctrl.oncs);
69265150 555 s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
e87a09d6 556 s->supports_discard = !!(oncs & NVME_ONCS_DSM);
e0dd95e3 557
7d3b214a 558 memset(id, 0, sizeof(*id));
bdd6a90a
FZ
559 cmd.cdw10 = 0;
560 cmd.nsid = cpu_to_le32(namespace);
73159e52 561 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
bdd6a90a
FZ
562 error_setg(errp, "Failed to identify namespace");
563 goto out;
564 }
565
7d3b214a
PMD
566 s->nsze = le64_to_cpu(id->ns.nsze);
567 lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
118d1b6a 568
7d3b214a
PMD
569 if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
570 NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
e0dd95e3
ML
571 NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
572 bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
573 }
574
118d1b6a
ML
575 if (lbaf->ms) {
576 error_setg(errp, "Namespaces with metadata are not yet supported");
577 goto out;
578 }
579
1120407b
HR
580 if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
581 (1 << lbaf->ds) > s->page_size)
582 {
583 error_setg(errp, "Namespace has unsupported block size (2^%d)",
584 lbaf->ds);
118d1b6a
ML
585 goto out;
586 }
bdd6a90a 587
7a5f00dd 588 ret = true;
118d1b6a 589 s->blkshift = lbaf->ds;
bdd6a90a 590out:
4d980939
PMD
591 qemu_vfio_dma_unmap(s->vfio, id);
592 qemu_vfree(id);
7a5f00dd
PMD
593
594 return ret;
bdd6a90a
FZ
595}
596
7a1fb2ef
PMD
597static bool nvme_poll_queue(NVMeQueuePair *q)
598{
599 bool progress = false;
600
601 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
602 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
603
1c914cd1 604 trace_nvme_poll_queue(q->s, q->index);
7a1fb2ef
PMD
605 /*
606 * Do an early check for completions. q->lock isn't needed because
607 * nvme_process_completion() only runs in the event loop thread and
608 * cannot race with itself.
609 */
610 if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
611 return false;
612 }
613
614 qemu_mutex_lock(&q->lock);
615 while (nvme_process_completion(q)) {
616 /* Keep polling */
617 progress = true;
618 }
619 qemu_mutex_unlock(&q->lock);
620
621 return progress;
622}
623
bdd6a90a
FZ
624static bool nvme_poll_queues(BDRVNVMeState *s)
625{
626 bool progress = false;
627 int i;
628
1b539bd6 629 for (i = 0; i < s->queue_count; i++) {
7a1fb2ef 630 if (nvme_poll_queue(s->queues[i])) {
bdd6a90a
FZ
631 progress = true;
632 }
bdd6a90a
FZ
633 }
634 return progress;
635}
636
637static void nvme_handle_event(EventNotifier *n)
638{
b111b3fc
PMD
639 BDRVNVMeState *s = container_of(n, BDRVNVMeState,
640 irq_notifier[MSIX_SHARED_IRQ_IDX]);
bdd6a90a
FZ
641
642 trace_nvme_handle_event(s);
bdd6a90a
FZ
643 event_notifier_test_and_clear(n);
644 nvme_poll_queues(s);
bdd6a90a
FZ
645}
646
647static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
648{
649 BDRVNVMeState *s = bs->opaque;
1b539bd6 650 unsigned n = s->queue_count;
bdd6a90a
FZ
651 NVMeQueuePair *q;
652 NvmeCmd cmd;
1b539bd6 653 unsigned queue_size = NVME_QUEUE_SIZE;
bdd6a90a 654
0a28b02e
PMD
655 q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
656 n, queue_size, errp);
bdd6a90a
FZ
657 if (!q) {
658 return false;
659 }
660 cmd = (NvmeCmd) {
661 .opcode = NVME_ADM_CMD_CREATE_CQ,
c26f2173 662 .dptr.prp1 = cpu_to_le64(q->cq.iova),
bdd6a90a
FZ
663 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
664 .cdw11 = cpu_to_le32(0x3),
665 };
73159e52 666 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
1b539bd6 667 error_setg(errp, "Failed to create CQ io queue [%u]", n);
c8edbfb2 668 goto out_error;
bdd6a90a
FZ
669 }
670 cmd = (NvmeCmd) {
671 .opcode = NVME_ADM_CMD_CREATE_SQ,
c26f2173 672 .dptr.prp1 = cpu_to_le64(q->sq.iova),
bdd6a90a
FZ
673 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
674 .cdw11 = cpu_to_le32(0x1 | (n << 16)),
675 };
73159e52 676 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
1b539bd6 677 error_setg(errp, "Failed to create SQ io queue [%u]", n);
c8edbfb2 678 goto out_error;
bdd6a90a
FZ
679 }
680 s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
681 s->queues[n] = q;
1b539bd6 682 s->queue_count++;
bdd6a90a 683 return true;
c8edbfb2
PMD
684out_error:
685 nvme_free_queue_pair(q);
686 return false;
bdd6a90a
FZ
687}
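/*
 * Editorial note, a sketch rather than driver code (the helper name is
 * hypothetical): in both admin commands above, cdw10 packs the 0-based queue
 * size into bits 31:16 and the queue identifier into bits 15:0.  For Create
 * I/O Completion Queue, cdw11 = 0x3 sets PC (physically contiguous, bit 0)
 * and IEN (interrupts enabled, bit 1) with interrupt vector 0, i.e.
 * MSIX_SHARED_IRQ_IDX; for Create I/O Submission Queue, cdw11 sets PC and
 * puts the completion queue ID to pair with in bits 31:16, which is why the
 * same index n appears in both commands.
 */
static inline uint32_t nvme_example_create_queue_cdw10(unsigned qid,
                                                       unsigned queue_size)
{
    /* bits 31:16 hold the 0-based queue size, bits 15:0 the queue ID */
    return ((queue_size - 1) << 16) | (qid & 0xFFFF);
}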
688
689static bool nvme_poll_cb(void *opaque)
690{
691 EventNotifier *e = opaque;
b111b3fc
PMD
692 BDRVNVMeState *s = container_of(e, BDRVNVMeState,
693 irq_notifier[MSIX_SHARED_IRQ_IDX]);
bdd6a90a 694
b3ac2b94 695 return nvme_poll_queues(s);
bdd6a90a
FZ
696}
697
698static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
699 Error **errp)
700{
701 BDRVNVMeState *s = bs->opaque;
0a28b02e 702 AioContext *aio_context = bdrv_get_aio_context(bs);
bdd6a90a
FZ
703 int ret;
704 uint64_t cap;
705 uint64_t timeout_ms;
706 uint64_t deadline, now;
9406e0d9 707 volatile NvmeBar *regs = NULL;
bdd6a90a
FZ
708
709 qemu_co_mutex_init(&s->dma_map_lock);
710 qemu_co_queue_init(&s->dma_flush_queue);
cc61b074 711 s->device = g_strdup(device);
bdd6a90a
FZ
712 s->nsid = namespace;
713 s->aio_context = bdrv_get_aio_context(bs);
b111b3fc 714 ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
bdd6a90a
FZ
715 if (ret) {
716 error_setg(errp, "Failed to init event notifier");
717 return ret;
718 }
719
720 s->vfio = qemu_vfio_open_pci(device, errp);
721 if (!s->vfio) {
722 ret = -EINVAL;
9582f357 723 goto out;
bdd6a90a
FZ
724 }
725
37d7a45a
PMD
726 regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
727 PROT_READ | PROT_WRITE, errp);
728 if (!regs) {
bdd6a90a 729 ret = -EINVAL;
9582f357 730 goto out;
bdd6a90a 731 }
bdd6a90a
FZ
732 /* Perform the initialization sequence as described in NVMe spec "7.6.1
733 * Initialization". */
734
9406e0d9 735 cap = le64_to_cpu(regs->cap);
15b2260b
PMD
736 trace_nvme_controller_capability_raw(cap);
737 trace_nvme_controller_capability("Maximum Queue Entries Supported",
738 1 + NVME_CAP_MQES(cap));
739 trace_nvme_controller_capability("Contiguous Queues Required",
740 NVME_CAP_CQR(cap));
741 trace_nvme_controller_capability("Doorbell Stride",
742 2 << (2 + NVME_CAP_DSTRD(cap)));
743 trace_nvme_controller_capability("Subsystem Reset Supported",
744 NVME_CAP_NSSRS(cap));
745 trace_nvme_controller_capability("Memory Page Size Minimum",
746 1 << (12 + NVME_CAP_MPSMIN(cap)));
747 trace_nvme_controller_capability("Memory Page Size Maximum",
748 1 << (12 + NVME_CAP_MPSMAX(cap)));
fad1eb68 749 if (!NVME_CAP_CSS(cap)) {
bdd6a90a
FZ
750 error_setg(errp, "Device doesn't support NVMe command set");
751 ret = -EINVAL;
9582f357 752 goto out;
bdd6a90a
FZ
753 }
754
fad1eb68
PMD
755 s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
756 s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
bdd6a90a 757 bs->bl.opt_mem_alignment = s->page_size;
fad1eb68 758 timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
bdd6a90a
FZ
759
760 /* Reset device to get a clean state. */
9406e0d9 761 regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE);
bdd6a90a 762 /* Wait for CSTS.RDY = 0. */
e4f310fe 763 deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
fad1eb68 764 while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
bdd6a90a
FZ
765 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
766 error_setg(errp, "Timeout while waiting for device to reset (%"
767 PRId64 " ms)",
768 timeout_ms);
769 ret = -ETIMEDOUT;
9582f357 770 goto out;
bdd6a90a
FZ
771 }
772 }
773
f6845323
PMD
774 s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
775 NVME_DOORBELL_SIZE, PROT_WRITE, errp);
776 if (!s->doorbells) {
777 ret = -EINVAL;
778 goto out;
779 }
780
bdd6a90a
FZ
781 /* Set up admin queue. */
782 s->queues = g_new(NVMeQueuePair *, 1);
0a28b02e 783 s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
73159e52
PMD
784 NVME_QUEUE_SIZE,
785 errp);
786 if (!s->queues[INDEX_ADMIN]) {
bdd6a90a 787 ret = -EINVAL;
9582f357 788 goto out;
bdd6a90a 789 }
1b539bd6 790 s->queue_count = 1;
bdd6a90a 791 QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
fad1eb68
PMD
792 regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
793 (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
9406e0d9
PMD
794 regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
795 regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
bdd6a90a
FZ
796
797 /* After setting up all control registers we can enable device now. */
fad1eb68
PMD
798 regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
799 (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
800 CC_EN_MASK);
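    /*
     * Editorial note: CC.IOCQES and CC.IOSQES take the base-2 logarithm of
     * the entry sizes, so ctz32(16) == 4 and ctz32(64) == 6 above, and
     * CC_EN_MASK sets CC.EN to start the controller.
     */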
bdd6a90a
FZ
801 /* Wait for CSTS.RDY = 1. */
802 now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
eefffb02 803 deadline = now + timeout_ms * SCALE_MS;
fad1eb68 804 while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
bdd6a90a
FZ
805 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
806 error_setg(errp, "Timeout while waiting for device to start (%"
807 PRId64 " ms)",
808 timeout_ms);
809 ret = -ETIMEDOUT;
9582f357 810 goto out;
bdd6a90a
FZ
811 }
812 }
813
b111b3fc 814 ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
bdd6a90a
FZ
815 VFIO_PCI_MSIX_IRQ_INDEX, errp);
816 if (ret) {
9582f357 817 goto out;
bdd6a90a 818 }
b111b3fc
PMD
819 aio_set_event_notifier(bdrv_get_aio_context(bs),
820 &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
bdd6a90a
FZ
821 false, nvme_handle_event, nvme_poll_cb);
822
7a5f00dd 823 if (!nvme_identify(bs, namespace, errp)) {
bdd6a90a 824 ret = -EIO;
9582f357 825 goto out;
bdd6a90a
FZ
826 }
827
828 /* Set up command queues. */
829 if (!nvme_add_io_queue(bs, errp)) {
830 ret = -EIO;
bdd6a90a 831 }
9582f357 832out:
37d7a45a
PMD
833 if (regs) {
834 qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
835 }
836
9582f357 837 /* Cleaning up is done in nvme_file_open() upon error. */
bdd6a90a
FZ
838 return ret;
839}
840
841/* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
842 *
843 * nvme://0000:44:00.0/1
844 *
845 * where the "nvme://" is a fixed form of the protocol prefix, the middle part
846 * is the PCI address, and the last part is the namespace number starting from
847 * 1 according to the NVMe spec. */
848static void nvme_parse_filename(const char *filename, QDict *options,
849 Error **errp)
850{
851 int pref = strlen("nvme://");
852
853 if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
854 const char *tmp = filename + pref;
855 char *device;
856 const char *namespace;
857 unsigned long ns;
858 const char *slash = strchr(tmp, '/');
859 if (!slash) {
625eaca9 860 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
bdd6a90a
FZ
861 return;
862 }
863 device = g_strndup(tmp, slash - tmp);
625eaca9 864 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
bdd6a90a
FZ
865 g_free(device);
866 namespace = slash + 1;
867 if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
868 error_setg(errp, "Invalid namespace '%s', positive number expected",
869 namespace);
870 return;
871 }
625eaca9
LV
872 qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
873 *namespace ? namespace : "1");
bdd6a90a
FZ
874 }
875}
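/*
 * Editorial usage sketch (device address and node names are only examples):
 * the URL form parsed above can be passed as
 *
 *     -drive file=nvme://0000:44:00.0/1,if=none,id=drive0
 *
 * which is equivalent to spelling out the runtime_opts options directly:
 *
 *     -blockdev driver=nvme,node-name=drive0,device=0000:44:00.0,namespace=1
 *
 * In either case the PCI function has to be bound to vfio-pci on the host
 * before QEMU can open it.
 */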
876
877static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
878 Error **errp)
879{
880 int ret;
881 BDRVNVMeState *s = bs->opaque;
882 NvmeCmd cmd = {
883 .opcode = NVME_ADM_CMD_SET_FEATURES,
884 .nsid = cpu_to_le32(s->nsid),
885 .cdw10 = cpu_to_le32(0x06),
886 .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
887 };
888
73159e52 889 ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
bdd6a90a
FZ
890 if (ret) {
891 error_setg(errp, "Failed to configure NVMe write cache");
892 }
893 return ret;
894}
895
896static void nvme_close(BlockDriverState *bs)
897{
bdd6a90a
FZ
898 BDRVNVMeState *s = bs->opaque;
899
1b539bd6 900 for (unsigned i = 0; i < s->queue_count; ++i) {
b75fd5f5 901 nvme_free_queue_pair(s->queues[i]);
bdd6a90a 902 }
9582f357 903 g_free(s->queues);
b111b3fc
PMD
904 aio_set_event_notifier(bdrv_get_aio_context(bs),
905 &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
bdd6a90a 906 false, NULL, NULL);
b111b3fc 907 event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
f6845323
PMD
908 qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
909 sizeof(NvmeBar), NVME_DOORBELL_SIZE);
bdd6a90a 910 qemu_vfio_close(s->vfio);
cc61b074
HR
911
912 g_free(s->device);
bdd6a90a
FZ
913}
914
915static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
916 Error **errp)
917{
918 const char *device;
919 QemuOpts *opts;
920 int namespace;
921 int ret;
922 BDRVNVMeState *s = bs->opaque;
923
e0dd95e3
ML
924 bs->supported_write_flags = BDRV_REQ_FUA;
925
bdd6a90a
FZ
926 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
927 qemu_opts_absorb_qdict(opts, options, &error_abort);
928 device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
929 if (!device) {
930 error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
931 qemu_opts_del(opts);
932 return -EINVAL;
933 }
934
935 namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
936 ret = nvme_init(bs, device, namespace, errp);
937 qemu_opts_del(opts);
938 if (ret) {
939 goto fail;
940 }
941 if (flags & BDRV_O_NOCACHE) {
942 if (!s->write_cache_supported) {
943 error_setg(errp,
944 "NVMe controller doesn't support write cache configuration");
945 ret = -EINVAL;
946 } else {
947 ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
948 errp);
949 }
950 if (ret) {
951 goto fail;
952 }
953 }
bdd6a90a
FZ
954 return 0;
955fail:
956 nvme_close(bs);
957 return ret;
958}
959
960static int64_t nvme_getlength(BlockDriverState *bs)
961{
962 BDRVNVMeState *s = bs->opaque;
118d1b6a
ML
963 return s->nsze << s->blkshift;
964}
bdd6a90a 965
1120407b 966static uint32_t nvme_get_blocksize(BlockDriverState *bs)
118d1b6a
ML
967{
968 BDRVNVMeState *s = bs->opaque;
1120407b
HR
969 assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
970 return UINT32_C(1) << s->blkshift;
118d1b6a
ML
971}
972
973static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
974{
1120407b 975 uint32_t blocksize = nvme_get_blocksize(bs);
118d1b6a
ML
976 bsz->phys = blocksize;
977 bsz->log = blocksize;
978 return 0;
bdd6a90a
FZ
979}
980
981/* Called with s->dma_map_lock */
982static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
983 QEMUIOVector *qiov)
984{
985 int r = 0;
986 BDRVNVMeState *s = bs->opaque;
987
988 s->dma_map_count -= qiov->size;
989 if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
990 r = qemu_vfio_dma_reset_temporary(s->vfio);
991 if (!r) {
992 qemu_co_queue_restart_all(&s->dma_flush_queue);
993 }
994 }
995 return r;
996}
997
998/* Called with s->dma_map_lock */
999static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
1000 NVMeRequest *req, QEMUIOVector *qiov)
1001{
1002 BDRVNVMeState *s = bs->opaque;
1003 uint64_t *pagelist = req->prp_list_page;
1004 int i, j, r;
1005 int entries = 0;
1006
1007 assert(qiov->size);
1008 assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
1009 assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
1010 for (i = 0; i < qiov->niov; ++i) {
1011 bool retry = true;
1012 uint64_t iova;
1013try_map:
1014 r = qemu_vfio_dma_map(s->vfio,
1015 qiov->iov[i].iov_base,
1016 qiov->iov[i].iov_len,
1017 true, &iova);
1018 if (r == -ENOMEM && retry) {
1019 retry = false;
1020 trace_nvme_dma_flush_queue_wait(s);
1021 if (s->dma_map_count) {
1022 trace_nvme_dma_map_flush(s);
1023 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
1024 } else {
1025 r = qemu_vfio_dma_reset_temporary(s->vfio);
1026 if (r) {
1027 goto fail;
1028 }
1029 }
1030 goto try_map;
1031 }
1032 if (r) {
1033 goto fail;
1034 }
1035
1036 for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
2916405a 1037 pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
bdd6a90a
FZ
1038 }
1039 trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
1040 qiov->iov[i].iov_len / s->page_size);
1041 }
1042
1043 s->dma_map_count += qiov->size;
1044
1045 assert(entries <= s->page_size / sizeof(uint64_t));
1046 switch (entries) {
1047 case 0:
1048 abort();
1049 case 1:
c26f2173
KJ
1050 cmd->dptr.prp1 = pagelist[0];
1051 cmd->dptr.prp2 = 0;
bdd6a90a
FZ
1052 break;
1053 case 2:
c26f2173
KJ
1054 cmd->dptr.prp1 = pagelist[0];
1055 cmd->dptr.prp2 = pagelist[1];
bdd6a90a
FZ
1056 break;
1057 default:
c26f2173
KJ
1058 cmd->dptr.prp1 = pagelist[0];
1059 cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
bdd6a90a
FZ
1060 break;
1061 }
1062 trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
1063 for (i = 0; i < entries; ++i) {
1064 trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
1065 }
1066 return 0;
1067fail:
1068 /* No need to unmap [0 - i) iovs even if we've failed, since we don't
1069 * increment s->dma_map_count. This is okay for fixed mapping memory areas
1070 * because they are already mapped before calling this function; for
1071 * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
1072 * calling qemu_vfio_dma_reset_temporary when necessary. */
1073 return r;
1074}
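/*
 * Editorial worked example, assuming a 4 KiB page size: a five-page request
 * fills pagelist[0..4].  pagelist[0] becomes PRP1 and, since more than two
 * entries exist, PRP2 is set to prp_list_iova + sizeof(uint64_t), i.e. it
 * points at pagelist[1] inside the same DMA-mapped page, so the device reads
 * the remaining four addresses as a PRP list.  With exactly two pages PRP2
 * holds the second page address directly, and with a single page PRP2 stays
 * zero.
 */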
1075
1076typedef struct {
1077 Coroutine *co;
1078 int ret;
1079 AioContext *ctx;
1080} NVMeCoData;
1081
1082static void nvme_rw_cb_bh(void *opaque)
1083{
1084 NVMeCoData *data = opaque;
1085 qemu_coroutine_enter(data->co);
1086}
1087
1088static void nvme_rw_cb(void *opaque, int ret)
1089{
1090 NVMeCoData *data = opaque;
1091 data->ret = ret;
1092 if (!data->co) {
1093 /* The rw coroutine hasn't yielded, don't try to enter. */
1094 return;
1095 }
e4ec5ad4 1096 replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
bdd6a90a
FZ
1097}
1098
1099static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
1100 uint64_t offset, uint64_t bytes,
1101 QEMUIOVector *qiov,
1102 bool is_write,
1103 int flags)
1104{
1105 int r;
1106 BDRVNVMeState *s = bs->opaque;
73159e52 1107 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
bdd6a90a 1108 NVMeRequest *req;
118d1b6a
ML
1109
1110 uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
bdd6a90a
FZ
1111 (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
1112 NvmeCmd cmd = {
1113 .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
1114 .nsid = cpu_to_le32(s->nsid),
118d1b6a
ML
1115 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1116 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
bdd6a90a
FZ
1117 .cdw12 = cpu_to_le32(cdw12),
1118 };
1119 NVMeCoData data = {
1120 .ctx = bdrv_get_aio_context(bs),
1121 .ret = -EINPROGRESS,
1122 };
1123
1124 trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
1b539bd6 1125 assert(s->queue_count > 1);
bdd6a90a
FZ
1126 req = nvme_get_free_req(ioq);
1127 assert(req);
1128
1129 qemu_co_mutex_lock(&s->dma_map_lock);
1130 r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
1131 qemu_co_mutex_unlock(&s->dma_map_lock);
1132 if (r) {
b75fd5f5 1133 nvme_put_free_req_and_wake(ioq, req);
bdd6a90a
FZ
1134 return r;
1135 }
b75fd5f5 1136 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
bdd6a90a
FZ
1137
1138 data.co = qemu_coroutine_self();
1139 while (data.ret == -EINPROGRESS) {
1140 qemu_coroutine_yield();
1141 }
1142
1143 qemu_co_mutex_lock(&s->dma_map_lock);
1144 r = nvme_cmd_unmap_qiov(bs, qiov);
1145 qemu_co_mutex_unlock(&s->dma_map_lock);
1146 if (r) {
1147 return r;
1148 }
1149
1150 trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
1151 return data.ret;
1152}
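/*
 * Editorial note, a sketch of the command encoding used above: for NVM read
 * and write commands the starting LBA goes into cdw10 (low 32 bits) and
 * cdw11 (high 32 bits), and cdw12 bits 15:0 carry the 0-based block count,
 * so a 128 KiB request on a 512-byte-block namespace becomes
 * (131072 >> 9) - 1 == 255; bit 30 of cdw12 is Force Unit Access and is set
 * when the caller passes BDRV_REQ_FUA.
 */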
1153
1154static inline bool nvme_qiov_aligned(BlockDriverState *bs,
1155 const QEMUIOVector *qiov)
1156{
1157 int i;
1158 BDRVNVMeState *s = bs->opaque;
1159
1160 for (i = 0; i < qiov->niov; ++i) {
1161 if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
1162 !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
1163 trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1164 qiov->iov[i].iov_len, s->page_size);
1165 return false;
1166 }
1167 }
1168 return true;
1169}
1170
1171static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
1172 QEMUIOVector *qiov, bool is_write, int flags)
1173{
1174 BDRVNVMeState *s = bs->opaque;
1175 int r;
1176 uint8_t *buf = NULL;
1177 QEMUIOVector local_qiov;
1178
1179 assert(QEMU_IS_ALIGNED(offset, s->page_size));
1180 assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1181 assert(bytes <= s->max_transfer);
1182 if (nvme_qiov_aligned(bs, qiov)) {
f25e7ab2 1183 s->stats.aligned_accesses++;
bdd6a90a
FZ
1184 return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1185 }
f25e7ab2 1186 s->stats.unaligned_accesses++;
bdd6a90a 1187 trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
38e1f818 1188 buf = qemu_try_memalign(s->page_size, bytes);
bdd6a90a
FZ
1189
1190 if (!buf) {
1191 return -ENOMEM;
1192 }
1193 qemu_iovec_init(&local_qiov, 1);
1194 if (is_write) {
1195 qemu_iovec_to_buf(qiov, 0, buf, bytes);
1196 }
1197 qemu_iovec_add(&local_qiov, buf, bytes);
1198 r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1199 qemu_iovec_destroy(&local_qiov);
1200 if (!r && !is_write) {
1201 qemu_iovec_from_buf(qiov, 0, buf, bytes);
1202 }
1203 qemu_vfree(buf);
1204 return r;
1205}
1206
1207static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1208 uint64_t offset, uint64_t bytes,
1209 QEMUIOVector *qiov, int flags)
1210{
1211 return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1212}
1213
1214static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1215 uint64_t offset, uint64_t bytes,
1216 QEMUIOVector *qiov, int flags)
1217{
1218 return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1219}
1220
1221static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1222{
1223 BDRVNVMeState *s = bs->opaque;
73159e52 1224 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
bdd6a90a
FZ
1225 NVMeRequest *req;
1226 NvmeCmd cmd = {
1227 .opcode = NVME_CMD_FLUSH,
1228 .nsid = cpu_to_le32(s->nsid),
1229 };
1230 NVMeCoData data = {
1231 .ctx = bdrv_get_aio_context(bs),
1232 .ret = -EINPROGRESS,
1233 };
1234
1b539bd6 1235 assert(s->queue_count > 1);
bdd6a90a
FZ
1236 req = nvme_get_free_req(ioq);
1237 assert(req);
b75fd5f5 1238 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
bdd6a90a
FZ
1239
1240 data.co = qemu_coroutine_self();
1241 if (data.ret == -EINPROGRESS) {
1242 qemu_coroutine_yield();
1243 }
1244
1245 return data.ret;
1246}
1247
1248
e0dd95e3
ML
1249static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
1250 int64_t offset,
1251 int bytes,
1252 BdrvRequestFlags flags)
1253{
1254 BDRVNVMeState *s = bs->opaque;
73159e52 1255 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
e0dd95e3
ML
1256 NVMeRequest *req;
1257
1258 uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
1259
1260 if (!s->supports_write_zeroes) {
1261 return -ENOTSUP;
1262 }
1263
1264 NvmeCmd cmd = {
69265150 1265 .opcode = NVME_CMD_WRITE_ZEROES,
e0dd95e3
ML
1266 .nsid = cpu_to_le32(s->nsid),
1267 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1268 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1269 };
1270
1271 NVMeCoData data = {
1272 .ctx = bdrv_get_aio_context(bs),
1273 .ret = -EINPROGRESS,
1274 };
1275
1276 if (flags & BDRV_REQ_MAY_UNMAP) {
1277 cdw12 |= (1 << 25);
1278 }
1279
1280 if (flags & BDRV_REQ_FUA) {
1281 cdw12 |= (1 << 30);
1282 }
1283
1284 cmd.cdw12 = cpu_to_le32(cdw12);
1285
1286 trace_nvme_write_zeroes(s, offset, bytes, flags);
1b539bd6 1287 assert(s->queue_count > 1);
e0dd95e3
ML
1288 req = nvme_get_free_req(ioq);
1289 assert(req);
1290
b75fd5f5 1291 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
e0dd95e3
ML
1292
1293 data.co = qemu_coroutine_self();
1294 while (data.ret == -EINPROGRESS) {
1295 qemu_coroutine_yield();
1296 }
1297
1298 trace_nvme_rw_done(s, true, offset, bytes, data.ret);
1299 return data.ret;
1300}
1301
1302
e87a09d6
ML
1303static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
1304 int64_t offset,
1305 int bytes)
1306{
1307 BDRVNVMeState *s = bs->opaque;
73159e52 1308 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
e87a09d6
ML
1309 NVMeRequest *req;
1310 NvmeDsmRange *buf;
1311 QEMUIOVector local_qiov;
1312 int ret;
1313
1314 NvmeCmd cmd = {
1315 .opcode = NVME_CMD_DSM,
1316 .nsid = cpu_to_le32(s->nsid),
1317 .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
1318 .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
1319 };
1320
1321 NVMeCoData data = {
1322 .ctx = bdrv_get_aio_context(bs),
1323 .ret = -EINPROGRESS,
1324 };
1325
1326 if (!s->supports_discard) {
1327 return -ENOTSUP;
1328 }
1329
1b539bd6 1330 assert(s->queue_count > 1);
e87a09d6 1331
38e1f818 1332 buf = qemu_try_memalign(s->page_size, s->page_size);
e87a09d6
ML
1333 if (!buf) {
1334 return -ENOMEM;
1335 }
2ed84693 1336 memset(buf, 0, s->page_size);
e87a09d6
ML
1337 buf->nlb = cpu_to_le32(bytes >> s->blkshift);
1338 buf->slba = cpu_to_le64(offset >> s->blkshift);
1339 buf->cattr = 0;
1340
1341 qemu_iovec_init(&local_qiov, 1);
1342 qemu_iovec_add(&local_qiov, buf, 4096);
1343
1344 req = nvme_get_free_req(ioq);
1345 assert(req);
1346
1347 qemu_co_mutex_lock(&s->dma_map_lock);
1348 ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
1349 qemu_co_mutex_unlock(&s->dma_map_lock);
1350
1351 if (ret) {
b75fd5f5 1352 nvme_put_free_req_and_wake(ioq, req);
e87a09d6
ML
1353 goto out;
1354 }
1355
1356 trace_nvme_dsm(s, offset, bytes);
1357
b75fd5f5 1358 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
e87a09d6
ML
1359
1360 data.co = qemu_coroutine_self();
1361 while (data.ret == -EINPROGRESS) {
1362 qemu_coroutine_yield();
1363 }
1364
1365 qemu_co_mutex_lock(&s->dma_map_lock);
1366 ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
1367 qemu_co_mutex_unlock(&s->dma_map_lock);
1368
1369 if (ret) {
1370 goto out;
1371 }
1372
1373 ret = data.ret;
1374 trace_nvme_dsm_done(s, offset, bytes, ret);
1375out:
1376 qemu_iovec_destroy(&local_qiov);
1377 qemu_vfree(buf);
1378 return ret;
1379
1380}
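/*
 * Editorial note (a sketch, not part of the driver): each NvmeDsmRange is a
 * 16-byte descriptor holding context attributes, a number of logical blocks
 * and a starting LBA.  cdw10 == 0 means exactly one range because the count
 * is 0-based, and the deallocate bit in cdw11 asks the controller to discard
 * the blocks rather than treat the ranges as mere access hints.
 */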
1381
1382
bdd6a90a
FZ
1383static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1384 BlockReopenQueue *queue, Error **errp)
1385{
1386 return 0;
1387}
1388
998b3a1e 1389static void nvme_refresh_filename(BlockDriverState *bs)
bdd6a90a 1390{
cc61b074 1391 BDRVNVMeState *s = bs->opaque;
bdd6a90a 1392
cc61b074
HR
1393 snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1394 s->device, s->nsid);
bdd6a90a
FZ
1395}
1396
1397static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1398{
1399 BDRVNVMeState *s = bs->opaque;
1400
1401 bs->bl.opt_mem_alignment = s->page_size;
1402 bs->bl.request_alignment = s->page_size;
1403 bs->bl.max_transfer = s->max_transfer;
1404}
1405
1406static void nvme_detach_aio_context(BlockDriverState *bs)
1407{
1408 BDRVNVMeState *s = bs->opaque;
1409
1b539bd6 1410 for (unsigned i = 0; i < s->queue_count; i++) {
7838c67f
SH
1411 NVMeQueuePair *q = s->queues[i];
1412
1413 qemu_bh_delete(q->completion_bh);
1414 q->completion_bh = NULL;
1415 }
1416
b111b3fc
PMD
1417 aio_set_event_notifier(bdrv_get_aio_context(bs),
1418 &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
bdd6a90a
FZ
1419 false, NULL, NULL);
1420}
1421
1422static void nvme_attach_aio_context(BlockDriverState *bs,
1423 AioContext *new_context)
1424{
1425 BDRVNVMeState *s = bs->opaque;
1426
1427 s->aio_context = new_context;
b111b3fc 1428 aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
bdd6a90a 1429 false, nvme_handle_event, nvme_poll_cb);
7838c67f 1430
1b539bd6 1431 for (unsigned i = 0; i < s->queue_count; i++) {
7838c67f
SH
1432 NVMeQueuePair *q = s->queues[i];
1433
1434 q->completion_bh =
1435 aio_bh_new(new_context, nvme_process_completion_bh, q);
1436 }
bdd6a90a
FZ
1437}
1438
1439static void nvme_aio_plug(BlockDriverState *bs)
1440{
1441 BDRVNVMeState *s = bs->opaque;
2f0d8947
PB
1442 assert(!s->plugged);
1443 s->plugged = true;
bdd6a90a
FZ
1444}
1445
1446static void nvme_aio_unplug(BlockDriverState *bs)
1447{
bdd6a90a
FZ
1448 BDRVNVMeState *s = bs->opaque;
1449 assert(s->plugged);
2f0d8947 1450 s->plugged = false;
1b539bd6 1451 for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
2f0d8947
PB
1452 NVMeQueuePair *q = s->queues[i];
1453 qemu_mutex_lock(&q->lock);
b75fd5f5
SH
1454 nvme_kick(q);
1455 nvme_process_completion(q);
2f0d8947 1456 qemu_mutex_unlock(&q->lock);
bdd6a90a
FZ
1457 }
1458}
1459
9ed61612
FZ
1460static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1461{
1462 int ret;
1463 BDRVNVMeState *s = bs->opaque;
1464
1465 ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
1466 if (ret) {
1467 /* FIXME: we may run out of IOVA addresses after repeated
1468 * bdrv_register_buf/bdrv_unregister_buf, because qemu_vfio_dma_unmap
1469 * doesn't reclaim addresses for fixed mappings. */
1470 error_report("nvme_register_buf failed: %s", strerror(-ret));
1471 }
1472}
1473
1474static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1475{
1476 BDRVNVMeState *s = bs->opaque;
1477
1478 qemu_vfio_dma_unmap(s->vfio, host);
1479}
1480
f25e7ab2
PMD
1481static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
1482{
1483 BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
1484 BDRVNVMeState *s = bs->opaque;
1485
1486 stats->driver = BLOCKDEV_DRIVER_NVME;
1487 stats->u.nvme = (BlockStatsSpecificNvme) {
1488 .completion_errors = s->stats.completion_errors,
1489 .aligned_accesses = s->stats.aligned_accesses,
1490 .unaligned_accesses = s->stats.unaligned_accesses,
1491 };
1492
1493 return stats;
1494}
1495
2654267c
HR
1496static const char *const nvme_strong_runtime_opts[] = {
1497 NVME_BLOCK_OPT_DEVICE,
1498 NVME_BLOCK_OPT_NAMESPACE,
1499
1500 NULL
1501};
1502
bdd6a90a
FZ
1503static BlockDriver bdrv_nvme = {
1504 .format_name = "nvme",
1505 .protocol_name = "nvme",
1506 .instance_size = sizeof(BDRVNVMeState),
1507
5a5e7f8c
ML
1508 .bdrv_co_create_opts = bdrv_co_create_opts_simple,
1509 .create_opts = &bdrv_create_opts_simple,
1510
bdd6a90a
FZ
1511 .bdrv_parse_filename = nvme_parse_filename,
1512 .bdrv_file_open = nvme_file_open,
1513 .bdrv_close = nvme_close,
1514 .bdrv_getlength = nvme_getlength,
118d1b6a 1515 .bdrv_probe_blocksizes = nvme_probe_blocksizes,
bdd6a90a
FZ
1516
1517 .bdrv_co_preadv = nvme_co_preadv,
1518 .bdrv_co_pwritev = nvme_co_pwritev,
e0dd95e3
ML
1519
1520 .bdrv_co_pwrite_zeroes = nvme_co_pwrite_zeroes,
e87a09d6 1521 .bdrv_co_pdiscard = nvme_co_pdiscard,
e0dd95e3 1522
bdd6a90a
FZ
1523 .bdrv_co_flush_to_disk = nvme_co_flush,
1524 .bdrv_reopen_prepare = nvme_reopen_prepare,
1525
bdd6a90a
FZ
1526 .bdrv_refresh_filename = nvme_refresh_filename,
1527 .bdrv_refresh_limits = nvme_refresh_limits,
2654267c 1528 .strong_runtime_opts = nvme_strong_runtime_opts,
f25e7ab2 1529 .bdrv_get_specific_stats = nvme_get_specific_stats,
bdd6a90a
FZ
1530
1531 .bdrv_detach_aio_context = nvme_detach_aio_context,
1532 .bdrv_attach_aio_context = nvme_attach_aio_context,
1533
1534 .bdrv_io_plug = nvme_aio_plug,
1535 .bdrv_io_unplug = nvme_aio_unplug,
9ed61612
FZ
1536
1537 .bdrv_register_buf = nvme_register_buf,
1538 .bdrv_unregister_buf = nvme_unregister_buf,
bdd6a90a
FZ
1539};
1540
1541static void bdrv_nvme_init(void)
1542{
1543 bdrv_register(&bdrv_nvme);
1544}
1545
1546block_init(bdrv_nvme_init);