/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over PCIe transport
 */
#include <sys/syscall.h>
#include <sys/types.h>

#include "nvme_internal.h"
#include "nvme_uevent.h"

#define NVME_ADMIN_ENTRIES	(128)
#define NVME_ADMIN_TRACKERS	(64)

/*
 * NVME_IO_ENTRIES defines the size of an I/O qpair's submission and completion
 * queues, while NVME_IO_TRACKERS defines the maximum number of I/O commands that
 * we will allow to be outstanding on an I/O qpair at any time. The only advantage
 * in having IO_ENTRIES > IO_TRACKERS is for debugging purposes - when dumping
 * the contents of the submission and completion queues, it will show a longer
 * history of commands.
 */
#define NVME_IO_ENTRIES		(256)
#define NVME_IO_TRACKERS	(128)

/*
 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
 * segment.
 */
#define NVME_MAX_SGL_DESCRIPTORS	(253)

#define NVME_MAX_PRP_LIST_ENTRIES	(506)

/*
 * For commands requiring more than 2 PRP entries, one PRP will be
 * embedded in the command (prp1), and the rest of the PRP entries
 * will be in a list pointed to by the command (prp2). This means
 * that the real maximum number of PRP entries we support is 506 + 1,
 * which results in a maximum transfer size of 506 * PAGE_SIZE.
 */
#define NVME_MAX_XFER_SIZE	(NVME_MAX_PRP_LIST_ENTRIES * PAGE_SIZE)
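/*
 * Illustrative sizing check (not driver logic): assuming a 4 KiB PAGE_SIZE,
 * NVME_MAX_XFER_SIZE works out to 506 * 4096 = 2,072,576 bytes, just under
 * 2 MiB. The first page of a transfer is described by prp1 itself, so a
 * maximally sized transfer consumes 1 + 506 PRP entries in total.
 */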
struct nvme_pcie_enum_ctx {
	spdk_nvme_probe_cb probe_cb;
	void *cb_ctx;
	struct spdk_pci_addr pci_addr;
	bool has_pci_addr;
};
/* PCIe transport extensions for spdk_nvme_ctrlr */
struct nvme_pcie_ctrlr {
	struct spdk_nvme_ctrlr ctrlr;

	/** NVMe MMIO register space */
	volatile struct spdk_nvme_registers *regs;

	/** NVMe MMIO register size */
	uint64_t regs_size;

	/* BAR mapping address which contains controller memory buffer */
	void *cmb_bar_virt_addr;

	/* BAR physical address which contains controller memory buffer */
	uint64_t cmb_bar_phys_addr;

	/* Controller memory buffer size in Bytes */
	uint64_t cmb_size;

	/* Current offset of controller memory buffer */
	uint64_t cmb_current_offset;

	/** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
	uint32_t doorbell_stride_u32;

	/* Opaque handle to associated PCI device. */
	struct spdk_pci_device *devhandle;

	/* Flag to indicate the MMIO register has been remapped */
	bool is_remapped;
};
struct nvme_tracker {
	TAILQ_ENTRY(nvme_tracker) tq_list;

	struct nvme_request *req;
	uint16_t cid;

	uint16_t active : 1;
	uint16_t timed_out : 1;

	/* The value of spdk_get_ticks() when the tracker was submitted to the hardware. */
	uint64_t submit_tick;

	uint64_t prp_sgl_bus_addr;

	union {
		uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
		struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
	} u;
};
/*
 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
 * and so that there is no padding required to meet alignment requirements.
 */
SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
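/*
 * Size arithmetic behind the asserts above (illustrative): the prp[] arm of
 * the union is 506 * 8 = 4048 bytes and the sgl[] arm is 253 * 16 = 4048
 * bytes, so either arm plus the tracker's 48 bytes of bookkeeping fields
 * fits exactly in one 4096-byte page.
 */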
/* PCIe transport extensions for spdk_nvme_qpair */
struct nvme_pcie_qpair {
	/* Submission queue tail doorbell */
	volatile uint32_t *sq_tdbl;

	/* Completion queue head doorbell */
	volatile uint32_t *cq_hdbl;

	/* Submission queue */
	struct spdk_nvme_cmd *cmd;

	/* Completion queue */
	struct spdk_nvme_cpl *cpl;

	TAILQ_HEAD(, nvme_tracker) free_tr;
	TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;

	/* Array of trackers indexed by command ID. */
	struct nvme_tracker *tr;

	uint16_t num_entries;

	uint16_t sq_tail;
	uint16_t cq_head;

	uint8_t phase;

	bool is_enabled;

	bool sq_in_cmb;

	/*
	 * Base qpair structure.
	 * This is located after the hot data in this structure so that the important parts of
	 * nvme_pcie_qpair are in the same cache line.
	 */
	struct spdk_nvme_qpair qpair;

	/*
	 * Fields below this point should not be touched on the normal I/O path.
	 */
	uint64_t cmd_bus_addr;
	uint64_t cpl_bus_addr;
};
static int nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx,
				  struct spdk_pci_addr *pci_addr);
static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair);
static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);

__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
static volatile uint16_t g_signal_lock;
static bool g_sigset = false;
static int hotplug_fd = -1;
static void
nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
{
	void *map_address;

	if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) {
		return;
	}

	assert(g_thread_mmio_ctrlr != NULL);

	if (!g_thread_mmio_ctrlr->is_remapped) {
		map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
				   PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
		if (map_address == MAP_FAILED) {
			SPDK_ERRLOG("mmap failed\n");
			g_signal_lock = 0;
			return;
		}
		memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
		g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
		g_thread_mmio_ctrlr->is_remapped = true;
	}
	g_signal_lock = 0;
}
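/*
 * How the remap above plays out (illustrative walkthrough, not driver code):
 * after a surprise hot-remove, an in-flight MMIO access faults with SIGBUS.
 * The handler replaces the register window with anonymous memory filled with
 * 0xFF, so the faulting access - and every later register read - returns
 * all-ones. Callers such as nvme_pcie_ctrlr_get_reg_4() below detect that
 * condition with the ~(*value) == 0 check and report the device as gone.
 */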
static void
nvme_pcie_ctrlr_setup_signal(void)
{
	struct sigaction sa;

	sa.sa_sigaction = nvme_sigbus_fault_sighandler;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
	g_sigset = true;
}
static int
_nvme_pcie_hotplug_monitor(void *cb_ctx, spdk_nvme_probe_cb probe_cb,
			   spdk_nvme_remove_cb remove_cb)
{
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_uevent event;
	struct spdk_pci_addr pci_addr;

	while (spdk_get_uevent(hotplug_fd, &event) > 0) {
		if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO) {
			if (event.action == SPDK_NVME_UEVENT_ADD) {
				SPDK_TRACELOG(SPDK_TRACE_NVME, "add nvme address: %s\n",
					      event.traddr);
				if (spdk_process_is_primary()) {
					if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
						nvme_pcie_ctrlr_attach(probe_cb, cb_ctx, &pci_addr);
					}
				}
			} else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
				bool in_list = false;

				TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->attached_ctrlrs, tailq) {
					if (strcmp(event.traddr, ctrlr->trid.traddr) == 0) {
						in_list = true;
						break;
					}
				}
				if (in_list == false) {
					continue;
				}
				SPDK_TRACELOG(SPDK_TRACE_NVME, "remove nvme address: %s\n",
					      event.traddr);

				nvme_ctrlr_fail(ctrlr, true);

				/* get the user app to clean up and stop I/O */
				if (remove_cb) {
					nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
					remove_cb(cb_ctx, ctrlr);
					nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
				}
			}
		}
	}

	return 0;
}
static inline struct nvme_pcie_ctrlr *
nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
	return (struct nvme_pcie_ctrlr *)((uintptr_t)ctrlr - offsetof(struct nvme_pcie_ctrlr, ctrlr));
}

static inline struct nvme_pcie_qpair *
nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
{
	assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
	return (struct nvme_pcie_qpair *)((uintptr_t)qpair - offsetof(struct nvme_pcie_qpair, qpair));
}
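/*
 * Both helpers above recover the transport-private wrapper from a pointer to
 * an embedded member - the classic container_of pattern. Minimal sketch with
 * a hypothetical struct (illustration only):
 *
 *   struct wrapper { int hot_data; struct spdk_nvme_qpair qpair; };
 *
 *   given q == &w->qpair:
 *   (struct wrapper *)((uintptr_t)q - offsetof(struct wrapper, qpair)) == w
 */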
static volatile void *
nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	return (volatile void *)((uintptr_t)pctrlr->regs + offset);
}

static int
nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

static int
nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
	g_thread_mmio_ctrlr = NULL;
	return 0;
}

static int
nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	if (~(*value) == 0) {
		/* All-ones read: the controller has likely been removed. */
		return -EIO;
	}

	return 0;
}

static int
nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
	assert(value != NULL);
	g_thread_mmio_ctrlr = pctrlr;
	*value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
	g_thread_mmio_ctrlr = NULL;
	if (~(*value) == 0) {
		/* All-ones read: the controller has likely been removed. */
		return -EIO;
	}

	return 0;
}
static int
nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
					 value);
}

static int
nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
{
	return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
					 value);
}

static int
nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
{
	return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
					 aqa->raw);
}

static int
nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
					 &cmbloc->raw);
}

static int
nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
{
	return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
					 &cmbsz->raw);
}
uint32_t
nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	return NVME_MAX_XFER_SIZE;
}

uint16_t
nvme_pcie_ctrlr_get_max_io_queue_size(struct spdk_nvme_ctrlr *ctrlr)
{
	return NVME_IO_ENTRIES;
}
static void
nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr;
	uint32_t bir;
	union spdk_nvme_cmbsz_register cmbsz;
	union spdk_nvme_cmbloc_register cmbloc;
	uint64_t size, unit_size, offset, bar_size, bar_phys_addr;

	if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
	    nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
		SPDK_ERRLOG("get registers failed\n");
		goto exit;
	}

	if (!cmbsz.bits.sz)
		goto exit;

	bir = cmbloc.bits.bir;
	/* Values 0 2 3 4 5 are valid for BAR */
	if (bir > 5 || bir == 1)
		goto exit;

	/* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
	/* controller memory buffer size in Bytes */
	size = unit_size * cmbsz.bits.sz;
	/* controller memory buffer offset from BAR in Bytes */
	offset = unit_size * cmbloc.bits.ofst;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
				     &bar_phys_addr, &bar_size);
	if ((rc != 0) || addr == NULL) {
		goto exit;
	}

	if (offset > bar_size) {
		goto exit;
	}

	if (size > bar_size - offset) {
		goto exit;
	}

	pctrlr->cmb_bar_virt_addr = addr;
	pctrlr->cmb_bar_phys_addr = bar_phys_addr;
	pctrlr->cmb_size = size;
	pctrlr->cmb_current_offset = offset;

	if (!cmbsz.bits.sqs) {
		pctrlr->ctrlr.opts.use_cmb_sqs = false;
	}

	return;
exit:
	pctrlr->cmb_bar_virt_addr = NULL;
	pctrlr->ctrlr.opts.use_cmb_sqs = false;
	return;
}
static int
nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	union spdk_nvme_cmbloc_register cmbloc;
	void *addr = pctrlr->cmb_bar_virt_addr;

	if (addr) {
		if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
			SPDK_ERRLOG("get_cmbloc() failed\n");
			return -EIO;
		}
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
	}
	return rc;
}
static int
nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
			  uint64_t *offset)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	uint64_t round_offset;

	round_offset = pctrlr->cmb_current_offset;
	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);

	if (round_offset + length > pctrlr->cmb_size)
		return -1;

	*offset = round_offset;
	pctrlr->cmb_current_offset = round_offset + length;

	return 0;
}
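/*
 * The rounding above is the usual power-of-two align-up trick; it assumes
 * "aligned" is a power of two. Worked example (illustrative): with
 * cmb_current_offset = 0x1234 and aligned = 0x1000,
 * (0x1234 + 0xFFF) & ~0xFFF = 0x2000.
 */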
static int
nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc;
	void *addr;
	uint64_t phys_addr, size;

	rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
				     &phys_addr, &size);
	pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
	if ((pctrlr->regs == NULL) || (rc != 0)) {
		SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
			    rc, pctrlr->regs);
		return -1;
	}

	pctrlr->regs_size = size;
	nvme_pcie_ctrlr_map_cmb(pctrlr);

	return 0;
}
static int
nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
{
	int rc = 0;
	void *addr = (void *)pctrlr->regs;

	if (pctrlr->ctrlr.is_removed) {
		return rc;
	}

	rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
	if (rc != 0) {
		SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
		return -1;
	}

	if (addr) {
		/* NOTE: addr may have been remapped here. We're relying on DPDK to call
		 * munmap internally.
		 */
		rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
	}
	return rc;
}
static int
nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_qpair *pqpair;
	int rc;

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL);
	if (pqpair == NULL) {
		return -ENOMEM;
	}

	pqpair->num_entries = NVME_ADMIN_ENTRIES;

	ctrlr->adminq = &pqpair->qpair;

	rc = nvme_qpair_init(ctrlr->adminq,
			     0, /* qpair ID */
			     ctrlr,
			     SPDK_NVME_QPRIO_URGENT,
			     NVME_ADMIN_ENTRIES);
	if (rc != 0) {
		return rc;
	}

	return nvme_pcie_qpair_construct(ctrlr->adminq);
}
/* This function must only be called while holding g_spdk_nvme_driver->lock */
static int
pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
{
	struct spdk_nvme_transport_id trid = {};
	struct nvme_pcie_enum_ctx *enum_ctx = ctx;
	struct spdk_nvme_ctrlr *ctrlr;
	int rc = 0;
	struct spdk_pci_addr pci_addr;

	pci_addr = spdk_pci_device_get_addr(pci_dev);

	trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
	spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);

	/* Verify that this controller is not already attached */
	TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->attached_ctrlrs, tailq) {
		/* NOTE: In the case like multi-process environment where the device handle is
		 * different per each process, we compare by BDF to determine whether it is the
		 * same controller.
		 */
		if (strcmp(trid.traddr, ctrlr->trid.traddr) == 0) {
			if (!spdk_process_is_primary()) {
				rc = nvme_ctrlr_add_process(ctrlr, pci_dev);
			}
			return rc;
		}
	}

	/* check whether user passes the pci_addr */
	if (enum_ctx->has_pci_addr &&
	    (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
		return 1;
	}

	return nvme_ctrlr_probe(&trid, pci_dev,
				enum_ctx->probe_cb, enum_ctx->cb_ctx);
}
int
nvme_pcie_ctrlr_scan(const struct spdk_nvme_transport_id *trid,
		     void *cb_ctx,
		     spdk_nvme_probe_cb probe_cb,
		     spdk_nvme_remove_cb remove_cb)
{
	struct nvme_pcie_enum_ctx enum_ctx = {};

	enum_ctx.probe_cb = probe_cb;
	enum_ctx.cb_ctx = cb_ctx;

	if (strlen(trid->traddr) != 0) {
		if (spdk_pci_addr_parse(&enum_ctx.pci_addr, trid->traddr)) {
			return -1;
		}
		enum_ctx.has_pci_addr = true;
	}

	if (hotplug_fd < 0) {
		hotplug_fd = spdk_uevent_connect();
		if (hotplug_fd < 0) {
			SPDK_TRACELOG(SPDK_TRACE_NVME, "Failed to open uevent netlink socket\n");
		}
	} else {
		_nvme_pcie_hotplug_monitor(cb_ctx, probe_cb, remove_cb);
	}

	if (enum_ctx.has_pci_addr == false) {
		return spdk_pci_nvme_enumerate(pcie_nvme_enum_cb, &enum_ctx);
	} else {
		return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
	}
}
static int
nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx, struct spdk_pci_addr *pci_addr)
{
	struct nvme_pcie_enum_ctx enum_ctx;

	enum_ctx.probe_cb = probe_cb;
	enum_ctx.cb_ctx = cb_ctx;
	/* We are attaching to a specific address, so no BDF filtering is needed. */
	enum_ctx.has_pci_addr = false;

	return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, pci_addr);
}
struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
		const struct spdk_nvme_ctrlr_opts *opts,
		void *devhandle)
{
	struct spdk_pci_device *pci_dev = devhandle;
	struct nvme_pcie_ctrlr *pctrlr;
	union spdk_nvme_cap_register cap;
	uint32_t cmd_reg;
	int rc;
	struct spdk_pci_id pci_id;

	pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL);
	if (pctrlr == NULL) {
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	pctrlr->is_remapped = false;
	pctrlr->ctrlr.is_removed = false;
	pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
	pctrlr->devhandle = devhandle;
	pctrlr->ctrlr.opts = *opts;
	memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));

	rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
	if (rc != 0) {
		spdk_free(pctrlr);
		return NULL;
	}

	/* Enable PCI busmaster and disable INTx */
	spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
	cmd_reg |= 0x404;
	spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);

	if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
		SPDK_ERRLOG("get_cap() failed\n");
		spdk_free(pctrlr);
		return NULL;
	}

	nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap);

	/* Doorbell stride is 2 ^ (dstrd + 2),
	 * but we want multiples of 4, so drop the + 2. */
	pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
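	/*
	 * Worked example (illustrative): CAP.DSTRD = 0 gives a stride of
	 * 1 uint32_t (doorbells packed 4 bytes apart); CAP.DSTRD = 1 gives
	 * a stride of 2 (8 bytes apart), and so on - i.e. the register
	 * spacing is (4 << CAP.DSTRD) bytes.
	 */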
	rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	pci_id = spdk_pci_device_get_id(pci_dev);
	pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);

	rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	/* Construct the primary process properties */
	rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
	if (rc != 0) {
		nvme_ctrlr_destruct(&pctrlr->ctrlr);
		return NULL;
	}

	if (g_sigset != true) {
		nvme_pcie_ctrlr_setup_signal();
	}

	return &pctrlr->ctrlr;
}
int
nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
	union spdk_nvme_aqa_register aqa;

	if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
		SPDK_ERRLOG("set_asq() failed\n");
		return -EIO;
	}

	if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
		SPDK_ERRLOG("set_acq() failed\n");
		return -EIO;
	}

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
	aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;

	if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
		SPDK_ERRLOG("set_aqa() failed\n");
		return -EIO;
	}

	return 0;
}
int
nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);

	if (ctrlr->adminq) {
		nvme_pcie_qpair_destroy(ctrlr->adminq);
	}

	nvme_ctrlr_free_processes(ctrlr);

	nvme_pcie_ctrlr_free_bars(pctrlr);
	spdk_pci_device_detach(pctrlr->devhandle);
	spdk_free(pctrlr);

	return 0;
}
static void
nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
{
	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
	tr->cid = cid;
	tr->active = false;
}
int
nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	pqpair->sq_tail = pqpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set the phase
	 * bit on completions to 1. So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue wraps around.
	 */
	pqpair->phase = 1;

	memset(pqpair->cmd, 0,
	       pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
	memset(pqpair->cpl, 0,
	       pqpair->num_entries * sizeof(struct spdk_nvme_cpl));

	return 0;
}
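/*
 * Phase-bit walkthrough (illustrative): on a freshly reset 4-entry queue the
 * host expects P = 1. After the hardware posts entries 0..3, cq_head wraps to
 * 0 and the expected phase flips to 0, so stale entries from the previous
 * pass (still carrying P = 1) are not mistaken for new completions.
 */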
static int
nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	uint16_t i;
	volatile uint32_t *doorbell_base;
	uint64_t phys_addr = 0;
	uint64_t offset;
	uint16_t num_trackers;

	if (qpair->id == 0) {
		num_trackers = NVME_ADMIN_TRACKERS;
	} else {
		/*
		 * No need to have more trackers than entries in the submit queue.
		 * Note also that for a queue size of N, we can only have (N-1)
		 * commands outstanding, hence the "-1" here.
		 */
		num_trackers = spdk_min(NVME_IO_TRACKERS, pqpair->num_entries - 1);
	}

	assert(num_trackers != 0);

	pqpair->sq_in_cmb = false;

	/* cmd and cpl rings must be aligned on 4KB boundaries. */
	if (ctrlr->opts.use_cmb_sqs) {
		if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
					      0x1000, &offset) == 0) {
			pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
			pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
			pqpair->sq_in_cmb = true;
		}
	}
	if (pqpair->sq_in_cmb == false) {
		pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
					   0x1000,
					   &pqpair->cmd_bus_addr);
		if (pqpair->cmd == NULL) {
			SPDK_ERRLOG("alloc qpair_cmd failed\n");
			return -ENOMEM;
		}
	}

	pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
				   0x1000,
				   &pqpair->cpl_bus_addr);
	if (pqpair->cpl == NULL) {
		SPDK_ERRLOG("alloc qpair_cpl failed\n");
		return -ENOMEM;
	}

	doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
	pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
	pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;

	/*
	 * Reserve space for all of the trackers in a single allocation.
	 * struct nvme_tracker must be padded so that its size is already a power of 2.
	 * This ensures the PRP list embedded in the nvme_tracker object will not span a
	 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
	 */
	pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), &phys_addr);
	if (pqpair->tr == NULL) {
		SPDK_ERRLOG("nvme_tr failed\n");
		return -ENOMEM;
	}

	TAILQ_INIT(&pqpair->free_tr);
	TAILQ_INIT(&pqpair->outstanding_tr);

	for (i = 0; i < num_trackers; i++) {
		tr = &pqpair->tr[i];
		nvme_qpair_construct_tracker(tr, i, phys_addr);
		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
		phys_addr += sizeof(struct nvme_tracker);
	}

	nvme_pcie_qpair_reset(qpair);

	return 0;
}
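/*
 * Doorbell layout resulting from the assignments above (illustrative,
 * assuming doorbell_stride_u32 == 1): qpair 0 uses doorbell[0].sq_tdbl and
 * doorbell[0].cq_hdbl at byte offsets 0x1000 and 0x1004 of the register
 * space; qpair 1's pair lives at 0x1008 and 0x100C, and so on.
 */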
static inline void
nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
{
	/* dst and src are known to be non-overlapping and 64-byte aligned. */
#if defined(__AVX__)
	__m256i *d256 = (__m256i *)dst;
	const __m256i *s256 = (const __m256i *)src;

	_mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
	_mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
#elif defined(__SSE2__)
	__m128i *d128 = (__m128i *)dst;
	const __m128i *s128 = (const __m128i *)src;

	_mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
	_mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
	_mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
	_mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
#else
	*dst = *src;
#endif
}
/*
 * Note: the ctrlr_lock must be held when calling this function.
 */
static void
nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
		struct nvme_request *req, struct spdk_nvme_cpl *cpl)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *active_req = req;
	struct spdk_nvme_ctrlr_process *active_proc;
	bool pending_on_proc = false;

	/*
	 * The admin request is from another process. Move to the per
	 * process list for that process to handle it later.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));
	assert(active_req->pid != getpid());

	TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) {
		if (active_proc->pid == active_req->pid) {
			/* Save the original completion information */
			memcpy(&active_req->cpl, cpl, sizeof(*cpl));
			STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
			pending_on_proc = true;
			break;
		}
	}

	if (pending_on_proc == false) {
		SPDK_ERRLOG("The owning process (pid %d) is not found. Drop the request.\n",
			    active_req->pid);

		nvme_free_request(active_req);
	}
}
/*
 * Note: the ctrlr_lock must be held when calling this function.
 */
static void
nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_request *req, *tmp_req;
	bool proc_found = false;
	pid_t pid = getpid();
	struct spdk_nvme_ctrlr_process *proc;

	/*
	 * Check whether there is any pending admin request from
	 * other active processes.
	 */
	assert(nvme_qpair_is_admin_queue(qpair));

	TAILQ_FOREACH(proc, &ctrlr->active_procs, tailq) {
		if (proc->pid == pid) {
			proc_found = true;
			break;
		}
	}

	if (proc_found == false) {
		SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
		assert(proc_found);
		return;
	}

	STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
		STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);

		assert(req->pid == pid);

		if (req->cb_fn) {
			req->cb_fn(req->cb_arg, &req->cpl);
		}

		nvme_free_request(req);
	}
}
static void
nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);

	tr->timed_out = 0;
	tr->submit_tick = spdk_get_ticks();

	req = tr->req;
	pqpair->tr[tr->cid].active = true;

	/* Copy the command from the tracker to the submission queue. */
	nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);

	if (++pqpair->sq_tail == pqpair->num_entries) {
		pqpair->sq_tail = 0;
	}

	/* Ensure the command is visible to the device before ringing the doorbell. */
	spdk_wmb();
	g_thread_mmio_ctrlr = pctrlr;
	spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
	g_thread_mmio_ctrlr = NULL;
}
static void
nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
				 struct spdk_nvme_cpl *cpl, bool print_on_error)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_request *req;
	bool retry, error, was_active;
	bool req_from_current_proc = true;

	req = tr->req;

	assert(req != NULL);

	error = spdk_nvme_cpl_is_error(cpl);
	retry = error && nvme_completion_is_retry(cpl) &&
		req->retries < spdk_nvme_retry_count;

	if (error && print_on_error) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	was_active = pqpair->tr[cpl->cid].active;
	pqpair->tr[cpl->cid].active = false;

	assert(cpl->cid == req->cmd.cid);

	if (retry) {
		req->retries++;
		nvme_pcie_qpair_submit_tracker(qpair, tr);
	} else {
		if (was_active) {
			/* Only check admin requests from different processes. */
			if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
				req_from_current_proc = false;
				nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
			} else if (req->cb_fn) {
				req->cb_fn(req->cb_arg, cpl);
			}
		}

		if (req_from_current_proc == true) {
			nvme_free_request(req);
		}

		tr->req = NULL;

		TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
		TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			nvme_qpair_submit_request(qpair, req);
		}
	}
}
static void
nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
					struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
					bool print_on_error)
{
	struct spdk_nvme_cpl cpl;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status.sct = sct;
	cpl.status.sc = sc;
	cpl.status.dnr = dnr;
	nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
}
static void
nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr, *temp;

	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
		SPDK_ERRLOG("aborting outstanding command\n");
		nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
							SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
	}
}
static void
nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;

	tr = TAILQ_FIRST(&pqpair->outstanding_tr);
	while (tr != NULL) {
		assert(tr->req != NULL);
		if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
								SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
								false);
			tr = TAILQ_FIRST(&pqpair->outstanding_tr);
		} else {
			tr = TAILQ_NEXT(tr, tq_list);
		}
	}
}
static void
nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	nvme_pcie_admin_qpair_abort_aers(qpair);
}

static int
nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_pcie_admin_qpair_destroy(qpair);
	}
	if (pqpair->cmd && !pqpair->sq_in_cmb) {
		spdk_free(pqpair->cmd);
	}
	if (pqpair->cpl) {
		spdk_free(pqpair->cpl);
	}
	if (pqpair->tr) {
		spdk_free(pqpair->tr);
	}

	spdk_free(pqpair);

	return 0;
}
static void
nvme_pcie_admin_qpair_enable(struct spdk_nvme_qpair *qpair)
{
	/*
	 * Manually abort each outstanding admin command. Do not retry
	 * admin commands found here, since they will be left over from
	 * a controller reset and it's likely the context in which the
	 * command was issued no longer applies.
	 */
	nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);
}

static void
nvme_pcie_io_qpair_enable(struct spdk_nvme_qpair *qpair)
{
	/* Manually abort each outstanding I/O. */
	nvme_pcie_qpair_abort_trackers(qpair, 0);
}

int
nvme_pcie_qpair_enable(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	pqpair->is_enabled = true;
	if (nvme_qpair_is_io_queue(qpair)) {
		nvme_pcie_io_qpair_enable(qpair);
	} else {
		nvme_pcie_admin_qpair_enable(qpair);
	}

	return 0;
}
static void
nvme_pcie_admin_qpair_disable(struct spdk_nvme_qpair *qpair)
{
	nvme_pcie_admin_qpair_abort_aers(qpair);
}

static void
nvme_pcie_io_qpair_disable(struct spdk_nvme_qpair *qpair)
{
}

int
nvme_pcie_qpair_disable(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	pqpair->is_enabled = false;
	if (nvme_qpair_is_io_queue(qpair)) {
		nvme_pcie_io_qpair_disable(qpair);
	} else {
		nvme_pcie_admin_qpair_disable(qpair);
	}

	return 0;
}

int
nvme_pcie_qpair_fail(struct spdk_nvme_qpair *qpair)
{
	nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);

	return 0;
}
*ctrlr
,
1258 struct spdk_nvme_qpair
*io_que
, spdk_nvme_cmd_cb cb_fn
,
1261 struct nvme_pcie_qpair
*pqpair
= nvme_pcie_qpair(io_que
);
1262 struct nvme_request
*req
;
1263 struct spdk_nvme_cmd
*cmd
;
1265 req
= nvme_allocate_request_null(ctrlr
->adminq
, cb_fn
, cb_arg
);
1271 cmd
->opc
= SPDK_NVME_OPC_CREATE_IO_CQ
;
1274 * TODO: create a create io completion queue command data
1277 cmd
->cdw10
= ((pqpair
->num_entries
- 1) << 16) | io_que
->id
;
1279 * 0x2 = interrupts enabled
1280 * 0x1 = physically contiguous
1283 cmd
->dptr
.prp
.prp1
= pqpair
->cpl_bus_addr
;
1285 return nvme_ctrlr_submit_admin_request(ctrlr
, req
);
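/*
 * cdw10 encoding sketch (illustrative): for a 256-entry completion queue
 * with ID 1, cdw10 = ((256 - 1) << 16) | 1 = 0x00FF0001, since the queue
 * size field is 0-based.
 */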
static int
nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
				 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;

	/*
	 * TODO: create a create io submission queue command data
	 * structure.
	 */
	cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
	/* 0x1 = physically contiguous */
	cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
	cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}
static int
nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
	cmd->cdw10 = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}

static int
nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
{
	struct nvme_request *req;
	struct spdk_nvme_cmd *cmd;

	req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
	if (req == NULL) {
		return -ENOMEM;
	}

	cmd = &req->cmd;
	cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
	cmd->cdw10 = qpair->id;

	return nvme_ctrlr_submit_admin_request(ctrlr, req);
}
static int
_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				 uint16_t qid)
{
	struct nvme_completion_poll_status status;
	int rc;

	status.done = false;
	rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}

	while (status.done == false) {
		spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
	}
	if (spdk_nvme_cpl_is_error(&status.cpl)) {
		SPDK_ERRLOG("nvme_create_io_cq failed!\n");
		return -1;
	}

	status.done = false;
	rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}

	while (status.done == false) {
		spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
	}
	if (spdk_nvme_cpl_is_error(&status.cpl)) {
		SPDK_ERRLOG("nvme_create_io_sq failed!\n");
		/* Attempt to delete the completion queue */
		status.done = false;
		rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
		if (rc != 0) {
			return -1;
		}
		while (status.done == false) {
			spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
		}
		return -1;
	}

	nvme_pcie_qpair_reset(qpair);

	return 0;
}
struct spdk_nvme_qpair *
nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
				enum spdk_nvme_qprio qprio)
{
	struct nvme_pcie_qpair *pqpair;
	struct spdk_nvme_qpair *qpair;
	int rc;

	assert(ctrlr != NULL);

	pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL);
	if (pqpair == NULL) {
		return NULL;
	}

	pqpair->num_entries = ctrlr->opts.io_queue_size;

	qpair = &pqpair->qpair;

	rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, ctrlr->opts.io_queue_requests);
	if (rc != 0) {
		nvme_pcie_qpair_destroy(qpair);
		return NULL;
	}

	rc = nvme_pcie_qpair_construct(qpair);
	if (rc != 0) {
		nvme_pcie_qpair_destroy(qpair);
		return NULL;
	}

	rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid);
	if (rc != 0) {
		SPDK_ERRLOG("I/O queue creation failed\n");
		nvme_pcie_qpair_destroy(qpair);
		return NULL;
	}

	return qpair;
}
int
nvme_pcie_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
}
int
nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct nvme_completion_poll_status status;
	int rc;

	assert(ctrlr != NULL);

	if (ctrlr->is_removed) {
		goto free;
	}

	/* Delete the I/O submission queue and then the completion queue */

	status.done = false;
	rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}
	while (status.done == false) {
		spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
	}
	if (spdk_nvme_cpl_is_error(&status.cpl)) {
		return -1;
	}

	status.done = false;
	rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
	if (rc != 0) {
		return rc;
	}
	while (status.done == false) {
		spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
	}
	if (spdk_nvme_cpl_is_error(&status.cpl)) {
		return -1;
	}

free:
	nvme_pcie_qpair_destroy(qpair);
	return 0;
}
static void
nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
{
	/*
	 * Bad vtophys translation, so abort this request and return
	 * error to caller.
	 */
	nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
						SPDK_NVME_SC_INVALID_FIELD,
						1 /* do not retry */, true);
}
/**
 * Build PRP list describing physically contiguous payload buffer.
 */
static int
nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				     struct nvme_tracker *tr)
{
	uint64_t phys_addr;
	void *seg_addr;
	uint32_t nseg, cur_nseg, modulo, unaligned;
	void *md_payload;
	void *payload = req->payload.u.contig + req->payload_offset;

	phys_addr = spdk_vtophys(payload);
	if (phys_addr == SPDK_VTOPHYS_ERROR) {
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
		return -1;
	}
	nseg = req->payload_size >> spdk_u32log2(PAGE_SIZE);
	modulo = req->payload_size & (PAGE_SIZE - 1);
	unaligned = phys_addr & (PAGE_SIZE - 1);
	if (modulo || unaligned) {
		nseg += 1 + ((modulo + unaligned - 1) >> spdk_u32log2(PAGE_SIZE));
	}

	if (req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		tr->req->cmd.mptr = spdk_vtophys(md_payload);
		if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}
	}

	tr->req->cmd.psdt = SPDK_NVME_PSDT_PRP;
	tr->req->cmd.dptr.prp.prp1 = phys_addr;
	if (nseg == 2) {
		seg_addr = payload + PAGE_SIZE - unaligned;
		tr->req->cmd.dptr.prp.prp2 = spdk_vtophys(seg_addr);
	} else if (nseg > 2) {
		cur_nseg = 1;
		tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
		while (cur_nseg < nseg) {
			seg_addr = payload + cur_nseg * PAGE_SIZE - unaligned;
			phys_addr = spdk_vtophys(seg_addr);
			if (phys_addr == SPDK_VTOPHYS_ERROR) {
				nvme_pcie_fail_request_bad_vtophys(qpair, tr);
				return -1;
			}
			tr->u.prp[cur_nseg - 1] = phys_addr;
			cur_nseg++;
		}
	}

	return 0;
}
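/*
 * PRP walkthrough for the contiguous path above (illustrative): a 12 KiB
 * buffer starting on a 4 KiB page boundary yields nseg = 3. prp1 points at
 * the first page, prp2 points at tr->u.prp[], and that list holds the
 * physical addresses of pages 2 and 3.
 */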
/**
 * Build SGL list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				     struct nvme_tracker *tr)
{
	int rc;
	void *virt_addr;
	uint64_t phys_addr;
	uint32_t remaining_transfer_len, length;
	struct spdk_nvme_sgl_descriptor *sgl;
	uint32_t nseg = 0;

	/*
	 * Build scattered payloads.
	 */
	assert(req->payload_size != 0);
	assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.u.sgl.reset_sgl_fn != NULL);
	assert(req->payload.u.sgl.next_sge_fn != NULL);
	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg, req->payload_offset);

	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {
		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		rc = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		phys_addr = spdk_vtophys(virt_addr);
		if (phys_addr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		length = spdk_min(remaining_transfer_len, length);
		remaining_transfer_len -= length;

		sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		sgl->unkeyed.length = length;
		sgl->address = phys_addr;
		sgl->unkeyed.subtype = 0;

		sgl++;
		nseg++;
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* For now we can only support 1 SGL segment in NVMe controller */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;
}
/**
 * Build PRP list describing scattered payload buffer.
 */
static int
nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				       struct nvme_tracker *tr)
{
	int rc;
	void *virt_addr;
	uint64_t phys_addr;
	uint64_t prp2 = 0;
	uint32_t data_transferred, remaining_transfer_len, length;
	uint32_t nseg, cur_nseg, total_nseg, last_nseg, modulo, unaligned;
	uint32_t sge_count = 0;

	/*
	 * Build scattered payloads.
	 */
	assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.u.sgl.reset_sgl_fn != NULL);
	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	total_nseg = 0;
	last_nseg = 0;

	while (remaining_transfer_len > 0) {
		assert(req->payload.u.sgl.next_sge_fn != NULL);
		rc = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		phys_addr = spdk_vtophys(virt_addr);
		if (phys_addr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		/*
		 * Any incompatible sges should have been handled up in the splitting routine,
		 * but assert here as an additional check.
		 */
		assert((phys_addr & 0x3) == 0); /* Address must be dword aligned. */
		/* All SGEs except the last must end on a page boundary. */
		assert((length >= remaining_transfer_len) || _is_page_aligned(phys_addr + length));
		/* All SGEs except the first must start on a page boundary. */
		assert((sge_count == 0) || _is_page_aligned(phys_addr));

		data_transferred = spdk_min(remaining_transfer_len, length);

		nseg = data_transferred >> spdk_u32log2(PAGE_SIZE);
		modulo = data_transferred & (PAGE_SIZE - 1);
		unaligned = phys_addr & (PAGE_SIZE - 1);
		if (modulo || unaligned) {
			nseg += 1 + ((modulo + unaligned - 1) >> spdk_u32log2(PAGE_SIZE));
		}

		if (total_nseg == 0) {
			req->cmd.psdt = SPDK_NVME_PSDT_PRP;
			req->cmd.dptr.prp.prp1 = phys_addr;
			phys_addr -= unaligned;
		}

		total_nseg += nseg;
		sge_count++;
		remaining_transfer_len -= data_transferred;

		if (total_nseg == 2) {
			if (sge_count == 1) {
				tr->req->cmd.dptr.prp.prp2 = phys_addr + PAGE_SIZE;
			} else if (sge_count == 2) {
				tr->req->cmd.dptr.prp.prp2 = phys_addr;
			}
			/* save prp2 value */
			prp2 = tr->req->cmd.dptr.prp.prp2;
		} else if (total_nseg > 2) {
			if (sge_count == 1) {
				cur_nseg = 1;
			} else {
				cur_nseg = 0;
			}

			tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
			while (cur_nseg < nseg) {
				if (prp2) {
					tr->u.prp[0] = prp2;
					tr->u.prp[last_nseg + 1] = phys_addr + cur_nseg * PAGE_SIZE;
				} else {
					tr->u.prp[last_nseg] = phys_addr + cur_nseg * PAGE_SIZE;
				}
				last_nseg++;
				cur_nseg++;
			}
		}
	}

	return 0;
}
static bool
nvme_pcie_qpair_check_enabled(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (!pqpair->is_enabled &&
	    !qpair->ctrlr->is_resetting) {
		nvme_qpair_enable(qpair);
	}
	return pqpair->is_enabled;
}
int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	nvme_pcie_qpair_check_enabled(qpair);

	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL || !pqpair->is_enabled) {
		/*
		 * No tracker is available, or the qpair is disabled due to
		 * an in-progress controller-level reset.
		 *
		 * Put the request on the qpair's request queue to be
		 * processed when a tracker frees up via a command
		 * completion or when the controller reset is
		 * completed.
		 */
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		goto exit;
	}

	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	req->cmd.cid = tr->cid;

	if (req->payload_size == 0) {
		/* Null payload - leave PRP fields zeroed */
		rc = 0;
	} else if (req->payload.type == NVME_PAYLOAD_TYPE_CONTIG) {
		rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
	} else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL) {
		if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
			rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
		} else {
			rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
		}
	} else {
		assert(0);
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
		rc = -EINVAL;
	}

	if (rc == 0) {
		nvme_pcie_qpair_submit_tracker(qpair, tr);
	}

exit:
	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}
static void
nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct nvme_tracker *tr, *tmp;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;

	/* We don't want to expose the admin queue to the user,
	 * so when we're timing out admin commands set the
	 * qpair to NULL.
	 */
	if (qpair == ctrlr->adminq) {
		qpair = NULL;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		if (tr->timed_out) {
			continue;
		}

		if (qpair == NULL &&
		    tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			continue;
		}

		if (tr->submit_tick + ctrlr->timeout_ticks > t02) {
			/* The trackers are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}

		tr->timed_out = 1;
		ctrlr->timeout_cb_fn(ctrlr->timeout_cb_arg, ctrlr, qpair, tr->cid);
	}
}
int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
	struct nvme_tracker *tr;
	struct spdk_nvme_cpl *cpl;
	uint32_t num_completions = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;

	if (!nvme_pcie_qpair_check_enabled(qpair)) {
		/*
		 * qpair is not enabled, likely because a controller reset is
		 * in progress. Ignore the interrupt - any I/O that was
		 * associated with this interrupt will get retried when the
		 * reset is complete.
		 */
		return 0;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	if (max_completions == 0 || (max_completions > (pqpair->num_entries - 1U))) {
		/*
		 * max_completions == 0 means unlimited, but complete at most one
		 * queue depth batch of I/O at a time so that the completion
		 * queue doorbells don't wrap around.
		 */
		max_completions = pqpair->num_entries - 1;
	}

	while (1) {
		cpl = &pqpair->cpl[pqpair->cq_head];

		if (cpl->status.p != pqpair->phase)
			break;

		tr = &pqpair->tr[cpl->cid];

		if (tr->active) {
			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
			nvme_qpair_print_completion(qpair, cpl);
			assert(0);
		}

		if (++pqpair->cq_head == pqpair->num_entries) {
			pqpair->cq_head = 0;
			pqpair->phase = !pqpair->phase;
		}

		if (++num_completions == max_completions) {
			break;
		}
	}

	if (num_completions > 0) {
		g_thread_mmio_ctrlr = pctrlr;
		spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
		g_thread_mmio_ctrlr = NULL;
	}

	if (qpair->ctrlr->state == NVME_CTRLR_STATE_READY) {
		if (qpair->ctrlr->timeout_cb_fn) {
			/*
			 * User registered for timeout callback
			 */
			nvme_pcie_qpair_check_timeout(qpair);
		}
	}

	/* Before returning, complete any pending admin request. */
	if (nvme_qpair_is_admin_queue(qpair)) {
		nvme_pcie_qpair_complete_pending_admin_request(qpair);

		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return num_completions;
}