1/*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * Copyright (c) 2017, IBM Corporation.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 * * Neither the name of Intel Corporation nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35/*
36 * NVMe over PCIe transport
37 */
38
39#include "spdk/stdinc.h"
40#include "spdk/env.h"
41#include "spdk/likely.h"
42#include "nvme_internal.h"
43#include "nvme_uevent.h"
44
45/*
46 * Number of completion queue entries to process before ringing the
47 * completion queue doorbell.
48 */
49#define NVME_MIN_COMPLETIONS (1)
50#define NVME_MAX_COMPLETIONS (128)
51
52#define NVME_ADMIN_ENTRIES (128)
53
54/*
55 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
56 * segment.
57 */
58#define NVME_MAX_SGL_DESCRIPTORS (251)
59
60#define NVME_MAX_PRP_LIST_ENTRIES (505)
61
62struct nvme_pcie_enum_ctx {
63 struct spdk_nvme_probe_ctx *probe_ctx;
64 struct spdk_pci_addr pci_addr;
65 bool has_pci_addr;
66};
67
68/* PCIe transport extensions for spdk_nvme_ctrlr */
69struct nvme_pcie_ctrlr {
70 struct spdk_nvme_ctrlr ctrlr;
71
72 /** NVMe MMIO register space */
73 volatile struct spdk_nvme_registers *regs;
74
75 /** NVMe MMIO register size */
76 uint64_t regs_size;
77
78 /* BAR mapping address which contains controller memory buffer */
79 void *cmb_bar_virt_addr;
80
81 /* BAR physical address which contains controller memory buffer */
82 uint64_t cmb_bar_phys_addr;
83
84 /* Controller memory buffer size in Bytes */
85 uint64_t cmb_size;
86
87 /* Current offset of controller memory buffer, relative to start of BAR virt addr */
88 uint64_t cmb_current_offset;
89
90 /* Last valid offset into CMB, this differs if CMB memory registration occurs or not */
91 uint64_t cmb_max_offset;
92
93 void *cmb_mem_register_addr;
94 size_t cmb_mem_register_size;
95
96 bool cmb_io_data_supported;
97
98 /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
99 uint32_t doorbell_stride_u32;
100
101 /* Opaque handle to associated PCI device. */
102 struct spdk_pci_device *devhandle;
103
104 /* File descriptor returned from spdk_pci_device_claim(). Closed when ctrlr is detached. */
105 int claim_fd;
106
107 /* Flag to indicate the MMIO register has been remapped */
108 bool is_remapped;
109};
110
111struct nvme_tracker {
112 TAILQ_ENTRY(nvme_tracker) tq_list;
113
114 struct nvme_request *req;
115 uint16_t cid;
116
117 uint16_t rsvd0;
118 uint32_t rsvd1;
119
120 spdk_nvme_cmd_cb cb_fn;
121 void *cb_arg;
122
123 uint64_t prp_sgl_bus_addr;
124
125 union {
126 uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
127 struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
128 } u;
129};
130/*
131 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
132 * and so that there is no padding required to meet alignment requirements.
133 */
134SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
135SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
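/*
 * Note: the 4 KiB size follows from the layout above -- the fields before the
 * union occupy 56 bytes (two list pointers, req, cid, the reserved words,
 * cb_fn, cb_arg and prp_sgl_bus_addr), and the larger union member is
 * prp[505] * 8 = 4040 bytes, so 56 + 4040 = 4096. The static asserts guard
 * this against compiler padding differences.
 */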
136
137/* PCIe transport extensions for spdk_nvme_qpair */
138struct nvme_pcie_qpair {
139 /* Submission queue tail doorbell */
140 volatile uint32_t *sq_tdbl;
141
142 /* Completion queue head doorbell */
143 volatile uint32_t *cq_hdbl;
144
145 /* Submission queue */
146 struct spdk_nvme_cmd *cmd;
147
148 /* Completion queue */
149 struct spdk_nvme_cpl *cpl;
150
151 TAILQ_HEAD(, nvme_tracker) free_tr;
152 TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;
153
154 /* Array of trackers indexed by command ID. */
155 struct nvme_tracker *tr;
156
157 uint16_t num_entries;
158
159 uint16_t max_completions_cap;
160
161 uint16_t last_sq_tail;
162 uint16_t sq_tail;
163 uint16_t cq_head;
164 uint16_t sq_head;
165
166 struct {
167 uint8_t phase : 1;
168 uint8_t delay_pcie_doorbell : 1;
169 uint8_t has_shadow_doorbell : 1;
170 } flags;
171
172 /*
173 * Base qpair structure.
174 * This is located after the hot data in this structure so that the important parts of
175 * nvme_pcie_qpair are in the same cache line.
176 */
177 struct spdk_nvme_qpair qpair;
178
179 struct {
180 /* Submission queue shadow tail doorbell */
181 volatile uint32_t *sq_tdbl;
182
183 /* Completion queue shadow head doorbell */
184 volatile uint32_t *cq_hdbl;
185
186 /* Submission queue event index */
187 volatile uint32_t *sq_eventidx;
188
189 /* Completion queue event index */
190 volatile uint32_t *cq_eventidx;
191 } shadow_doorbell;
192
193 /*
194 * Fields below this point should not be touched on the normal I/O path.
195 */
196
197 bool sq_in_cmb;
198
199 uint64_t cmd_bus_addr;
200 uint64_t cpl_bus_addr;
201};
202
203static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx,
204 struct spdk_pci_addr *pci_addr);
205static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair);
206static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
207
208__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
209static volatile uint16_t g_signal_lock;
210static bool g_sigset = false;
211static int hotplug_fd = -1;
212
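/*
 * Note: when an NVMe device is surprise-removed, an in-flight MMIO access to
 * its BAR raises SIGBUS. The handler below remaps the register window to
 * anonymous memory filled with 0xFF, so subsequent register reads return
 * all-ones (the same value a removed PCIe device produces) instead of
 * crashing the process.
 */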
213static void
214nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
215{
216 void *map_address;
217
218 if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) {
219 return;
220 }
221
222 assert(g_thread_mmio_ctrlr != NULL);
223
224 if (!g_thread_mmio_ctrlr->is_remapped) {
225 map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
226 PROT_READ | PROT_WRITE,
227 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
228 if (map_address == MAP_FAILED) {
229 SPDK_ERRLOG("mmap failed\n");
230 g_signal_lock = 0;
231 return;
232 }
233 memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
234 g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
235 g_thread_mmio_ctrlr->is_remapped = true;
236 }
237 g_signal_lock = 0;
238 return;
239}
240
241static void
242nvme_pcie_ctrlr_setup_signal(void)
243{
244 struct sigaction sa;
245
246 sa.sa_sigaction = nvme_sigbus_fault_sighandler;
247 sigemptyset(&sa.sa_mask);
248 sa.sa_flags = SA_SIGINFO;
249 sigaction(SIGBUS, &sa, NULL);
250}
251
252static inline struct nvme_pcie_ctrlr *
253nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
254{
255 assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
256 return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
257}
258
259static int
260_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
261{
262 struct spdk_nvme_ctrlr *ctrlr, *tmp;
263 struct spdk_uevent event;
264 struct spdk_pci_addr pci_addr;
265 union spdk_nvme_csts_register csts;
266 struct spdk_nvme_ctrlr_process *proc;
267
268 while (spdk_get_uevent(hotplug_fd, &event) > 0) {
269 if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
270 event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
271 if (event.action == SPDK_NVME_UEVENT_ADD) {
272 SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
273 event.traddr);
274 if (spdk_process_is_primary()) {
275 if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
276 nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr);
277 }
278 }
279 } else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
280 struct spdk_nvme_transport_id trid;
281
282 memset(&trid, 0, sizeof(trid));
283 trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
284 snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);
285
286 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
287 if (ctrlr == NULL) {
288 return 0;
289 }
290 SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
291 event.traddr);
292
293 nvme_ctrlr_fail(ctrlr, true);
294
295 /* get the user app to clean up and stop I/O */
296 if (probe_ctx->remove_cb) {
297 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
298 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
299 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
300 }
301 }
302 }
303 }
304
305 /* This is a work around for vfio-attached device hot remove detection. */
306 TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
307 bool do_remove = false;
308
309 if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
310 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
311
312 if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
313 do_remove = true;
314 }
315 }
316
317 /* NVMe controller BAR must be mapped in the current process before any access. */
318 proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
319 if (proc) {
320 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
321 if (csts.raw == 0xffffffffU) {
322 do_remove = true;
323 }
324 }
325
326 if (do_remove) {
327 nvme_ctrlr_fail(ctrlr, true);
328 if (probe_ctx->remove_cb) {
329 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
330 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
331 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
332 }
333 }
334 }
335 return 0;
336}
337
338static inline struct nvme_pcie_qpair *
339nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
340{
341 assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
342 return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
343}
344
345static volatile void *
346nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
347{
348 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
349
350 return (volatile void *)((uintptr_t)pctrlr->regs + offset);
351}
352
353int
354nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
355{
356 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
357
358 assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
359 g_thread_mmio_ctrlr = pctrlr;
360 spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
361 g_thread_mmio_ctrlr = NULL;
362 return 0;
363}
364
365int
366nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
367{
368 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
369
370 assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
371 g_thread_mmio_ctrlr = pctrlr;
372 spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
373 g_thread_mmio_ctrlr = NULL;
374 return 0;
375}
376
377int
378nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
379{
380 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
381
382 assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
383 assert(value != NULL);
384 g_thread_mmio_ctrlr = pctrlr;
385 *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
386 g_thread_mmio_ctrlr = NULL;
387 if (~(*value) == 0) {
388 return -1;
389 }
390
391 return 0;
392}
393
394int
395nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
396{
397 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
398
399 assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
400 assert(value != NULL);
401 g_thread_mmio_ctrlr = pctrlr;
402 *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
403 g_thread_mmio_ctrlr = NULL;
404 if (~(*value) == 0) {
405 return -1;
406 }
407
408 return 0;
409}
410
411static int
412nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
413{
414 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
415 value);
416}
417
418static int
419nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
420{
421 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
422 value);
423}
424
425static int
426nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
427{
428 return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
429 aqa->raw);
430}
431
432static int
433nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
434{
435 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
436 &cmbloc->raw);
437}
438
439static int
440nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
441{
442 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
443 &cmbsz->raw);
444}
445
446uint32_t
447nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
448{
449 /*
450 * For commands requiring more than 2 PRP entries, one PRP will be
451 * embedded in the command (prp1), and the rest of the PRP entries
452 * will be in a list pointed to by the command (prp2). This means
453 * that real max number of PRP entries we support is NVME_MAX_PRP_LIST_ENTRIES + 1, which
454 * results in a max xfer size of NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size.
455 */
456 return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
457}
458
459uint16_t
460nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
461{
462 return NVME_MAX_SGL_DESCRIPTORS;
463}
464
465static void
466nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
467{
468 int rc;
469 void *addr;
470 uint32_t bir;
471 union spdk_nvme_cmbsz_register cmbsz;
472 union spdk_nvme_cmbloc_register cmbloc;
473 uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
474 uint64_t mem_register_start, mem_register_end;
475
476 if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
477 nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
478 SPDK_ERRLOG("get registers failed\n");
479 goto exit;
480 }
481
482 if (!cmbsz.bits.sz) {
483 goto exit;
484 }
485
486 bir = cmbloc.bits.bir;
487 /* Values 0, 2, 3, 4 and 5 are valid for BAR */
488 if (bir > 5 || bir == 1) {
489 goto exit;
490 }
491
492 /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
493 unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
494 /* controller memory buffer size in Bytes */
495 size = unit_size * cmbsz.bits.sz;
496 /* controller memory buffer offset from BAR in Bytes */
497 offset = unit_size * cmbloc.bits.ofst;
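 /* For example, CMBSZ.SZU = 2 selects 1 MiB units, so CMBSZ.SZ = 16 would
  * describe a 16 MiB controller memory buffer. */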
498
499 rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
500 &bar_phys_addr, &bar_size);
501 if ((rc != 0) || addr == NULL) {
502 goto exit;
503 }
504
505 if (offset > bar_size) {
506 goto exit;
507 }
508
509 if (size > bar_size - offset) {
510 goto exit;
511 }
512
513 pctrlr->cmb_bar_virt_addr = addr;
514 pctrlr->cmb_bar_phys_addr = bar_phys_addr;
515 pctrlr->cmb_size = size;
516 pctrlr->cmb_current_offset = offset;
517 pctrlr->cmb_max_offset = offset + size;
518
519 if (!cmbsz.bits.sqs) {
520 pctrlr->ctrlr.opts.use_cmb_sqs = false;
521 }
522
523 /* If only SQS is supported use legacy mapping */
524 if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) {
525 return;
526 }
527
528 /* If CMB is less than 4MiB in size then abort CMB mapping */
529 if (pctrlr->cmb_size < (1ULL << 22)) {
530 goto exit;
531 }
532
533 mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + VALUE_2MB - 1);
534 mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size);
535 pctrlr->cmb_mem_register_addr = (void *)mem_register_start;
536 pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start;
537
538 rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
539 if (rc) {
540 SPDK_ERRLOG("spdk_mem_register() failed\n");
541 goto exit;
542 }
543 pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr);
544 pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr);
545 pctrlr->cmb_io_data_supported = true;
546
547 return;
548exit:
549 pctrlr->cmb_bar_virt_addr = NULL;
550 pctrlr->ctrlr.opts.use_cmb_sqs = false;
551 return;
552}
553
554static int
555nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
556{
557 int rc = 0;
558 union spdk_nvme_cmbloc_register cmbloc;
559 void *addr = pctrlr->cmb_bar_virt_addr;
560
561 if (addr) {
562 if (pctrlr->cmb_mem_register_addr) {
563 spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
564 }
565
566 if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
567 SPDK_ERRLOG("get_cmbloc() failed\n");
568 return -EIO;
569 }
570 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
571 }
572 return rc;
573}
574
575static int
576nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
577 uint64_t *offset)
578{
579 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
580 uint64_t round_offset;
581
582 round_offset = pctrlr->cmb_current_offset;
583 round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
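 /* This rounds up to the next multiple of "aligned", which is assumed to be a
  * power of two -- e.g. offset 0x1234 with 4 KiB alignment becomes 0x2000. */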
584
585 /* CMB may only consume part of the BAR, calculate accordingly */
586 if (round_offset + length > pctrlr->cmb_max_offset) {
587 SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
588 return -1;
589 }
590
591 *offset = round_offset;
592 pctrlr->cmb_current_offset = round_offset + length;
593
594 return 0;
595}
596
597volatile struct spdk_nvme_registers *
598nvme_pcie_ctrlr_get_registers(struct spdk_nvme_ctrlr *ctrlr)
599{
600 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
601
602 return pctrlr->regs;
603}
604
605void *
606nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
607{
608 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
609 uint64_t offset;
610
611 if (pctrlr->cmb_bar_virt_addr == NULL) {
612 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
613 return NULL;
614 }
615
616 if (!pctrlr->cmb_io_data_supported) {
617 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n");
618 return NULL;
619 }
620
621 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) {
622 SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size);
623 return NULL;
624 }
625
626 return pctrlr->cmb_bar_virt_addr + offset;
627}
628
629int
630nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
631{
632 /*
633 * Do nothing for now.
634 * TODO: Track free space so buffers may be reused.
635 */
636 SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n",
637 __func__);
638 return 0;
639}
640
641static int
642nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
643{
644 int rc;
645 void *addr;
646 uint64_t phys_addr, size;
647
648 rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
649 &phys_addr, &size);
650 pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
651 if ((pctrlr->regs == NULL) || (rc != 0)) {
652 SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
653 rc, pctrlr->regs);
654 return -1;
655 }
656
657 pctrlr->regs_size = size;
658 nvme_pcie_ctrlr_map_cmb(pctrlr);
659
660 return 0;
661}
662
663static int
664nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
665{
666 int rc = 0;
667 void *addr = (void *)pctrlr->regs;
668
669 if (pctrlr->ctrlr.is_removed) {
670 return rc;
671 }
672
673 rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
674 if (rc != 0) {
675 SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
676 return -1;
677 }
678
679 if (addr) {
680 /* NOTE: addr may have been remapped here. We're relying on DPDK to call
681 * munmap internally.
682 */
683 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
684 }
685 return rc;
686}
687
688static int
689nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
690{
691 struct nvme_pcie_qpair *pqpair;
692 int rc;
693
694 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
695 if (pqpair == NULL) {
696 return -ENOMEM;
697 }
698
699 pqpair->num_entries = NVME_ADMIN_ENTRIES;
700 pqpair->flags.delay_pcie_doorbell = 0;
701
702 ctrlr->adminq = &pqpair->qpair;
703
704 rc = nvme_qpair_init(ctrlr->adminq,
705 0, /* qpair ID */
706 ctrlr,
707 SPDK_NVME_QPRIO_URGENT,
708 NVME_ADMIN_ENTRIES);
709 if (rc != 0) {
710 return rc;
711 }
712
713 return nvme_pcie_qpair_construct(ctrlr->adminq);
714}
715
716/* This function must only be called while holding g_spdk_nvme_driver->lock */
717static int
718pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
719{
720 struct spdk_nvme_transport_id trid = {};
721 struct nvme_pcie_enum_ctx *enum_ctx = ctx;
722 struct spdk_nvme_ctrlr *ctrlr;
723 struct spdk_pci_addr pci_addr;
724
725 pci_addr = spdk_pci_device_get_addr(pci_dev);
726
727 trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
728 spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
729
730 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
731 if (!spdk_process_is_primary()) {
732 if (!ctrlr) {
733 SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
734 return -1;
735 }
736
737 return nvme_ctrlr_add_process(ctrlr, pci_dev);
738 }
739
740 /* check whether user passes the pci_addr */
741 if (enum_ctx->has_pci_addr &&
742 (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
743 return 1;
744 }
745
746 return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
747}
748
749int
750nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
751 bool direct_connect)
752{
753 struct nvme_pcie_enum_ctx enum_ctx = {};
754
755 enum_ctx.probe_ctx = probe_ctx;
756
757 if (strlen(probe_ctx->trid.traddr) != 0) {
758 if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
759 return -1;
760 }
761 enum_ctx.has_pci_addr = true;
762 }
763
764 if (hotplug_fd < 0) {
765 hotplug_fd = spdk_uevent_connect();
766 if (hotplug_fd < 0) {
767 SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
768 }
769 } else {
770 _nvme_pcie_hotplug_monitor(probe_ctx);
771 }
772
773 if (enum_ctx.has_pci_addr == false) {
774 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
775 pcie_nvme_enum_cb, &enum_ctx);
776 } else {
777 return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
778 pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
779 }
780}
781
782static int
783nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr)
784{
785 struct nvme_pcie_enum_ctx enum_ctx;
786
787 enum_ctx.probe_ctx = probe_ctx;
788 enum_ctx.has_pci_addr = true;
789 enum_ctx.pci_addr = *pci_addr;
790
791 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx);
792}
793
794struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
795 const struct spdk_nvme_ctrlr_opts *opts,
796 void *devhandle)
797{
798 struct spdk_pci_device *pci_dev = devhandle;
799 struct nvme_pcie_ctrlr *pctrlr;
800 union spdk_nvme_cap_register cap;
801 union spdk_nvme_vs_register vs;
802 uint32_t cmd_reg;
803 int rc, claim_fd;
804 struct spdk_pci_id pci_id;
805 struct spdk_pci_addr pci_addr;
806
807 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
808 SPDK_ERRLOG("could not parse pci address\n");
809 return NULL;
810 }
811
812 claim_fd = spdk_pci_device_claim(&pci_addr);
813 if (claim_fd < 0) {
814 SPDK_ERRLOG("could not claim device %s\n", trid->traddr);
815 return NULL;
816 }
817
818 pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
819 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
820 if (pctrlr == NULL) {
821 close(claim_fd);
822 SPDK_ERRLOG("could not allocate ctrlr\n");
823 return NULL;
824 }
825
826 pctrlr->is_remapped = false;
827 pctrlr->ctrlr.is_removed = false;
828 pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
829 pctrlr->devhandle = devhandle;
830 pctrlr->ctrlr.opts = *opts;
831 pctrlr->claim_fd = claim_fd;
832 memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));
833
834 rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
835 if (rc != 0) {
836 close(claim_fd);
837 spdk_free(pctrlr);
838 return NULL;
839 }
840
841 /* Enable PCI busmaster and disable INTx */
842 spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
843 cmd_reg |= 0x404;
844 spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);
845
846 if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
847 SPDK_ERRLOG("get_cap() failed\n");
848 close(claim_fd);
849 spdk_free(pctrlr);
850 return NULL;
851 }
852
853 if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
854 SPDK_ERRLOG("get_vs() failed\n");
855 close(claim_fd);
856 spdk_free(pctrlr);
857 return NULL;
858 }
859
860 nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);
861
862 /* Doorbell stride is 2 ^ (dstrd + 2),
863 * but we want multiples of 4, so drop the + 2 */
864 pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
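 /* For example, CAP.DSTRD = 0 gives a stride of 1 (doorbells every 4 bytes)
  * and DSTRD = 1 gives a stride of 2 (every 8 bytes). */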
865
866 rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
867 if (rc != 0) {
868 nvme_ctrlr_destruct(&pctrlr->ctrlr);
869 return NULL;
870 }
871
872 pci_id = spdk_pci_device_get_id(pci_dev);
873 pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);
874
875 rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
876 if (rc != 0) {
877 nvme_ctrlr_destruct(&pctrlr->ctrlr);
878 return NULL;
879 }
880
881 /* Construct the primary process properties */
882 rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
883 if (rc != 0) {
884 nvme_ctrlr_destruct(&pctrlr->ctrlr);
885 return NULL;
886 }
887
888 if (g_sigset != true) {
889 nvme_pcie_ctrlr_setup_signal();
890 g_sigset = true;
891 }
892
893 return &pctrlr->ctrlr;
894}
895
896int
897nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
898{
899 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
900 struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
901 union spdk_nvme_aqa_register aqa;
902
903 if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
904 SPDK_ERRLOG("set_asq() failed\n");
905 return -EIO;
906 }
907
908 if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
909 SPDK_ERRLOG("set_acq() failed\n");
910 return -EIO;
911 }
912
913 aqa.raw = 0;
914 /* acqs and asqs are 0-based. */
915 aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
916 aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
917
918 if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
919 SPDK_ERRLOG("set_aqa() failed\n");
920 return -EIO;
921 }
922
923 return 0;
924}
925
926int
927nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
928{
929 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
930 struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);
931
932 close(pctrlr->claim_fd);
933
934 if (ctrlr->adminq) {
935 nvme_pcie_qpair_destroy(ctrlr->adminq);
936 }
937
938 nvme_ctrlr_destruct_finish(ctrlr);
939
940 nvme_ctrlr_free_processes(ctrlr);
941
942 nvme_pcie_ctrlr_free_bars(pctrlr);
943
944 if (devhandle) {
945 spdk_pci_device_detach(devhandle);
946 }
947
948 spdk_free(pctrlr);
949
950 return 0;
951}
952
953static void
954nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
955{
956 tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
957 tr->cid = cid;
958 tr->req = NULL;
959}
960
961int
962nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
963{
964 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
965
966 pqpair->last_sq_tail = pqpair->sq_tail = pqpair->cq_head = 0;
967
968 /*
969 * First time through the completion queue, HW will set phase
970 * bit on completions to 1. So set this to 1 here, indicating
971 * we're looking for a 1 to know which entries have completed.
972 * We'll toggle the bit each time the completion queue
973 * rolls over.
974 */
975 pqpair->flags.phase = 1;
976
977 memset(pqpair->cmd, 0,
978 pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
979 memset(pqpair->cpl, 0,
980 pqpair->num_entries * sizeof(struct spdk_nvme_cpl));
981
982 return 0;
983}
984
985static int
986nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair)
987{
988 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
989 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
990 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
991 struct nvme_tracker *tr;
992 uint16_t i;
993 volatile uint32_t *doorbell_base;
994 uint64_t offset;
995 uint16_t num_trackers;
996 size_t page_align = VALUE_2MB;
997 uint32_t flags = SPDK_MALLOC_DMA;
998
999 /*
1000 * Limit the maximum number of completions to return per call to prevent wraparound,
1001 * and calculate how many trackers can be submitted at once without overflowing the
1002 * completion queue.
1003 */
1004 pqpair->max_completions_cap = pqpair->num_entries / 4;
1005 pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
1006 pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
1007 num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
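 /* For example, a 256-entry queue yields max_completions_cap = 64 (256 / 4,
  * clamped to [1, 128]) and num_trackers = 192. */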
1008
1009 SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
1010 pqpair->max_completions_cap, num_trackers);
1011
1012 assert(num_trackers != 0);
1013
1014 pqpair->sq_in_cmb = false;
1015
1016 if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
1017 flags |= SPDK_MALLOC_SHARE;
1018 }
1019
1020 /* cmd and cpl rings must be aligned on page size boundaries. */
1021 if (ctrlr->opts.use_cmb_sqs) {
1022 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
1023 sysconf(_SC_PAGESIZE), &offset) == 0) {
1024 pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
1025 pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
1026 pqpair->sq_in_cmb = true;
1027 }
1028 }
1029
1030 /* To ensure physical address contiguity we make each ring occupy
1031 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
1032 */
1033 if (pqpair->sq_in_cmb == false) {
1034 pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
1035 page_align, NULL,
1036 SPDK_ENV_SOCKET_ID_ANY, flags);
1037 if (pqpair->cmd == NULL) {
1038 SPDK_ERRLOG("alloc qpair_cmd failed\n");
1039 return -ENOMEM;
1040 }
1041
1042 pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL);
1043 if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
1044 SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
1045 return -EFAULT;
1046 }
1047 }
1048
1049 pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
1050 page_align, NULL,
1051 SPDK_ENV_SOCKET_ID_ANY, flags);
1052 if (pqpair->cpl == NULL) {
1053 SPDK_ERRLOG("alloc qpair_cpl failed\n");
1054 return -ENOMEM;
1055 }
1056
1057 pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL);
1058 if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
1059 SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
1060 return -EFAULT;
1061 }
1062
1063 doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
1064 pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
1065 pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
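 /* For example, qpair->id = 1 with doorbell_stride_u32 = 1 puts sq_tdbl at
  * doorbell_base + 2 and cq_hdbl at doorbell_base + 3 (in uint32_t units). */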
1066
1067 /*
1068 * Reserve space for all of the trackers in a single allocation.
1069 * struct nvme_tracker must be padded so that its size is already a power of 2.
1070 * This ensures the PRP list embedded in the nvme_tracker object will not span a
1071 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
1072 */
1073 pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
1074 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1075 if (pqpair->tr == NULL) {
1076 SPDK_ERRLOG("nvme_tr failed\n");
1077 return -ENOMEM;
1078 }
1079
1080 TAILQ_INIT(&pqpair->free_tr);
1081 TAILQ_INIT(&pqpair->outstanding_tr);
1082
1083 for (i = 0; i < num_trackers; i++) {
1084 tr = &pqpair->tr[i];
1085 nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL));
1086 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1087 }
1088
1089 nvme_pcie_qpair_reset(qpair);
1090
1091 return 0;
1092}
1093
1094static inline void
1095nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
1096{
1097 /* dst and src are known to be non-overlapping and 64-byte aligned. */
1098#if defined(__SSE2__)
1099 __m128i *d128 = (__m128i *)dst;
1100 const __m128i *s128 = (const __m128i *)src;
1101
1102 _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
1103 _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
1104 _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
1105 _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
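 /* Four 16-byte non-temporal (streaming) stores move the 64-byte command into
  * the submission queue entry without pulling it into the CPU cache. */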
1106#else
1107 *dst = *src;
1108#endif
1109}
1110
1111/**
1112 * Note: the ctrlr_lock must be held when calling this function.
1113 */
1114static void
1115nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
1116 struct nvme_request *req, struct spdk_nvme_cpl *cpl)
1117{
1118 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1119 struct nvme_request *active_req = req;
1120 struct spdk_nvme_ctrlr_process *active_proc;
1121
1122 /*
1123 * The admin request is from another process. Move to the per
1124 * process list for that process to handle it later.
1125 */
1126 assert(nvme_qpair_is_admin_queue(qpair));
1127 assert(active_req->pid != getpid());
1128
1129 active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid);
1130 if (active_proc) {
1131 /* Save the original completion information */
1132 memcpy(&active_req->cpl, cpl, sizeof(*cpl));
1133 STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
1134 } else {
1135 SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
1136 active_req->pid);
1137
1138 nvme_free_request(active_req);
1139 }
1140}
1141
1142/**
1143 * Note: the ctrlr_lock must be held when calling this function.
1144 */
1145static void
1146nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
1147{
1148 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1149 struct nvme_request *req, *tmp_req;
1150 pid_t pid = getpid();
1151 struct spdk_nvme_ctrlr_process *proc;
1152
1153 /*
1154 * Check whether there is any pending admin request from
1155 * other active processes.
1156 */
1157 assert(nvme_qpair_is_admin_queue(qpair));
1158
1159 proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
1160 if (!proc) {
1161 SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
1162 assert(proc);
1163 return;
1164 }
1165
1166 STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
1167 STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
1168
1169 assert(req->pid == pid);
1170
1171 nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
1172 nvme_free_request(req);
1173 }
1174}
1175
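/*
 * Note: the comparison below uses unsigned 16-bit arithmetic so it stays
 * correct across index wraparound. For example, with event_idx = 10, old = 8
 * and new_idx = 12: (12 - 10) = 2 <= (12 - 8) = 4, so the event index was just
 * crossed and an MMIO doorbell write is still required.
 */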
1176static inline int
1177nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
1178{
1179 return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
1180}
1181
1182static bool
1183nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
1184 volatile uint32_t *shadow_db,
1185 volatile uint32_t *eventidx)
1186{
1187 uint16_t old;
1188
1189 if (!shadow_db) {
1190 return true;
1191 }
1192
1193 old = *shadow_db;
1194 *shadow_db = value;
1195
1196 if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
1197 return false;
1198 }
1199
1200 return true;
1201}
1202
1203static inline void
1204nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
1205{
1206 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1207 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1208 bool need_mmio = true;
1209
1210 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
1211 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
1212 pqpair->sq_tail,
1213 pqpair->shadow_doorbell.sq_tdbl,
1214 pqpair->shadow_doorbell.sq_eventidx);
1215 }
1216
1217 if (spdk_likely(need_mmio)) {
1218 spdk_wmb();
1219 g_thread_mmio_ctrlr = pctrlr;
1220 spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
1221 g_thread_mmio_ctrlr = NULL;
1222 }
1223}
1224
1225static inline void
1226nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair)
1227{
1228 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1229 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1230 bool need_mmio = true;
1231
1232 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
1233 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
1234 pqpair->cq_head,
1235 pqpair->shadow_doorbell.cq_hdbl,
1236 pqpair->shadow_doorbell.cq_eventidx);
1237 }
1238
1239 if (spdk_likely(need_mmio)) {
1240 g_thread_mmio_ctrlr = pctrlr;
1241 spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
1242 g_thread_mmio_ctrlr = NULL;
1243 }
1244}
1245
1246static void
1247nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1248{
1249 struct nvme_request *req;
1250 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1251
1252 req = tr->req;
1253 assert(req != NULL);
1254
1255 /* Copy the command from the tracker to the submission queue. */
1256 nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
1257
1258 if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
1259 pqpair->sq_tail = 0;
1260 }
1261
1262 if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
1263 SPDK_ERRLOG("sq_tail is passing sq_head!\n");
1264 }
1265
1266 if (!pqpair->flags.delay_pcie_doorbell) {
1267 nvme_pcie_qpair_ring_sq_doorbell(qpair);
1268 }
1269}
1270
1271static void
1272nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1273 struct spdk_nvme_cpl *cpl, bool print_on_error)
1274{
1275 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1276 struct nvme_request *req;
1277 bool retry, error;
1278 bool req_from_current_proc = true;
1279
1280 req = tr->req;
1281
1282 assert(req != NULL);
1283
1284 error = spdk_nvme_cpl_is_error(cpl);
1285 retry = error && nvme_completion_is_retry(cpl) &&
1286 req->retries < spdk_nvme_retry_count;
1287
1288 if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
1289 nvme_qpair_print_command(qpair, &req->cmd);
1290 nvme_qpair_print_completion(qpair, cpl);
1291 }
1292
1293 assert(cpl->cid == req->cmd.cid);
1294
1295 if (retry) {
1296 req->retries++;
1297 nvme_pcie_qpair_submit_tracker(qpair, tr);
1298 } else {
1299 /* Only check admin requests from different processes. */
1300 if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
1301 req_from_current_proc = false;
1302 nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
1303 } else {
1304 nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
1305 }
1306
1307 if (req_from_current_proc == true) {
1308 nvme_qpair_free_request(qpair, req);
1309 }
1310
1311 tr->req = NULL;
1312
1313 TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
1314 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1315
1316 /*
1317 * If the controller is in the middle of resetting, don't
1318 * try to submit queued requests here - let the reset logic
1319 * handle that instead.
1320 */
1321 if (!STAILQ_EMPTY(&qpair->queued_req) &&
1322 !qpair->ctrlr->is_resetting) {
1323 req = STAILQ_FIRST(&qpair->queued_req);
1324 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1325 nvme_qpair_submit_request(qpair, req);
1326 }
1327 }
1328}
1329
1330static void
1331nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
1332 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
1333 bool print_on_error)
1334{
1335 struct spdk_nvme_cpl cpl;
1336
1337 memset(&cpl, 0, sizeof(cpl));
1338 cpl.sqid = qpair->id;
1339 cpl.cid = tr->cid;
1340 cpl.status.sct = sct;
1341 cpl.status.sc = sc;
1342 cpl.status.dnr = dnr;
1343 nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
1344}
1345
1346static void
1347nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1348{
1349 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1350 struct nvme_tracker *tr, *temp;
1351
1352 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
1353 if (!qpair->ctrlr->opts.disable_error_logging) {
1354 SPDK_ERRLOG("aborting outstanding command\n");
1355 }
1356 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1357 SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
1358 }
1359}
1360
1361void
1362nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
1363{
1364 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1365 struct nvme_tracker *tr;
1366
1367 tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1368 while (tr != NULL) {
1369 assert(tr->req != NULL);
1370 if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
1371 nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
1372 SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
1373 false);
1374 tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1375 } else {
1376 tr = TAILQ_NEXT(tr, tq_list);
1377 }
1378 }
1379}
1380
1381static void
1382nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
1383{
1384 nvme_pcie_admin_qpair_abort_aers(qpair);
1385}
1386
1387static int
1388nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
1389{
1390 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1391
1392 if (nvme_qpair_is_admin_queue(qpair)) {
1393 nvme_pcie_admin_qpair_destroy(qpair);
1394 }
1395 if (pqpair->cmd && !pqpair->sq_in_cmb) {
1396 spdk_free(pqpair->cmd);
1397 }
1398 if (pqpair->cpl) {
1399 spdk_free(pqpair->cpl);
1400 }
1401 if (pqpair->tr) {
1402 spdk_free(pqpair->tr);
1403 }
1404
1405 nvme_qpair_deinit(qpair);
1406
1407 spdk_free(pqpair);
1408
1409 return 0;
1410}
1411
1412void
1413nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1414{
1415 nvme_pcie_qpair_abort_trackers(qpair, dnr);
1416}
1417
1418static int
1419nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
1420 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
1421 void *cb_arg)
1422{
1423 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1424 struct nvme_request *req;
1425 struct spdk_nvme_cmd *cmd;
1426
1427 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1428 if (req == NULL) {
1429 return -ENOMEM;
1430 }
1431
1432 cmd = &req->cmd;
1433 cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
1434
1435 /*
1436 * TODO: create a create io completion queue command data
1437 * structure.
1438 */
1439 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
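 /* CDW10 packs the 0-based queue size in the upper 16 bits and the queue ID in
  * the lower 16 bits -- e.g. a 256-entry queue with ID 1 encodes as
  * (255 << 16) | 1. */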
1440 /*
1441 * 0x2 = interrupts enabled
1442 * 0x1 = physically contiguous
1443 */
1444 cmd->cdw11 = 0x1;
1445 cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
1446
1447 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1448}
1449
1450static int
1451nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
1452 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1453{
1454 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1455 struct nvme_request *req;
1456 struct spdk_nvme_cmd *cmd;
1457
1458 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1459 if (req == NULL) {
1460 return -ENOMEM;
1461 }
1462
1463 cmd = &req->cmd;
1464 cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
1465
1466 /*
1467 * TODO: create a create io submission queue command data
1468 * structure.
1469 */
1470 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
1471 /* 0x1 = physically contiguous */
1472 cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
1473 cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
1474
1475 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1476}
1477
1478static int
1479nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1480 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1481{
1482 struct nvme_request *req;
1483 struct spdk_nvme_cmd *cmd;
1484
1485 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1486 if (req == NULL) {
1487 return -ENOMEM;
1488 }
1489
1490 cmd = &req->cmd;
1491 cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
1492 cmd->cdw10 = qpair->id;
1493
1494 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1495}
1496
1497static int
1498nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1499 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1500{
1501 struct nvme_request *req;
1502 struct spdk_nvme_cmd *cmd;
1503
1504 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1505 if (req == NULL) {
1506 return -ENOMEM;
1507 }
1508
1509 cmd = &req->cmd;
1510 cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
1511 cmd->cdw10 = qpair->id;
1512
1513 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1514}
1515
1516static int
1517_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1518 uint16_t qid)
1519{
1520 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
1521 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1522 struct nvme_completion_poll_status status;
1523 int rc;
1524
1525 rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1526 if (rc != 0) {
1527 return rc;
1528 }
1529
1530 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1531 SPDK_ERRLOG("nvme_create_io_cq failed!\n");
1532 return -1;
1533 }
1534
1535 rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1536 if (rc != 0) {
1537 return rc;
1538 }
1539
1540 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1541 SPDK_ERRLOG("nvme_create_io_sq failed!\n");
1542 /* Attempt to delete the completion queue */
1543 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1544 if (rc != 0) {
1545 return -1;
1546 }
1547 spdk_nvme_wait_for_completion(ctrlr->adminq, &status);
1548 return -1;
1549 }
1550
1551 if (ctrlr->shadow_doorbell) {
1552 pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
1553 pctrlr->doorbell_stride_u32;
1554 pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
1555 pctrlr->doorbell_stride_u32;
1556 pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
1557 pctrlr->doorbell_stride_u32;
1558 pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
1559 pctrlr->doorbell_stride_u32;
1560 pqpair->flags.has_shadow_doorbell = 1;
1561 } else {
1562 pqpair->flags.has_shadow_doorbell = 0;
1563 }
1564 nvme_pcie_qpair_reset(qpair);
1565
1566 return 0;
1567}
1568
1569struct spdk_nvme_qpair *
1570nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1571 const struct spdk_nvme_io_qpair_opts *opts)
1572{
1573 struct nvme_pcie_qpair *pqpair;
1574 struct spdk_nvme_qpair *qpair;
1575 int rc;
1576
1577 assert(ctrlr != NULL);
1578
1579 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
1580 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1581 if (pqpair == NULL) {
1582 return NULL;
1583 }
1584
1585 pqpair->num_entries = opts->io_queue_size;
1586 pqpair->flags.delay_pcie_doorbell = opts->delay_pcie_doorbell;
1587
1588 qpair = &pqpair->qpair;
1589
1590 rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests);
1591 if (rc != 0) {
1592 nvme_pcie_qpair_destroy(qpair);
1593 return NULL;
1594 }
1595
1596 rc = nvme_pcie_qpair_construct(qpair);
1597 if (rc != 0) {
1598 nvme_pcie_qpair_destroy(qpair);
1599 return NULL;
1600 }
1601
1602 rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid);
1603
1604 if (rc != 0) {
1605 SPDK_ERRLOG("I/O queue creation failed\n");
1606 nvme_pcie_qpair_destroy(qpair);
1607 return NULL;
1608 }
1609
1610 return qpair;
1611}
1612
1613int
1614nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1615{
1616 if (nvme_qpair_is_admin_queue(qpair)) {
1617 return 0;
1618 } else {
1619 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
1620 }
1621}
1622
1623void
1624nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1625{
1626}
1627
1628int
1629nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1630{
1631 struct nvme_completion_poll_status status;
1632 int rc;
1633
1634 assert(ctrlr != NULL);
1635
1636 if (ctrlr->is_removed) {
1637 goto free;
1638 }
1639
1640 /* Delete the I/O submission queue */
1641 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1642 if (rc != 0) {
1643 SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
1644 return rc;
1645 }
1646 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1647 return -1;
1648 }
1649
1650 /* Delete the completion queue */
1651 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1652 if (rc != 0) {
1653 SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
1654 return rc;
1655 }
1656 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1657 return -1;
1658 }
1659
1660free:
1661 if (qpair->no_deletion_notification_needed == 0) {
1662 /* Abort the rest of the I/O */
1663 nvme_pcie_qpair_abort_trackers(qpair, 1);
1664 }
1665
1666 nvme_pcie_qpair_destroy(qpair);
1667 return 0;
1668}
1669
1670static void
1671nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1672{
1673 /*
1674 * Bad vtophys translation, so abort this request and return
1675 * immediately.
1676 */
1677 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1678 SPDK_NVME_SC_INVALID_FIELD,
1679 1 /* do not retry */, true);
1680}
1681
1682/*
1683 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
1684 *
1685 * *prp_index will be updated to account for the number of PRP entries used.
1686 */
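/*
 * For example, with 4 KiB pages an 8 KiB dword-aligned buffer starting on a
 * page boundary consumes two PRP entries -- prp1 plus prp2 pointing directly
 * at the second page -- while transfers needing three or more entries switch
 * prp2 over to the PRP list held at tr->prp_sgl_bus_addr.
 */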
1687static inline int
1688nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len,
1689 uint32_t page_size)
1690{
1691 struct spdk_nvme_cmd *cmd = &tr->req->cmd;
1692 uintptr_t page_mask = page_size - 1;
1693 uint64_t phys_addr;
1694 uint32_t i;
1695
1696 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n",
1697 *prp_index, virt_addr, (uint32_t)len);
1698
1699 if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
1700 SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1701 return -EINVAL;
1702 }
1703
1704 i = *prp_index;
1705 while (len) {
1706 uint32_t seg_len;
1707
1708 /*
1709 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
1710 * so prp_index == count is valid.
1711 */
1712 if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
1713 SPDK_ERRLOG("out of PRP entries\n");
1714 return -EINVAL;
1715 }
1716
1717 phys_addr = spdk_vtophys(virt_addr, NULL);
1718 if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
1719 SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
1720 return -EINVAL;
1721 }
1722
1723 if (i == 0) {
1724 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr);
1725 cmd->dptr.prp.prp1 = phys_addr;
1726 seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
1727 } else {
1728 if ((phys_addr & page_mask) != 0) {
1729 SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
1730 return -EINVAL;
1731 }
1732
1733 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
1734 tr->u.prp[i - 1] = phys_addr;
1735 seg_len = page_size;
1736 }
1737
1738 seg_len = spdk_min(seg_len, len);
1739 virt_addr += seg_len;
1740 len -= seg_len;
1741 i++;
1742 }
1743
1744 cmd->psdt = SPDK_NVME_PSDT_PRP;
1745 if (i <= 1) {
1746 cmd->dptr.prp.prp2 = 0;
1747 } else if (i == 2) {
1748 cmd->dptr.prp.prp2 = tr->u.prp[0];
1749 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
1750 } else {
1751 cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
1752 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
1753 }
1754
1755 *prp_index = i;
1756 return 0;
1757}
1758
1759/**
1760 * Build PRP list describing physically contiguous payload buffer.
1761 */
1762static int
1763nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1764 struct nvme_tracker *tr)
1765{
1766 uint32_t prp_index = 0;
1767 int rc;
1768
1769 rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset,
1770 req->payload_size, qpair->ctrlr->page_size);
1771 if (rc) {
1772 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1773 return rc;
1774 }
1775
1776 return 0;
1777}
1778
1779/**
1780 * Build SGL list describing scattered payload buffer.
1781 */
1782static int
1783nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1784 struct nvme_tracker *tr)
1785{
1786 int rc;
1787 void *virt_addr;
1788 uint64_t phys_addr;
1789 uint32_t remaining_transfer_len, remaining_user_sge_len, length;
1790 struct spdk_nvme_sgl_descriptor *sgl;
1791 uint32_t nseg = 0;
1792
1793 /*
1794 * Build scattered payloads.
1795 */
1796 assert(req->payload_size != 0);
11fdf7f2
TL
1797 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1798 assert(req->payload.reset_sgl_fn != NULL);
1799 assert(req->payload.next_sge_fn != NULL);
1800 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
7c673cae
FG
1801
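	/*
	 * Hardware SGL descriptors are built in the tracker's DMA-able scratch area
	 * (tr->u.sgl); its bus address (tr->prp_sgl_bus_addr) is what SGL1 points at
	 * when more than one descriptor is required.
	 */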
	sgl = tr->u.sgl;
	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
					      &virt_addr, &remaining_user_sge_len);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
		remaining_transfer_len -= remaining_user_sge_len;
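		/*
		 * A user SGE may span multiple 2 MB regions, and physical contiguity is only
		 * known within one such region, so split the SGE at each 2 MB boundary and
		 * translate each piece separately.  Pieces that turn out to be physically
		 * contiguous with the previous descriptor are merged back into it below.
		 */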
		while (remaining_user_sge_len > 0) {
			if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
				nvme_pcie_fail_request_bad_vtophys(qpair, tr);
				return -1;
			}

			phys_addr = spdk_vtophys(virt_addr, NULL);
			if (phys_addr == SPDK_VTOPHYS_ERROR) {
				nvme_pcie_fail_request_bad_vtophys(qpair, tr);
				return -1;
			}

			length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr));
			remaining_user_sge_len -= length;
			virt_addr += length;

			if (nseg > 0 && phys_addr ==
			    (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
				/* extend previous entry */
				(*(sgl - 1)).unkeyed.length += length;
				continue;
			}

			sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
			sgl->unkeyed.length = length;
			sgl->address = phys_addr;
			sgl->unkeyed.subtype = 0;

			sgl++;
			nseg++;
		}
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single SGL descriptor.
		 * Use the special case described by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all, so copy the first (and only)
		 * SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* For now we can only support 1 SGL segment in NVMe controller */
		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}

	return 0;
}

/**
 * Build PRP list describing scattered payload buffer.
 */
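/*
 * This path is used when the controller does not support NVMe SGLs
 * (SPDK_NVME_CTRLR_SGL_SUPPORTED is not set), so the scattered payload
 * must be described with a PRP list instead.
 */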
static int
nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
				       struct nvme_tracker *tr)
{
	int rc;
	void *virt_addr;
	uint32_t remaining_transfer_len, length;
	uint32_t prp_index = 0;
	uint32_t page_size = qpair->ctrlr->page_size;

	/*
	 * Build scattered payloads.
	 */
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	remaining_transfer_len = req->payload_size;
	while (remaining_transfer_len > 0) {
		assert(req->payload.next_sge_fn != NULL);
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return -1;
		}

		length = spdk_min(remaining_transfer_len, length);

		/*
		 * Any incompatible sges should have been handled up in the splitting routine,
		 * but assert here as an additional check.
		 *
		 * All SGEs except last must end on a page boundary.
		 */
		assert((length == remaining_transfer_len) ||
		       _is_page_aligned((uintptr_t)virt_addr + length, page_size));

		rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
		if (rc) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			return rc;
		}

		remaining_transfer_len -= length;
	}

	return 0;
}

int
nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int rc = 0;
	void *md_payload;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	tr = TAILQ_FIRST(&pqpair->free_tr);

	if (tr == NULL) {
		/*
		 * Put the request on the qpair's request queue to be
		 * processed when a tracker frees up via a command
		 * completion.
		 */
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		goto exit;
	}

	TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
	TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
	tr->req = req;
	tr->cb_fn = req->cb_fn;
	tr->cb_arg = req->cb_arg;
	req->cmd.cid = tr->cid;

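	/*
	 * The metadata pointer (MPTR) is a single physical address, so one vtophys
	 * translation of the metadata buffer is sufficient here.
	 */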
	if (req->payload_size && req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		tr->req->cmd.mptr = spdk_vtophys(md_payload, NULL);
		if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
			nvme_pcie_fail_request_bad_vtophys(qpair, tr);
			rc = -EINVAL;
			goto exit;
		}
	}

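	/*
	 * Pick the data pointer build strategy: no payload, a contiguous buffer (PRP),
	 * or a scattered payload (hardware SGL when the controller supports it,
	 * otherwise a PRP list).
	 */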
	if (req->payload_size == 0) {
		/* Null payload - leave PRP fields untouched */
		rc = 0;
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
		rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
		if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
			rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
		} else {
			rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
		}
	} else {
		assert(0);
		nvme_pcie_fail_request_bad_vtophys(qpair, tr);
		rc = -EINVAL;
	}

	if (rc < 0) {
		goto exit;
	}

	nvme_pcie_qpair_submit_tracker(qpair, tr);

exit:
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return rc;
}

static void
nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct nvme_tracker *tr, *tmp;
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
		assert(tr->req != NULL);

		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}

int32_t
nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_tracker *tr;
	struct spdk_nvme_cpl *cpl, *next_cpl;
	uint32_t num_completions = 0;
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	uint16_t next_cq_head;
	uint8_t next_phase;
	bool next_is_valid = false;

	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
	}

	if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
		/*
		 * max_completions == 0 means unlimited, but complete at most
		 * max_completions_cap batch of I/O at a time so that the completion
		 * queue doorbells don't wrap around.
		 */
		max_completions = pqpair->max_completions_cap;
	}

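	/*
	 * A completion entry is valid only when its phase bit matches the phase the host
	 * expects for the current pass through the queue; the expected phase flips each
	 * time the completion queue head wraps.  The next entry is also peeked at so the
	 * matching tracker can be prefetched.
	 */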
	while (1) {
		cpl = &pqpair->cpl[pqpair->cq_head];

		if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
			break;
		}

		if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
			next_cq_head = pqpair->cq_head + 1;
			next_phase = pqpair->flags.phase;
		} else {
			next_cq_head = 0;
			next_phase = !pqpair->flags.phase;
		}
		next_cpl = &pqpair->cpl[next_cq_head];
		next_is_valid = (next_cpl->status.p == next_phase);
		if (next_is_valid) {
			__builtin_prefetch(&pqpair->tr[next_cpl->cid]);
		}

#ifdef __PPC64__
		/*
		 * This memory barrier prevents reordering of:
		 * - load after store from/to tr
		 * - load after load cpl phase and cpl cid
		 */
		spdk_mb();
#elif defined(__aarch64__)
		__asm volatile("dmb oshld" ::: "memory");
#endif

		if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
			pqpair->cq_head = 0;
			pqpair->flags.phase = !pqpair->flags.phase;
		}

		tr = &pqpair->tr[cpl->cid];
		/* Prefetch the req's STAILQ_ENTRY since we'll need to access it
		 * as part of putting the req back on the qpair's free list.
		 */
		__builtin_prefetch(&tr->req->stailq);
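		/*
		 * The completion reports the controller's current submission queue head,
		 * telling the host how many submission entries have been consumed.
		 */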
		pqpair->sq_head = cpl->sqhd;

		if (tr->req) {
			nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
			nvme_qpair_print_completion(qpair, cpl);
			assert(0);
		}

		if (++num_completions == max_completions) {
			break;
		}
	}

	if (num_completions > 0) {
		nvme_pcie_qpair_ring_cq_doorbell(qpair);
	}

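	/*
	 * With doorbell delaying enabled, submission queue doorbell writes are batched
	 * and flushed here during completion processing to reduce MMIO traffic.
	 */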
	if (pqpair->flags.delay_pcie_doorbell) {
		if (pqpair->last_sq_tail != pqpair->sq_tail) {
			nvme_pcie_qpair_ring_sq_doorbell(qpair);
			pqpair->last_sq_tail = pqpair->sq_tail;
		}
	}

	if (spdk_unlikely(ctrlr->timeout_enabled)) {
		/*
		 * User registered for timeout callback
		 */
		nvme_pcie_qpair_check_timeout(qpair);
	}

	/* Before returning, complete any pending admin request. */
	if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
		nvme_pcie_qpair_complete_pending_admin_request(qpair);

		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
	}

	return num_completions;
}