ceph/src/spdk/lib/nvme/nvme_pcie.c
1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * Copyright (c) 2017, IBM Corporation.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 * * Neither the name of Intel Corporation nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35 /*
36 * NVMe over PCIe transport
37 */
38
39 #include "spdk/stdinc.h"
40 #include "spdk/env.h"
41 #include "spdk/likely.h"
42 #include "nvme_internal.h"
43 #include "nvme_uevent.h"
44
45 /*
46 * Number of completion queue entries to process before ringing the
47 * completion queue doorbell.
48 */
49 #define NVME_MIN_COMPLETIONS (1)
50 #define NVME_MAX_COMPLETIONS (128)
51
52 #define NVME_ADMIN_ENTRIES (128)
53
54 /*
55 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
56 * segment.
57 */
58 #define NVME_MAX_SGL_DESCRIPTORS (251)
59
60 #define NVME_MAX_PRP_LIST_ENTRIES (505)
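/*
 * Illustrative sizing note: these limits keep the per-command tracker at exactly
 * 4 KiB. 505 8-byte PRP entries (4040 bytes) or 251 16-byte SGL descriptors
 * (4016 bytes) fit in the union at the tail of struct nvme_tracker after its
 * fixed fields (56 bytes on typical 64-bit builds); see the SPDK_STATIC_ASSERT
 * on sizeof(struct nvme_tracker) below.
 */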
61
62 struct nvme_pcie_enum_ctx {
63 struct spdk_nvme_probe_ctx *probe_ctx;
64 struct spdk_pci_addr pci_addr;
65 bool has_pci_addr;
66 };
67
68 /* PCIe transport extensions for spdk_nvme_ctrlr */
69 struct nvme_pcie_ctrlr {
70 struct spdk_nvme_ctrlr ctrlr;
71
72 /** NVMe MMIO register space */
73 volatile struct spdk_nvme_registers *regs;
74
75 /** NVMe MMIO register size */
76 uint64_t regs_size;
77
78 /* BAR mapping address which contains controller memory buffer */
79 void *cmb_bar_virt_addr;
80
81 /* BAR physical address which contains controller memory buffer */
82 uint64_t cmb_bar_phys_addr;
83
84 /* Controller memory buffer size in Bytes */
85 uint64_t cmb_size;
86
87 /* Current offset of controller memory buffer, relative to start of BAR virt addr */
88 uint64_t cmb_current_offset;
89
90 /* Last valid offset into CMB, this differs if CMB memory registration occurs or not */
91 uint64_t cmb_max_offset;
92
93 void *cmb_mem_register_addr;
94 size_t cmb_mem_register_size;
95
96 bool cmb_io_data_supported;
97
98 /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
99 uint32_t doorbell_stride_u32;
100
101 /* Opaque handle to associated PCI device. */
102 struct spdk_pci_device *devhandle;
103
104 /* File descriptor returned from spdk_pci_device_claim(). Closed when ctrlr is detached. */
105 int claim_fd;
106
107 /* Flag to indicate the MMIO register has been remapped */
108 bool is_remapped;
109 };
110
111 struct nvme_tracker {
112 TAILQ_ENTRY(nvme_tracker) tq_list;
113
114 struct nvme_request *req;
115 uint16_t cid;
116
117 uint16_t rsvd0;
118 uint32_t rsvd1;
119
120 spdk_nvme_cmd_cb cb_fn;
121 void *cb_arg;
122
123 uint64_t prp_sgl_bus_addr;
124
125 union {
126 uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
127 struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
128 } u;
129 };
130 /*
131 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
132 * and so that there is no padding required to meet alignment requirements.
133 */
134 SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
135 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
136
137 /* PCIe transport extensions for spdk_nvme_qpair */
138 struct nvme_pcie_qpair {
139 /* Submission queue tail doorbell */
140 volatile uint32_t *sq_tdbl;
141
142 /* Completion queue head doorbell */
143 volatile uint32_t *cq_hdbl;
144
145 /* Submission queue */
146 struct spdk_nvme_cmd *cmd;
147
148 /* Completion queue */
149 struct spdk_nvme_cpl *cpl;
150
151 TAILQ_HEAD(, nvme_tracker) free_tr;
152 TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;
153
154 /* Array of trackers indexed by command ID. */
155 struct nvme_tracker *tr;
156
157 uint16_t num_entries;
158
159 uint16_t max_completions_cap;
160
161 uint16_t last_sq_tail;
162 uint16_t sq_tail;
163 uint16_t cq_head;
164 uint16_t sq_head;
165
166 struct {
167 uint8_t phase : 1;
168 uint8_t delay_pcie_doorbell : 1;
169 uint8_t has_shadow_doorbell : 1;
170 } flags;
171
172 /*
173 * Base qpair structure.
174 * This is located after the hot data in this structure so that the important parts of
175 * nvme_pcie_qpair are in the same cache line.
176 */
177 struct spdk_nvme_qpair qpair;
178
179 struct {
180 /* Submission queue shadow tail doorbell */
181 volatile uint32_t *sq_tdbl;
182
183 /* Completion queue shadow head doorbell */
184 volatile uint32_t *cq_hdbl;
185
186 /* Submission queue event index */
187 volatile uint32_t *sq_eventidx;
188
189 /* Completion queue event index */
190 volatile uint32_t *cq_eventidx;
191 } shadow_doorbell;
192
193 /*
194 * Fields below this point should not be touched on the normal I/O path.
195 */
196
197 bool sq_in_cmb;
198
199 uint64_t cmd_bus_addr;
200 uint64_t cpl_bus_addr;
201 };
202
203 static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx,
204 struct spdk_pci_addr *pci_addr);
205 static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair);
206 static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
207
208 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
209 static volatile uint16_t g_signal_lock;
210 static bool g_sigset = false;
211 static int hotplug_fd = -1;
212
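/*
 * SIGBUS handler for surprise hot remove: an MMIO access to a removed device
 * faults, so remap the register BAR to an anonymous page filled with 0xFF.
 * Subsequent register reads then return all ones, which the register accessors
 * below (and the CSTS check in the hotplug monitor) treat as "device gone".
 */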
213 static void
214 nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
215 {
216 void *map_address;
217
218 if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) {
219 return;
220 }
221
222 assert(g_thread_mmio_ctrlr != NULL);
223
224 if (!g_thread_mmio_ctrlr->is_remapped) {
225 map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
226 PROT_READ | PROT_WRITE,
227 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
228 if (map_address == MAP_FAILED) {
229 SPDK_ERRLOG("mmap failed\n");
230 g_signal_lock = 0;
231 return;
232 }
233 memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
234 g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
235 g_thread_mmio_ctrlr->is_remapped = true;
236 }
237 g_signal_lock = 0;
238 return;
239 }
240
241 static void
242 nvme_pcie_ctrlr_setup_signal(void)
243 {
244 struct sigaction sa;
245
246 sa.sa_sigaction = nvme_sigbus_fault_sighandler;
247 sigemptyset(&sa.sa_mask);
248 sa.sa_flags = SA_SIGINFO;
249 sigaction(SIGBUS, &sa, NULL);
250 }
251
252 static inline struct nvme_pcie_ctrlr *
253 nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
254 {
255 assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
256 return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
257 }
258
259 static int
260 _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
261 {
262 struct spdk_nvme_ctrlr *ctrlr, *tmp;
263 struct spdk_uevent event;
264 struct spdk_pci_addr pci_addr;
265 union spdk_nvme_csts_register csts;
266 struct spdk_nvme_ctrlr_process *proc;
267
268 while (spdk_get_uevent(hotplug_fd, &event) > 0) {
269 if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
270 event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
271 if (event.action == SPDK_NVME_UEVENT_ADD) {
272 SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
273 event.traddr);
274 if (spdk_process_is_primary()) {
275 if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
276 nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr);
277 }
278 }
279 } else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
280 struct spdk_nvme_transport_id trid;
281
282 memset(&trid, 0, sizeof(trid));
283 trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
284 snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);
285
286 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
287 if (ctrlr == NULL) {
288 return 0;
289 }
290 SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
291 event.traddr);
292
293 nvme_ctrlr_fail(ctrlr, true);
294
295 /* get the user app to clean up and stop I/O */
296 if (probe_ctx->remove_cb) {
297 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
298 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
299 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
300 }
301 }
302 }
303 }
304
305 /* This is a workaround for vfio-attached device hot-remove detection. */
306 TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
307 bool do_remove = false;
308
309 if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
310 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
311
312 if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
313 do_remove = true;
314 }
315 }
316
317 /* NVMe controller BAR must be mapped in the current process before any access. */
318 proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
319 if (proc) {
320 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
321 if (csts.raw == 0xffffffffU) {
322 do_remove = true;
323 }
324 }
325
326 if (do_remove) {
327 nvme_ctrlr_fail(ctrlr, true);
328 if (probe_ctx->remove_cb) {
329 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
330 probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
331 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
332 }
333 }
334 }
335 return 0;
336 }
337
338 static inline struct nvme_pcie_qpair *
339 nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
340 {
341 assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
342 return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
343 }
344
345 static volatile void *
346 nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
347 {
348 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
349
350 return (volatile void *)((uintptr_t)pctrlr->regs + offset);
351 }
352
353 int
354 nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
355 {
356 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
357
358 assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
359 g_thread_mmio_ctrlr = pctrlr;
360 spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
361 g_thread_mmio_ctrlr = NULL;
362 return 0;
363 }
364
365 int
366 nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
367 {
368 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
369
370 assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
371 g_thread_mmio_ctrlr = pctrlr;
372 spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
373 g_thread_mmio_ctrlr = NULL;
374 return 0;
375 }
376
377 int
378 nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
379 {
380 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
381
382 assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
383 assert(value != NULL);
384 g_thread_mmio_ctrlr = pctrlr;
385 *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
386 g_thread_mmio_ctrlr = NULL;
387 if (~(*value) == 0) {
388 return -1;
389 }
390
391 return 0;
392 }
393
394 int
395 nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
396 {
397 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
398
399 assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
400 assert(value != NULL);
401 g_thread_mmio_ctrlr = pctrlr;
402 *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
403 g_thread_mmio_ctrlr = NULL;
404 if (~(*value) == 0) {
405 return -1;
406 }
407
408 return 0;
409 }
410
411 static int
412 nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
413 {
414 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
415 value);
416 }
417
418 static int
419 nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
420 {
421 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
422 value);
423 }
424
425 static int
426 nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
427 {
428 return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
429 aqa->raw);
430 }
431
432 static int
433 nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
434 {
435 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
436 &cmbloc->raw);
437 }
438
439 static int
440 nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
441 {
442 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
443 &cmbsz->raw);
444 }
445
446 uint32_t
447 nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
448 {
449 /*
450 * For commands requiring more than 2 PRP entries, one PRP will be
451 * embedded in the command (prp1), and the rest of the PRP entries
452 * will be in a list pointed to by the command (prp2). This means
453 *  that the real max number of PRP entries we support is 505+1, which
454 *  results in a max xfer size of 505*ctrlr->page_size.
455 */
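/* For example, with a typical 4 KiB controller page size this caps a single
 * command at 505 * 4 KiB, i.e. roughly 1.97 MiB. */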
456 return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
457 }
458
459 uint16_t
460 nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
461 {
462 return NVME_MAX_SGL_DESCRIPTORS;
463 }
464
465 static void
466 nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
467 {
468 int rc;
469 void *addr;
470 uint32_t bir;
471 union spdk_nvme_cmbsz_register cmbsz;
472 union spdk_nvme_cmbloc_register cmbloc;
473 uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
474 uint64_t mem_register_start, mem_register_end;
475
476 if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
477 nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
478 SPDK_ERRLOG("get registers failed\n");
479 goto exit;
480 }
481
482 if (!cmbsz.bits.sz) {
483 goto exit;
484 }
485
486 bir = cmbloc.bits.bir;
487 /* Values 0 and 2-5 are valid for the BAR indicator (BIR) */
488 if (bir > 5 || bir == 1) {
489 goto exit;
490 }
491
492 /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
493 unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
494 /* controller memory buffer size in Bytes */
495 size = unit_size * cmbsz.bits.sz;
496 /* controller memory buffer offset from BAR in Bytes */
497 offset = unit_size * cmbloc.bits.ofst;
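/* For example, CMBSZ.SZU = 2 selects 1 MiB units, so CMBSZ.SZ = 16 gives a
 * 16 MiB CMB and CMBLOC.OFST = 4 places it 4 MiB into the BAR. */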
498
499 rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
500 &bar_phys_addr, &bar_size);
501 if ((rc != 0) || addr == NULL) {
502 goto exit;
503 }
504
505 if (offset > bar_size) {
506 goto exit;
507 }
508
509 if (size > bar_size - offset) {
510 goto exit;
511 }
512
513 pctrlr->cmb_bar_virt_addr = addr;
514 pctrlr->cmb_bar_phys_addr = bar_phys_addr;
515 pctrlr->cmb_size = size;
516 pctrlr->cmb_current_offset = offset;
517 pctrlr->cmb_max_offset = offset + size;
518
519 if (!cmbsz.bits.sqs) {
520 pctrlr->ctrlr.opts.use_cmb_sqs = false;
521 }
522
523 /* If only SQS is supported use legacy mapping */
524 if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) {
525 return;
526 }
527
528 /* If CMB is less than 4MiB in size then abort CMB mapping */
529 if (pctrlr->cmb_size < (1ULL << 22)) {
530 goto exit;
531 }
532
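/* The registration below is done on 2 MiB (hugepage) boundaries: round the start
 * of the CMB data region up and its end down before handing it to
 * spdk_mem_register(), then narrow the usable offsets to that window. */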
533 mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + VALUE_2MB - 1);
534 mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size);
535 pctrlr->cmb_mem_register_addr = (void *)mem_register_start;
536 pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start;
537
538 rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
539 if (rc) {
540 SPDK_ERRLOG("spdk_mem_register() failed\n");
541 goto exit;
542 }
543 pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr);
544 pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr);
545 pctrlr->cmb_io_data_supported = true;
546
547 return;
548 exit:
549 pctrlr->cmb_bar_virt_addr = NULL;
550 pctrlr->ctrlr.opts.use_cmb_sqs = false;
551 return;
552 }
553
554 static int
555 nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
556 {
557 int rc = 0;
558 union spdk_nvme_cmbloc_register cmbloc;
559 void *addr = pctrlr->cmb_bar_virt_addr;
560
561 if (addr) {
562 if (pctrlr->cmb_mem_register_addr) {
563 spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
564 }
565
566 if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
567 SPDK_ERRLOG("get_cmbloc() failed\n");
568 return -EIO;
569 }
570 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
571 }
572 return rc;
573 }
574
575 static int
576 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
577 uint64_t *offset)
578 {
579 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
580 uint64_t round_offset;
581
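/* Round the current CMB offset up to the requested alignment; this assumes
 * "aligned" is a power of two (current callers pass 4 or the host page size). */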
582 round_offset = pctrlr->cmb_current_offset;
583 round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
584
585 /* CMB may only consume part of the BAR, calculate accordingly */
586 if (round_offset + length > pctrlr->cmb_max_offset) {
587 SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
588 return -1;
589 }
590
591 *offset = round_offset;
592 pctrlr->cmb_current_offset = round_offset + length;
593
594 return 0;
595 }
596
597 volatile struct spdk_nvme_registers *
598 nvme_pcie_ctrlr_get_registers(struct spdk_nvme_ctrlr *ctrlr)
599 {
600 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
601
602 return pctrlr->regs;
603 }
604
605 void *
606 nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
607 {
608 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
609 uint64_t offset;
610
611 if (pctrlr->cmb_bar_virt_addr == NULL) {
612 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
613 return NULL;
614 }
615
616 if (!pctrlr->cmb_io_data_supported) {
617 SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n");
618 return NULL;
619 }
620
621 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) {
622 SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size);
623 return NULL;
624 }
625
626 return pctrlr->cmb_bar_virt_addr + offset;
627 }
628
629 int
630 nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
631 {
632 /*
633 * Do nothing for now.
634 * TODO: Track free space so buffers may be reused.
635 */
636 SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n",
637 __func__);
638 return 0;
639 }
640
641 static int
642 nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
643 {
644 int rc;
645 void *addr;
646 uint64_t phys_addr, size;
647
648 rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
649 &phys_addr, &size);
650 pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
651 if ((pctrlr->regs == NULL) || (rc != 0)) {
652 SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
653 rc, pctrlr->regs);
654 return -1;
655 }
656
657 pctrlr->regs_size = size;
658 nvme_pcie_ctrlr_map_cmb(pctrlr);
659
660 return 0;
661 }
662
663 static int
664 nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
665 {
666 int rc = 0;
667 void *addr = (void *)pctrlr->regs;
668
669 if (pctrlr->ctrlr.is_removed) {
670 return rc;
671 }
672
673 rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
674 if (rc != 0) {
675 SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
676 return -1;
677 }
678
679 if (addr) {
680 /* NOTE: addr may have been remapped here. We're relying on DPDK to call
681 * munmap internally.
682 */
683 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
684 }
685 return rc;
686 }
687
688 static int
689 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
690 {
691 struct nvme_pcie_qpair *pqpair;
692 int rc;
693
694 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
695 if (pqpair == NULL) {
696 return -ENOMEM;
697 }
698
699 pqpair->num_entries = NVME_ADMIN_ENTRIES;
700 pqpair->flags.delay_pcie_doorbell = 0;
701
702 ctrlr->adminq = &pqpair->qpair;
703
704 rc = nvme_qpair_init(ctrlr->adminq,
705 0, /* qpair ID */
706 ctrlr,
707 SPDK_NVME_QPRIO_URGENT,
708 NVME_ADMIN_ENTRIES);
709 if (rc != 0) {
710 return rc;
711 }
712
713 return nvme_pcie_qpair_construct(ctrlr->adminq);
714 }
715
716 /* This function must only be called while holding g_spdk_nvme_driver->lock */
717 static int
718 pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
719 {
720 struct spdk_nvme_transport_id trid = {};
721 struct nvme_pcie_enum_ctx *enum_ctx = ctx;
722 struct spdk_nvme_ctrlr *ctrlr;
723 struct spdk_pci_addr pci_addr;
724
725 pci_addr = spdk_pci_device_get_addr(pci_dev);
726
727 trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
728 spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
729
730 ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
731 if (!spdk_process_is_primary()) {
732 if (!ctrlr) {
733 SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
734 return -1;
735 }
736
737 return nvme_ctrlr_add_process(ctrlr, pci_dev);
738 }
739
740 /* Check whether the user supplied a specific PCI address to probe */
741 if (enum_ctx->has_pci_addr &&
742 (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
743 return 1;
744 }
745
746 return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
747 }
748
749 int
750 nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
751 bool direct_connect)
752 {
753 struct nvme_pcie_enum_ctx enum_ctx = {};
754
755 enum_ctx.probe_ctx = probe_ctx;
756
757 if (strlen(probe_ctx->trid.traddr) != 0) {
758 if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
759 return -1;
760 }
761 enum_ctx.has_pci_addr = true;
762 }
763
764 if (hotplug_fd < 0) {
765 hotplug_fd = spdk_uevent_connect();
766 if (hotplug_fd < 0) {
767 SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
768 }
769 } else {
770 _nvme_pcie_hotplug_monitor(probe_ctx);
771 }
772
773 if (enum_ctx.has_pci_addr == false) {
774 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
775 pcie_nvme_enum_cb, &enum_ctx);
776 } else {
777 return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
778 pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
779 }
780 }
781
782 static int
783 nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr)
784 {
785 struct nvme_pcie_enum_ctx enum_ctx;
786
787 enum_ctx.probe_ctx = probe_ctx;
788 enum_ctx.has_pci_addr = true;
789 enum_ctx.pci_addr = *pci_addr;
790
791 return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx);
792 }
793
794 struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
795 const struct spdk_nvme_ctrlr_opts *opts,
796 void *devhandle)
797 {
798 struct spdk_pci_device *pci_dev = devhandle;
799 struct nvme_pcie_ctrlr *pctrlr;
800 union spdk_nvme_cap_register cap;
801 union spdk_nvme_vs_register vs;
802 uint32_t cmd_reg;
803 int rc, claim_fd;
804 struct spdk_pci_id pci_id;
805 struct spdk_pci_addr pci_addr;
806
807 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
808 SPDK_ERRLOG("could not parse pci address\n");
809 return NULL;
810 }
811
812 claim_fd = spdk_pci_device_claim(&pci_addr);
813 if (claim_fd < 0) {
814 SPDK_ERRLOG("could not claim device %s\n", trid->traddr);
815 return NULL;
816 }
817
818 pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
819 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
820 if (pctrlr == NULL) {
821 close(claim_fd);
822 SPDK_ERRLOG("could not allocate ctrlr\n");
823 return NULL;
824 }
825
826 pctrlr->is_remapped = false;
827 pctrlr->ctrlr.is_removed = false;
828 pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
829 pctrlr->devhandle = devhandle;
830 pctrlr->ctrlr.opts = *opts;
831 pctrlr->claim_fd = claim_fd;
832 memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));
833
834 rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
835 if (rc != 0) {
836 close(claim_fd);
837 spdk_free(pctrlr);
838 return NULL;
839 }
840
841 /* Enable PCI busmaster and disable INTx */
842 spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
843 cmd_reg |= 0x404;
844 spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);
845
846 if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
847 SPDK_ERRLOG("get_cap() failed\n");
848 close(claim_fd);
849 spdk_free(pctrlr);
850 return NULL;
851 }
852
853 if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
854 SPDK_ERRLOG("get_vs() failed\n");
855 close(claim_fd);
856 spdk_free(pctrlr);
857 return NULL;
858 }
859
860 nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);
861
862 /* Doorbell stride is 2 ^ (dstrd + 2) bytes, but the doorbell registers are
863 * addressed as uint32_t (4-byte units), so drop the + 2 from the exponent. */
864 pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
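/* For example, CAP.DSTRD = 0 packs the doorbells every 4 bytes, so
 * doorbell_stride_u32 = 1 and qpair N's SQ tail doorbell sits at
 * &regs->doorbell[0].sq_tdbl + 2 * N (see nvme_pcie_qpair_construct()). */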
865
866 rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
867 if (rc != 0) {
868 nvme_ctrlr_destruct(&pctrlr->ctrlr);
869 return NULL;
870 }
871
872 pci_id = spdk_pci_device_get_id(pci_dev);
873 pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);
874
875 rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
876 if (rc != 0) {
877 nvme_ctrlr_destruct(&pctrlr->ctrlr);
878 return NULL;
879 }
880
881 /* Construct the primary process properties */
882 rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
883 if (rc != 0) {
884 nvme_ctrlr_destruct(&pctrlr->ctrlr);
885 return NULL;
886 }
887
888 if (g_sigset != true) {
889 nvme_pcie_ctrlr_setup_signal();
890 g_sigset = true;
891 }
892
893 return &pctrlr->ctrlr;
894 }
895
896 int
897 nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
898 {
899 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
900 struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
901 union spdk_nvme_aqa_register aqa;
902
903 if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
904 SPDK_ERRLOG("set_asq() failed\n");
905 return -EIO;
906 }
907
908 if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
909 SPDK_ERRLOG("set_acq() failed\n");
910 return -EIO;
911 }
912
913 aqa.raw = 0;
914 /* acqs and asqs are 0-based. */
915 aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
916 aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
917
918 if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
919 SPDK_ERRLOG("set_aqa() failed\n");
920 return -EIO;
921 }
922
923 return 0;
924 }
925
926 int
927 nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
928 {
929 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
930 struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);
931
932 close(pctrlr->claim_fd);
933
934 if (ctrlr->adminq) {
935 nvme_pcie_qpair_destroy(ctrlr->adminq);
936 }
937
938 nvme_ctrlr_destruct_finish(ctrlr);
939
940 nvme_ctrlr_free_processes(ctrlr);
941
942 nvme_pcie_ctrlr_free_bars(pctrlr);
943
944 if (devhandle) {
945 spdk_pci_device_detach(devhandle);
946 }
947
948 spdk_free(pctrlr);
949
950 return 0;
951 }
952
953 static void
954 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
955 {
956 tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
957 tr->cid = cid;
958 tr->req = NULL;
959 }
960
961 int
962 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
963 {
964 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
965
966 pqpair->last_sq_tail = pqpair->sq_tail = pqpair->cq_head = 0;
967
968 /*
969 * On the first pass through the completion queue, the hardware sets
970 *  the phase bit of each new completion entry to 1.  Start with an
971 *  expected phase of 1 so we know which entries have completed.
972 *  The expected phase is toggled each time the completion queue
973 *  wraps around.
974 */
975 pqpair->flags.phase = 1;
976
977 memset(pqpair->cmd, 0,
978 pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
979 memset(pqpair->cpl, 0,
980 pqpair->num_entries * sizeof(struct spdk_nvme_cpl));
981
982 return 0;
983 }
984
985 static int
986 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair)
987 {
988 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
989 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
990 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
991 struct nvme_tracker *tr;
992 uint16_t i;
993 volatile uint32_t *doorbell_base;
994 uint64_t offset;
995 uint16_t num_trackers;
996 size_t page_align = VALUE_2MB;
997 uint32_t flags = SPDK_MALLOC_DMA;
998
999 /*
1000 * Limit the maximum number of completions to return per call to prevent wraparound,
1001 * and calculate how many trackers can be submitted at once without overflowing the
1002 * completion queue.
1003 */
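/* For example, a 256-entry I/O queue ends up with max_completions_cap = 64
 * and num_trackers = 192 after the clamping below. */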
1004 pqpair->max_completions_cap = pqpair->num_entries / 4;
1005 pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
1006 pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
1007 num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
1008
1009 SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
1010 pqpair->max_completions_cap, num_trackers);
1011
1012 assert(num_trackers != 0);
1013
1014 pqpair->sq_in_cmb = false;
1015
1016 if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
1017 flags |= SPDK_MALLOC_SHARE;
1018 }
1019
1020 /* cmd and cpl rings must be aligned on page size boundaries. */
1021 if (ctrlr->opts.use_cmb_sqs) {
1022 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
1023 sysconf(_SC_PAGESIZE), &offset) == 0) {
1024 pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
1025 pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
1026 pqpair->sq_in_cmb = true;
1027 }
1028 }
1029
1030 /* To ensure physical address contiguity we make each ring occupy
1031 * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
1032 */
1033 if (pqpair->sq_in_cmb == false) {
1034 pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
1035 page_align, NULL,
1036 SPDK_ENV_SOCKET_ID_ANY, flags);
1037 if (pqpair->cmd == NULL) {
1038 SPDK_ERRLOG("alloc qpair_cmd failed\n");
1039 return -ENOMEM;
1040 }
1041
1042 pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL);
1043 if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
1044 SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
1045 return -EFAULT;
1046 }
1047 }
1048
1049 pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
1050 page_align, NULL,
1051 SPDK_ENV_SOCKET_ID_ANY, flags);
1052 if (pqpair->cpl == NULL) {
1053 SPDK_ERRLOG("alloc qpair_cpl failed\n");
1054 return -ENOMEM;
1055 }
1056
1057 pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL);
1058 if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
1059 SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
1060 return -EFAULT;
1061 }
1062
1063 doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
1064 pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
1065 pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
1066
1067 /*
1068 * Reserve space for all of the trackers in a single allocation.
1069 * struct nvme_tracker must be padded so that its size is already a power of 2.
1070 * This ensures the PRP list embedded in the nvme_tracker object will not span a
1071 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
1072 */
1073 pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
1074 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1075 if (pqpair->tr == NULL) {
1076 SPDK_ERRLOG("nvme_tr failed\n");
1077 return -ENOMEM;
1078 }
1079
1080 TAILQ_INIT(&pqpair->free_tr);
1081 TAILQ_INIT(&pqpair->outstanding_tr);
1082
1083 for (i = 0; i < num_trackers; i++) {
1084 tr = &pqpair->tr[i];
1085 nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL));
1086 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1087 }
1088
1089 nvme_pcie_qpair_reset(qpair);
1090
1091 return 0;
1092 }
1093
1094 static inline void
1095 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
1096 {
1097 /* dst and src are known to be non-overlapping and 64-byte aligned. */
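/* The SSE2 path issues four 16-byte non-temporal stores to emit the 64-byte
 * command; the SQ entry is written once and then only read by the device, so
 * bypassing the cache avoids pulling it into the cache hierarchy. */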
1098 #if defined(__SSE2__)
1099 __m128i *d128 = (__m128i *)dst;
1100 const __m128i *s128 = (const __m128i *)src;
1101
1102 _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
1103 _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
1104 _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
1105 _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
1106 #else
1107 *dst = *src;
1108 #endif
1109 }
1110
1111 /**
1112 * Note: the ctrlr_lock must be held when calling this function.
1113 */
1114 static void
1115 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
1116 struct nvme_request *req, struct spdk_nvme_cpl *cpl)
1117 {
1118 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1119 struct nvme_request *active_req = req;
1120 struct spdk_nvme_ctrlr_process *active_proc;
1121
1122 /*
1123 * The admin request is from another process. Move to the per
1124 * process list for that process to handle it later.
1125 */
1126 assert(nvme_qpair_is_admin_queue(qpair));
1127 assert(active_req->pid != getpid());
1128
1129 active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid);
1130 if (active_proc) {
1131 /* Save the original completion information */
1132 memcpy(&active_req->cpl, cpl, sizeof(*cpl));
1133 STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
1134 } else {
1135 SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
1136 active_req->pid);
1137
1138 nvme_free_request(active_req);
1139 }
1140 }
1141
1142 /**
1143 * Note: the ctrlr_lock must be held when calling this function.
1144 */
1145 static void
1146 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
1147 {
1148 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1149 struct nvme_request *req, *tmp_req;
1150 pid_t pid = getpid();
1151 struct spdk_nvme_ctrlr_process *proc;
1152
1153 /*
1154 * Check whether there is any pending admin request from
1155 * other active processes.
1156 */
1157 assert(nvme_qpair_is_admin_queue(qpair));
1158
1159 proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
1160 if (!proc) {
1161 SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
1162 assert(proc);
1163 return;
1164 }
1165
1166 STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
1167 STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
1168
1169 assert(req->pid == pid);
1170
1171 nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
1172 nvme_free_request(req);
1173 }
1174 }
1175
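/*
 * Shadow doorbell helper: using wrap-safe 16-bit arithmetic, report whether the
 * newly written value has moved past the event index the controller asked to be
 * notified at, i.e. whether event_idx lies in the interval (old, new_idx]. Only
 * then does the real MMIO doorbell need to be rung (the same scheme virtio uses
 * for its event index).
 */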
1176 static inline int
1177 nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
1178 {
1179 return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
1180 }
1181
1182 static bool
1183 nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
1184 volatile uint32_t *shadow_db,
1185 volatile uint32_t *eventidx)
1186 {
1187 uint16_t old;
1188
1189 if (!shadow_db) {
1190 return true;
1191 }
1192
1193 old = *shadow_db;
1194 *shadow_db = value;
1195
1196 if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
1197 return false;
1198 }
1199
1200 return true;
1201 }
1202
1203 static inline void
1204 nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
1205 {
1206 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1207 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1208 bool need_mmio = true;
1209
1210 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
1211 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
1212 pqpair->sq_tail,
1213 pqpair->shadow_doorbell.sq_tdbl,
1214 pqpair->shadow_doorbell.sq_eventidx);
1215 }
1216
1217 if (spdk_likely(need_mmio)) {
1218 spdk_wmb();
1219 g_thread_mmio_ctrlr = pctrlr;
1220 spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
1221 g_thread_mmio_ctrlr = NULL;
1222 }
1223 }
1224
1225 static inline void
1226 nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair)
1227 {
1228 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1229 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1230 bool need_mmio = true;
1231
1232 if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
1233 need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
1234 pqpair->cq_head,
1235 pqpair->shadow_doorbell.cq_hdbl,
1236 pqpair->shadow_doorbell.cq_eventidx);
1237 }
1238
1239 if (spdk_likely(need_mmio)) {
1240 g_thread_mmio_ctrlr = pctrlr;
1241 spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
1242 g_thread_mmio_ctrlr = NULL;
1243 }
1244 }
1245
1246 static void
1247 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1248 {
1249 struct nvme_request *req;
1250 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1251
1252 req = tr->req;
1253 assert(req != NULL);
1254
1255 /* Copy the command from the tracker to the submission queue. */
1256 nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
1257
1258 if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
1259 pqpair->sq_tail = 0;
1260 }
1261
1262 if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
1263 SPDK_ERRLOG("sq_tail is passing sq_head!\n");
1264 }
1265
1266 if (!pqpair->flags.delay_pcie_doorbell) {
1267 nvme_pcie_qpair_ring_sq_doorbell(qpair);
1268 }
1269 }
1270
1271 static void
1272 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1273 struct spdk_nvme_cpl *cpl, bool print_on_error)
1274 {
1275 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1276 struct nvme_request *req;
1277 bool retry, error;
1278 bool req_from_current_proc = true;
1279
1280 req = tr->req;
1281
1282 assert(req != NULL);
1283
1284 error = spdk_nvme_cpl_is_error(cpl);
1285 retry = error && nvme_completion_is_retry(cpl) &&
1286 req->retries < spdk_nvme_retry_count;
1287
1288 if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
1289 nvme_qpair_print_command(qpair, &req->cmd);
1290 nvme_qpair_print_completion(qpair, cpl);
1291 }
1292
1293 assert(cpl->cid == req->cmd.cid);
1294
1295 if (retry) {
1296 req->retries++;
1297 nvme_pcie_qpair_submit_tracker(qpair, tr);
1298 } else {
1299 /* Only check admin requests from different processes. */
1300 if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
1301 req_from_current_proc = false;
1302 nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
1303 } else {
1304 nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
1305 }
1306
1307 if (req_from_current_proc == true) {
1308 nvme_qpair_free_request(qpair, req);
1309 }
1310
1311 tr->req = NULL;
1312
1313 TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
1314 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1315
1316 /*
1317 * If the controller is in the middle of resetting, don't
1318 * try to submit queued requests here - let the reset logic
1319 * handle that instead.
1320 */
1321 if (!STAILQ_EMPTY(&qpair->queued_req) &&
1322 !qpair->ctrlr->is_resetting) {
1323 req = STAILQ_FIRST(&qpair->queued_req);
1324 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1325 nvme_qpair_submit_request(qpair, req);
1326 }
1327 }
1328 }
1329
1330 static void
1331 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
1332 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
1333 bool print_on_error)
1334 {
1335 struct spdk_nvme_cpl cpl;
1336
1337 memset(&cpl, 0, sizeof(cpl));
1338 cpl.sqid = qpair->id;
1339 cpl.cid = tr->cid;
1340 cpl.status.sct = sct;
1341 cpl.status.sc = sc;
1342 cpl.status.dnr = dnr;
1343 nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
1344 }
1345
1346 static void
1347 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1348 {
1349 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1350 struct nvme_tracker *tr, *temp;
1351
1352 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
1353 if (!qpair->ctrlr->opts.disable_error_logging) {
1354 SPDK_ERRLOG("aborting outstanding command\n");
1355 }
1356 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1357 SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
1358 }
1359 }
1360
1361 void
1362 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
1363 {
1364 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1365 struct nvme_tracker *tr;
1366
1367 tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1368 while (tr != NULL) {
1369 assert(tr->req != NULL);
1370 if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
1371 nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
1372 SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
1373 false);
1374 tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1375 } else {
1376 tr = TAILQ_NEXT(tr, tq_list);
1377 }
1378 }
1379 }
1380
1381 static void
1382 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
1383 {
1384 nvme_pcie_admin_qpair_abort_aers(qpair);
1385 }
1386
1387 static int
1388 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
1389 {
1390 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1391
1392 if (nvme_qpair_is_admin_queue(qpair)) {
1393 nvme_pcie_admin_qpair_destroy(qpair);
1394 }
1395 if (pqpair->cmd && !pqpair->sq_in_cmb) {
1396 spdk_free(pqpair->cmd);
1397 }
1398 if (pqpair->cpl) {
1399 spdk_free(pqpair->cpl);
1400 }
1401 if (pqpair->tr) {
1402 spdk_free(pqpair->tr);
1403 }
1404
1405 nvme_qpair_deinit(qpair);
1406
1407 spdk_free(pqpair);
1408
1409 return 0;
1410 }
1411
1412 void
1413 nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1414 {
1415 nvme_pcie_qpair_abort_trackers(qpair, dnr);
1416 }
1417
1418 static int
1419 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
1420 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
1421 void *cb_arg)
1422 {
1423 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1424 struct nvme_request *req;
1425 struct spdk_nvme_cmd *cmd;
1426
1427 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1428 if (req == NULL) {
1429 return -ENOMEM;
1430 }
1431
1432 cmd = &req->cmd;
1433 cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
1434
1435 /*
1436 * TODO: create a create io completion queue command data
1437 * structure.
1438 */
1439 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
1440 /*
1441 * 0x2 = interrupts enabled, 0x1 = physically contiguous.
1442 * Only the PC bit is set: SPDK polls for completions, so CQ interrupts stay disabled.
1443 */
1444 cmd->cdw11 = 0x1;
1445 cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
1446
1447 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1448 }
1449
1450 static int
1451 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
1452 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1453 {
1454 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1455 struct nvme_request *req;
1456 struct spdk_nvme_cmd *cmd;
1457
1458 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1459 if (req == NULL) {
1460 return -ENOMEM;
1461 }
1462
1463 cmd = &req->cmd;
1464 cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
1465
1466 /*
1467 * TODO: create a create io submission queue command data
1468 * structure.
1469 */
1470 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
1471 /* 0x1 = physically contiguous */
1472 cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
1473 cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
1474
1475 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1476 }
1477
1478 static int
1479 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1480 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1481 {
1482 struct nvme_request *req;
1483 struct spdk_nvme_cmd *cmd;
1484
1485 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1486 if (req == NULL) {
1487 return -ENOMEM;
1488 }
1489
1490 cmd = &req->cmd;
1491 cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
1492 cmd->cdw10 = qpair->id;
1493
1494 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1495 }
1496
1497 static int
1498 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1499 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1500 {
1501 struct nvme_request *req;
1502 struct spdk_nvme_cmd *cmd;
1503
1504 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1505 if (req == NULL) {
1506 return -ENOMEM;
1507 }
1508
1509 cmd = &req->cmd;
1510 cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
1511 cmd->cdw10 = qpair->id;
1512
1513 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1514 }
1515
1516 static int
1517 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1518 uint16_t qid)
1519 {
1520 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
1521 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1522 struct nvme_completion_poll_status status;
1523 int rc;
1524
1525 rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1526 if (rc != 0) {
1527 return rc;
1528 }
1529
1530 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1531 SPDK_ERRLOG("nvme_create_io_cq failed!\n");
1532 return -1;
1533 }
1534
1535 rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1536 if (rc != 0) {
1537 return rc;
1538 }
1539
1540 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1541 SPDK_ERRLOG("nvme_create_io_sq failed!\n");
1542 /* Attempt to delete the completion queue */
1543 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1544 if (rc != 0) {
1545 return -1;
1546 }
1547 spdk_nvme_wait_for_completion(ctrlr->adminq, &status);
1548 return -1;
1549 }
1550
1551 if (ctrlr->shadow_doorbell) {
1552 pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
1553 pctrlr->doorbell_stride_u32;
1554 pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
1555 pctrlr->doorbell_stride_u32;
1556 pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
1557 pctrlr->doorbell_stride_u32;
1558 pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
1559 pctrlr->doorbell_stride_u32;
1560 pqpair->flags.has_shadow_doorbell = 1;
1561 } else {
1562 pqpair->flags.has_shadow_doorbell = 0;
1563 }
1564 nvme_pcie_qpair_reset(qpair);
1565
1566 return 0;
1567 }
1568
1569 struct spdk_nvme_qpair *
1570 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1571 const struct spdk_nvme_io_qpair_opts *opts)
1572 {
1573 struct nvme_pcie_qpair *pqpair;
1574 struct spdk_nvme_qpair *qpair;
1575 int rc;
1576
1577 assert(ctrlr != NULL);
1578
1579 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
1580 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
1581 if (pqpair == NULL) {
1582 return NULL;
1583 }
1584
1585 pqpair->num_entries = opts->io_queue_size;
1586 pqpair->flags.delay_pcie_doorbell = opts->delay_pcie_doorbell;
1587
1588 qpair = &pqpair->qpair;
1589
1590 rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests);
1591 if (rc != 0) {
1592 nvme_pcie_qpair_destroy(qpair);
1593 return NULL;
1594 }
1595
1596 rc = nvme_pcie_qpair_construct(qpair);
1597 if (rc != 0) {
1598 nvme_pcie_qpair_destroy(qpair);
1599 return NULL;
1600 }
1601
1602 rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid);
1603
1604 if (rc != 0) {
1605 SPDK_ERRLOG("I/O queue creation failed\n");
1606 nvme_pcie_qpair_destroy(qpair);
1607 return NULL;
1608 }
1609
1610 return qpair;
1611 }
1612
1613 int
1614 nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1615 {
1616 if (nvme_qpair_is_admin_queue(qpair)) {
1617 return 0;
1618 } else {
1619 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
1620 }
1621 }
1622
1623 void
1624 nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1625 {
1626 }
1627
1628 int
1629 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1630 {
1631 struct nvme_completion_poll_status status;
1632 int rc;
1633
1634 assert(ctrlr != NULL);
1635
1636 if (ctrlr->is_removed) {
1637 goto free;
1638 }
1639
1640 /* Delete the I/O submission queue */
1641 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1642 if (rc != 0) {
1643 SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
1644 return rc;
1645 }
1646 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1647 return -1;
1648 }
1649
1650 /* Delete the completion queue */
1651 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1652 if (rc != 0) {
1653 SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
1654 return rc;
1655 }
1656 if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
1657 return -1;
1658 }
1659
1660 free:
1661 if (qpair->no_deletion_notification_needed == 0) {
1662 /* Abort the rest of the I/O */
1663 nvme_pcie_qpair_abort_trackers(qpair, 1);
1664 }
1665
1666 nvme_pcie_qpair_destroy(qpair);
1667 return 0;
1668 }
1669
1670 static void
1671 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1672 {
1673 /*
1674 * Bad vtophys translation, so abort this request and return
1675 * immediately.
1676 */
1677 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1678 SPDK_NVME_SC_INVALID_FIELD,
1679 1 /* do not retry */, true);
1680 }
1681
1682 /*
1683 * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
1684 *
1685 * *prp_index will be updated to account for the number of PRP entries used.
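 *
 * Illustrative example (assuming a 4 KiB page size): a dword-aligned buffer that
 * starts 1 KiB into a page and is 11 KiB long uses prp1 for the first 3 KiB and two
 * full-page entries in tr->u.prp[]; because more than two entries are needed in
 * total, prp2 is set to tr->prp_sgl_bus_addr so it points at that list.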
1686 */
1687 static inline int
1688 nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len,
1689 uint32_t page_size)
1690 {
1691 struct spdk_nvme_cmd *cmd = &tr->req->cmd;
1692 uintptr_t page_mask = page_size - 1;
1693 uint64_t phys_addr;
1694 uint32_t i;
1695
1696 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n",
1697 *prp_index, virt_addr, (uint32_t)len);
1698
1699 if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
1700 SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
1701 return -EINVAL;
1702 }
1703
1704 i = *prp_index;
1705 while (len) {
1706 uint32_t seg_len;
1707
1708 /*
1709 * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
1710 * so prp_index may legitimately reach SPDK_COUNTOF(tr->u.prp).
1711 */
1712 if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
1713 SPDK_ERRLOG("out of PRP entries\n");
1714 return -EINVAL;
1715 }
1716
1717 phys_addr = spdk_vtophys(virt_addr, NULL);
1718 if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
1719 SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
1720 return -EINVAL;
1721 }
1722
1723 if (i == 0) {
1724 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr);
1725 cmd->dptr.prp.prp1 = phys_addr;
1726 seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
1727 } else {
1728 if ((phys_addr & page_mask) != 0) {
1729 SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
1730 return -EINVAL;
1731 }
1732
1733 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
1734 tr->u.prp[i - 1] = phys_addr;
1735 seg_len = page_size;
1736 }
1737
1738 seg_len = spdk_min(seg_len, len);
1739 virt_addr += seg_len;
1740 len -= seg_len;
1741 i++;
1742 }
1743
1744 cmd->psdt = SPDK_NVME_PSDT_PRP;
1745 if (i <= 1) {
1746 cmd->dptr.prp.prp2 = 0;
1747 } else if (i == 2) {
1748 cmd->dptr.prp.prp2 = tr->u.prp[0];
1749 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
1750 } else {
1751 cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
1752 SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
1753 }
1754
1755 *prp_index = i;
1756 return 0;
1757 }
1758
1759 /**
1760 * Build PRP list describing physically contiguous payload buffer.
1761 */
1762 static int
1763 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1764 struct nvme_tracker *tr)
1765 {
1766 uint32_t prp_index = 0;
1767 int rc;
1768
1769 rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset,
1770 req->payload_size, qpair->ctrlr->page_size);
1771 if (rc) {
1772 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1773 return rc;
1774 }
1775
1776 return 0;
1777 }
1778
1779 /**
1780 * Build SGL list describing scattered payload buffer.
1781 */
1782 static int
1783 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1784 struct nvme_tracker *tr)
1785 {
1786 int rc;
1787 void *virt_addr;
1788 uint64_t phys_addr;
1789 uint32_t remaining_transfer_len, remaining_user_sge_len, length;
1790 struct spdk_nvme_sgl_descriptor *sgl;
1791 uint32_t nseg = 0;
1792
1793 /*
1794 * Build scattered payloads.
1795 */
1796 assert(req->payload_size != 0);
1797 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1798 assert(req->payload.reset_sgl_fn != NULL);
1799 assert(req->payload.next_sge_fn != NULL);
1800 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1801
1802 sgl = tr->u.sgl;
1803 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
1804 req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1805
1806 remaining_transfer_len = req->payload_size;
1807
1808 while (remaining_transfer_len > 0) {
1809 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
1810 &virt_addr, &remaining_user_sge_len);
1811 if (rc) {
1812 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1813 return -1;
1814 }
1815
1816 remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
1817 remaining_transfer_len -= remaining_user_sge_len;
1818 while (remaining_user_sge_len > 0) {
1819 if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1820 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1821 return -1;
1822 }
1823
1824 phys_addr = spdk_vtophys(virt_addr, NULL);
1825 if (phys_addr == SPDK_VTOPHYS_ERROR) {
1826 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1827 return -1;
1828 }
1829
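/* spdk_vtophys() contiguity is only assumed within a 2 MiB hugepage, so clip
 * each user SGE at the next 2 MiB boundary; physically adjacent pieces are
 * merged back into the previous descriptor below. */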
1830 length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr));
1831 remaining_user_sge_len -= length;
1832 virt_addr += length;
1833
1834 if (nseg > 0 && phys_addr ==
1835 (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
1836 /* extend previous entry */
1837 (*(sgl - 1)).unkeyed.length += length;
1838 continue;
1839 }
1840
1841 sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1842 sgl->unkeyed.length = length;
1843 sgl->address = phys_addr;
1844 sgl->unkeyed.subtype = 0;
1845
1846 sgl++;
1847 nseg++;
1848 }
1849 }
1850
1851 if (nseg == 1) {
1852 /*
1853 * The whole transfer can be described by a single SGL descriptor.
1854 * Use the special case described by the spec where SGL1's type is Data Block.
1855 * This means the SGL in the tracker is not used at all, so copy the first (and only)
1856 * SGL element into SGL1.
1857 */
1858 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1859 req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1860 req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1861 } else {
1862 /* For now only a single SGL segment is supported, so SGL1 points at the descriptor list in the tracker */
1863 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1864 req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1865 req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1866 }
1867
1868 return 0;
1869 }
1870
1871 /**
1872 * Build PRP list describing scattered payload buffer.
1873 */
1874 static int
1875 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1876 struct nvme_tracker *tr)
1877 {
1878 int rc;
1879 void *virt_addr;
1880 uint32_t remaining_transfer_len, length;
1881 uint32_t prp_index = 0;
1882 uint32_t page_size = qpair->ctrlr->page_size;
1883
1884 /*
1885 * Build scattered payloads.
1886 */
1887 assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
1888 assert(req->payload.reset_sgl_fn != NULL);
1889 req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
1890
1891 remaining_transfer_len = req->payload_size;
1892 while (remaining_transfer_len > 0) {
1893 assert(req->payload.next_sge_fn != NULL);
1894 rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
1895 if (rc) {
1896 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1897 return -1;
1898 }
1899
1900 length = spdk_min(remaining_transfer_len, length);
1901
1902 /*
1903 * Any incompatible SGEs should already have been handled by the request
1904 * splitting routine above this layer, but assert here as an additional check.
1905 *
1906 * All SGEs except the last must end on a page boundary.
1907 */
1908 assert((length == remaining_transfer_len) ||
1909 _is_page_aligned((uintptr_t)virt_addr + length, page_size));
1910
1911 rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
1912 if (rc) {
1913 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1914 return rc;
1915 }
1916
1917 remaining_transfer_len -= length;
1918 }
1919
1920 return 0;
1921 }
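/*
 * The assert above reflects the PRP rules a scattered payload must obey when
 * the controller lacks SGL support: every SGE except the last must end on a
 * controller page boundary, and (since only the first PRP entry may carry a
 * page offset) every SGE except the first must start on one. A minimal sketch
 * of that check over an iovec array (hypothetical helper, not part of the
 * driver):
 */
static bool
example_iov_is_prp_compatible(const struct iovec *iov, int iovcnt, uint32_t page_size)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		uintptr_t start = (uintptr_t)iov[i].iov_base;
		uintptr_t end = start + iov[i].iov_len;

		if (i != 0 && (start & (page_size - 1)) != 0) {
			return false;
		}
		if (i != iovcnt - 1 && (end & (page_size - 1)) != 0) {
			return false;
		}
	}

	return true;
}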
1922
1923 int
1924 nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
1925 {
1926 struct nvme_tracker *tr;
1927 int rc = 0;
1928 void *md_payload;
1929 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1930 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1931
1932 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1933 nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
1934 }
1935
1936 tr = TAILQ_FIRST(&pqpair->free_tr);
1937
1938 if (tr == NULL) {
1939 /*
1940 * Put the request on the qpair's request queue to be
1941 * processed when a tracker frees up via a command
1942 * completion.
1943 */
1944 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1945 goto exit;
1946 }
1947
1948 TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
1949 TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
1950 tr->req = req;
1951 tr->cb_fn = req->cb_fn;
1952 tr->cb_arg = req->cb_arg;
1953 req->cmd.cid = tr->cid;
1954
1955 if (req->payload_size && req->payload.md) {
1956 md_payload = req->payload.md + req->md_offset;
1957 tr->req->cmd.mptr = spdk_vtophys(md_payload, NULL);
1958 if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
1959 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1960 rc = -EINVAL;
1961 goto exit;
1962 }
1963 }
1964
1965 if (req->payload_size == 0) {
1966 /* Null payload - leave PRP fields untouched */
1967 rc = 0;
1968 } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
1969 rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
1970 } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
1971 if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
1972 rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
1973 } else {
1974 rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
1975 }
1976 } else {
1977 assert(0);
1978 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1979 rc = -EINVAL;
1980 }
1981
1982 if (rc < 0) {
1983 goto exit;
1984 }
1985
1986 nvme_pcie_qpair_submit_tracker(qpair, tr);
1987
1988 exit:
1989 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
1990 nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
1991 }
1992
1993 return rc;
1994 }
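/*
 * Requests reach this function through the public command APIs. A buffer from
 * spdk_dma_zmalloc() submitted with spdk_nvme_ns_cmd_read() takes the
 * NVME_PAYLOAD_TYPE_CONTIG branch above, while an iovec-based readv/writev
 * takes one of the two SGL branches depending on SPDK_NVME_CTRLR_SGL_SUPPORTED.
 * A minimal sketch of the contiguous case (ns and qpair are assumed to have
 * been set up elsewhere; example_read_done is a hypothetical callback):
 */
static void
example_read_done(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	bool *done = cb_arg;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_ERRLOG("example read failed\n");
	}
	*done = true;
}

static int
example_submit_contig_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			   uint64_t lba, uint32_t lba_count, bool *done)
{
	void *buf;

	/* DMA-able, page-aligned buffer; a real application would spdk_dma_free() it later. */
	buf = spdk_dma_zmalloc((uint64_t)lba_count * spdk_nvme_ns_get_sector_size(ns),
			       0x1000, NULL);
	if (buf == NULL) {
		return -ENOMEM;
	}

	/* Queues the read; completion arrives later via example_read_done(). */
	return spdk_nvme_ns_cmd_read(ns, qpair, buf, lba, lba_count,
				     example_read_done, done, 0);
}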
1995
1996 static void
1997 nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
1998 {
1999 uint64_t t02;
2000 struct nvme_tracker *tr, *tmp;
2001 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
2002 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
2003 struct spdk_nvme_ctrlr_process *active_proc;
2004
2005 /* Don't check timeouts during controller initialization. */
2006 if (ctrlr->state != NVME_CTRLR_STATE_READY) {
2007 return;
2008 }
2009
2010 if (nvme_qpair_is_admin_queue(qpair)) {
2011 active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
2012 } else {
2013 active_proc = qpair->active_proc;
2014 }
2015
2016 /* Only check timeouts if the current process has a timeout callback. */
2017 if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
2018 return;
2019 }
2020
2021 t02 = spdk_get_ticks();
2022 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
2023 assert(tr->req != NULL);
2024
2025 if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
2026 /*
2027 * The requests are in order, so as soon as one has not timed out,
2028 * stop iterating.
2029 */
2030 break;
2031 }
2032 }
2033 }
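/*
 * The scan above only runs when the current process has registered a timeout
 * callback on the controller, which also sets ctrlr->timeout_enabled. A sketch
 * of such a registration, assuming the spdk_nvme_ctrlr_register_timeout_callback()
 * signature of this SPDK version (one timeout in microseconds plus callback and
 * context); example_timeout_cb/example_enable_timeouts are hypothetical names:
 */
static void
example_timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
		   struct spdk_nvme_qpair *qpair, uint16_t cid)
{
	/*
	 * qpair is NULL for admin commands. A real handler might abort the
	 * timed-out command or reset the controller; this sketch only logs.
	 */
	SPDK_ERRLOG("command %u timed out on the %s queue\n", cid,
		    qpair == NULL ? "admin" : "I/O");
	(void)cb_arg;
	(void)ctrlr;
}

static void
example_enable_timeouts(struct spdk_nvme_ctrlr *ctrlr)
{
	/* Ask for a callback once a command has been outstanding for over 5 seconds. */
	spdk_nvme_ctrlr_register_timeout_callback(ctrlr, 5ULL * 1000 * 1000,
						  example_timeout_cb, NULL);
}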
2034
2035 int32_t
2036 nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
2037 {
2038 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
2039 struct nvme_tracker *tr;
2040 struct spdk_nvme_cpl *cpl, *next_cpl;
2041 uint32_t num_completions = 0;
2042 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
2043 uint16_t next_cq_head;
2044 uint8_t next_phase;
2045 bool next_is_valid = false;
2046
2047 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2048 nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
2049 }
2050
2051 if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
2052 /*
2053 * max_completions == 0 means unlimited, but complete at most one batch
2054 * of max_completions_cap I/O at a time so that the completion queue
2055 * doorbell does not wrap around.
2056 */
2057 max_completions = pqpair->max_completions_cap;
2058 }
2059
2060 while (1) {
2061 cpl = &pqpair->cpl[pqpair->cq_head];
2062
2063 if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
2064 break;
2065 }
2066
2067 if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
2068 next_cq_head = pqpair->cq_head + 1;
2069 next_phase = pqpair->flags.phase;
2070 } else {
2071 next_cq_head = 0;
2072 next_phase = !pqpair->flags.phase;
2073 }
2074 next_cpl = &pqpair->cpl[next_cq_head];
2075 next_is_valid = (next_cpl->status.p == next_phase);
2076 if (next_is_valid) {
2077 __builtin_prefetch(&pqpair->tr[next_cpl->cid]);
2078 }
2079
2080 #ifdef __PPC64__
2081 /*
2082 * This memory barrier prevents reordering of:
2083 * - loads from tr after stores to tr
2084 * - the load of the cpl cid after the load of the cpl phase
2085 */
2086 spdk_mb();
2087 #elif defined(__aarch64__)
2088 __asm volatile("dmb oshld" ::: "memory");
2089 #endif
2090
2091 if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
2092 pqpair->cq_head = 0;
2093 pqpair->flags.phase = !pqpair->flags.phase;
2094 }
2095
2096 tr = &pqpair->tr[cpl->cid];
2097 /* Prefetch the req's STAILQ_ENTRY since we'll need to access it
2098 * as part of putting the req back on the qpair's free list.
2099 */
2100 __builtin_prefetch(&tr->req->stailq);
2101 pqpair->sq_head = cpl->sqhd;
2102
2103 if (tr->req) {
2104 nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
2105 } else {
2106 SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
2107 nvme_qpair_print_completion(qpair, cpl);
2108 assert(0);
2109 }
2110
2111 if (++num_completions == max_completions) {
2112 break;
2113 }
2114 }
2115
2116 if (num_completions > 0) {
2117 nvme_pcie_qpair_ring_cq_doorbell(qpair);
2118 }
2119
2120 if (pqpair->flags.delay_pcie_doorbell) {
2121 if (pqpair->last_sq_tail != pqpair->sq_tail) {
2122 nvme_pcie_qpair_ring_sq_doorbell(qpair);
2123 pqpair->last_sq_tail = pqpair->sq_tail;
2124 }
2125 }
2126
2127 if (spdk_unlikely(ctrlr->timeout_enabled)) {
2128 /*
2129 * The user registered a timeout callback, so scan outstanding requests for timeouts.
2130 */
2131 nvme_pcie_qpair_check_timeout(qpair);
2132 }
2133
2134 /* Before returning, complete any pending admin request. */
2135 if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
2136 nvme_pcie_qpair_complete_pending_admin_request(qpair);
2137
2138 nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
2139 }
2140
2141 return num_completions;
2142 }
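/*
 * This function is the PCIe backend of spdk_nvme_qpair_process_completions();
 * applications drive completions by polling it from the thread that owns the
 * qpair. Paired with the submission sketch above, a minimal busy-poll wait
 * (hypothetical helper, no timeout handling) could look like:
 */
static int
example_wait_for_io(struct spdk_nvme_qpair *qpair, bool *done)
{
	int32_t rc;

	while (!*done) {
		/* max_completions == 0: no limit, capped internally at max_completions_cap. */
		rc = spdk_nvme_qpair_process_completions(qpair, 0);
		if (rc < 0) {
			/* Some SPDK versions report a failed qpair with a negative errno. */
			return rc;
		}
	}

	return 0;
}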